### What problem does this PR solve?

#474

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

tags/v0.3.1
| @@ -67,7 +67,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections, tbls = [], [] | |||
| if re.search(r"\.docx?$", filename, re.IGNORECASE): | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| doc_parser = DocxParser() | |||
| # TODO: table of contents need to be removed | |||
| @@ -93,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||
| pdf_parser = None | |||
| sections = [] | |||
| if re.search(r"\.docx?$", filename, re.IGNORECASE): | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| for txt in Docx()(filename, binary): | |||
| sections.append(txt) | |||
| @@ -119,7 +119,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| res = [] | |||
| pdf_parser = None | |||
| sections = [] | |||
| if re.search(r"\.docx?$", filename, re.IGNORECASE): | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections, tbls = Docx()(filename, binary) | |||
| res = tokenize_table(tbls, doc, eng) | |||
| @@ -60,7 +60,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |||
| eng = lang.lower() == "english" # is_english(cks) | |||
| if re.search(r"\.docx?$", filename, re.IGNORECASE): | |||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | |||
| callback(0.1, "Start to parse.") | |||
| sections = [txt for txt in laws.Docx()(filename, binary) if txt] | |||
| callback(0.8, "Finish parsing.") | |||