Browse Source

Refactor parser code (#9042)

### What problem does this PR solve?

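Refactor parser code without changing parsing behavior: rename the local variable `patt` to `pattern` in `blockType` (deepdoc/parser/docx_parser.py), remove the leftover `****wxy: ` debug prefix from the log and exception messages in deepdoc/parser/excel_parser.py, and reword the `TypeError` message in `parser_txt` (deepdoc/parser/html_parser.py) from "txt type should be str!" to "txt type should be string!".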

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
tags/v0.20.0
Jin Hai 3 months ago
parent
commit
03daf4618c
No account linked to committer's email address

+ 2
- 2
deepdoc/parser/docx_parser.py View File

def __compose_table_content(self, df): def __compose_table_content(self, df):


def blockType(b): def blockType(b):
patt = [
pattern = [
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"), ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
(r"^(20|19)[0-9]{2}年$", "Dt"), (r"^(20|19)[0-9]{2}年$", "Dt"),
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"), (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"), (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
(r"^.{1}$", "Sg") (r"^.{1}$", "Sg")
] ]
for p, n in patt:
for p, n in pattern:
if re.search(p, b): if re.search(p, b):
return n return n
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1] tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]

+ 4
- 4
deepdoc/parser/excel_parser.py View File

file_like_object.seek(0) file_like_object.seek(0)


if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')): if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
logging.info("Not an Excel file, converting CSV to Excel Workbook")


try: try:
file_like_object.seek(0) file_like_object.seek(0)
return RAGFlowExcelParser._dataframe_to_workbook(df) return RAGFlowExcelParser._dataframe_to_workbook(df)


except Exception as e_csv: except Exception as e_csv:
raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")


try: try:
return load_workbook(file_like_object,data_only= True) return load_workbook(file_like_object,data_only= True)
except Exception as e: except Exception as e:
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
logging.info(f"openpyxl load error: {e}, try pandas instead")
try: try:
file_like_object.seek(0) file_like_object.seek(0)
df = pd.read_excel(file_like_object) df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df) return RAGFlowExcelParser._dataframe_to_workbook(df)
except Exception as e_pandas: except Exception as e_pandas:
raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")


@staticmethod @staticmethod
def _dataframe_to_workbook(df): def _dataframe_to_workbook(df):

+ 1
- 1
deepdoc/parser/html_parser.py View File

@classmethod @classmethod
def parser_txt(cls, txt): def parser_txt(cls, txt):
if not isinstance(txt, str): if not isinstance(txt, str):
raise TypeError("txt type should be str!")
raise TypeError("txt type should be string!")
html_doc = readability.Document(txt) html_doc = readability.Document(txt)
title = html_doc.title() title = html_doc.title()
content = html_text.extract_text(html_doc.summary(html_partial=True)) content = html_text.extract_text(html_doc.summary(html_partial=True))

Loading…
Cancel
Save