浏览代码

extract docx filter comment element (#7092)

tags/0.7.0
Jyong 1年前
父节点
当前提交
12095f8cd6
没有帐户链接到提交者的电子邮件
共有 1 个文件被更改，包括 10 次插入和 9 次删除
  1. 10
    9
      api/core/rag/extractor/word_extractor.py

+ 10
- 9
api/core/rag/extractor/word_extractor.py 查看文件

@@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
def parse_paragraph(paragraph):
paragraph_content = []
for run in paragraph.runs:
if run.element.tag.endswith('r'):
if hasattr(run.element, 'tag') and isinstance(element.tag, str) and run.element.tag.endswith('r'):
drawing_elements = run.element.findall(
'.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
for drawing in drawing_elements:
@@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
paragraphs = doc.paragraphs.copy()
tables = doc.tables.copy()
for element in doc.element.body:
if element.tag.endswith('p'): # paragraph
para = paragraphs.pop(0)
parsed_paragraph = parse_paragraph(para)
if parsed_paragraph:
content.append(parsed_paragraph)
elif element.tag.endswith('tbl'): # table
table = tables.pop(0)
content.append(self._table_to_markdown(table,image_map))
if hasattr(element, 'tag'):
if isinstance(element.tag, str) and element.tag.endswith('p'): # paragraph
para = paragraphs.pop(0)
parsed_paragraph = parse_paragraph(para)
if parsed_paragraph:
content.append(parsed_paragraph)
elif isinstance(element.tag, str) and element.tag.endswith('tbl'): # table
table = tables.pop(0)
content.append(self._table_to_markdown(table,image_map))
return '\n'.join(content)


正在加载...
取消
保存