1年前 · 12095f8cd6
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
        def parse_paragraph(paragraph):
            paragraph_content = []
            for run in paragraph.runs:
                if run.element.tag.endswith('r'):
                if hasattr(run.element, 'tag') and isinstance(element.tag, str) and run.element.tag.endswith('r'):
                    drawing_elements = run.element.findall(
                        './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
                    for drawing in drawing_elements:
@@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
        paragraphs = doc.paragraphs.copy()
        tables = doc.tables.copy()
        for element in doc.element.body:
            if element.tag.endswith('p'):  # paragraph
                para = paragraphs.pop(0)
                parsed_paragraph = parse_paragraph(para)
                if parsed_paragraph:
                    content.append(parsed_paragraph)
            elif element.tag.endswith('tbl'):  # table
                table = tables.pop(0)
                content.append(self._table_to_markdown(table,image_map))
            if hasattr(element, 'tag'):
                if isinstance(element.tag, str) and element.tag.endswith('p'):  # paragraph
                    para = paragraphs.pop(0)
                    parsed_paragraph = parse_paragraph(para)
                    if parsed_paragraph:
                        content.append(parsed_paragraph)
                elif isinstance(element.tag, str) and element.tag.endswith('tbl'):  # table
                    table = tables.pop(0)
                    content.append(self._table_to_markdown(table,image_map))
        return '\n'.join(content)