Explorar el Código

Use `python-docx` to extract docx files (#2654)

tags/0.5.9
Bowen Liang hace 1 año
padre
commit
b163545771
No account linked to committer's email address
Se han modificado 2 ficheros con 12 adiciones y 10 borrados
  1. 11
    9
      api/core/rag/extractor/word_extractor.py
  2. 1
    1
      api/requirements.txt

+ 11
- 9
api/core/rag/extractor/word_extractor.py Ver fichero

@@ -10,7 +10,7 @@ from core.rag.models.document import Document


class WordExtractor(BaseExtractor):
-"""Load pdf files.
+"""Load docx files.


Args:
@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):

def extract(self) -> list[Document]:
"""Load given path as single page."""
-import docx2txt
-
-return [
-Document(
-page_content=docx2txt.process(self.file_path),
-metadata={"source": self.file_path},
-)
-]
+from docx import Document as docx_Document
+
+document = docx_Document(self.file_path)
+doc_texts = [paragraph.text for paragraph in document.paragraphs]
+content = '\n'.join(doc_texts)
+
+return [Document(
+page_content=content,
+metadata={"source": self.file_path},
+)]

@staticmethod
def _is_valid_url(url: str) -> bool:

+ 1
- 1
api/requirements.txt Ver fichero

@@ -32,7 +32,7 @@ celery==5.2.7
redis~=4.5.4
openpyxl==3.1.2
chardet~=5.1.0
-docx2txt==0.8
+python-docx~=1.1.0
pypdfium2==4.16.0
resend~=0.7.0
pyjwt~=2.8.0

Cargando…
Cancelar
Guardar