|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WordExtractor(BaseExtractor): |
|
|
class WordExtractor(BaseExtractor): |
|
|
"""Load pdf files. |
|
|
|
|
|
|
|
|
"""Load docx files. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Args: |
|
|
Args: |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract(self) -> list[Document]: |
|
|
def extract(self) -> list[Document]: |
|
|
"""Load given path as single page.""" |
|
|
"""Load given path as single page.""" |
|
|
import docx2txt |
|
|
|
|
|
|
|
|
|
|
|
return [ |
|
|
|
|
|
Document( |
|
|
|
|
|
page_content=docx2txt.process(self.file_path), |
|
|
|
|
|
metadata={"source": self.file_path}, |
|
|
|
|
|
) |
|
|
|
|
|
] |
|
|
|
|
|
|
|
|
from docx import Document as docx_Document |
|
|
|
|
|
|
|
|
|
|
|
document = docx_Document(self.file_path) |
|
|
|
|
|
doc_texts = [paragraph.text for paragraph in document.paragraphs] |
|
|
|
|
|
content = '\n'.join(doc_texts) |
|
|
|
|
|
|
|
|
|
|
|
return [Document( |
|
|
|
|
|
page_content=content, |
|
|
|
|
|
metadata={"source": self.file_path}, |
|
|
|
|
|
)] |
|
|
|
|
|
|
|
|
@staticmethod |
|
|
@staticmethod |
|
|
def _is_valid_url(url: str) -> bool: |
|
|
def _is_valid_url(url: str) -> bool: |