| @@ -26,7 +26,7 @@ class UnstructuredEmailExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.email import partition_email | |||
| elements = partition_email(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_email(filename=self._file_path) | |||
| # noinspection PyBroadException | |||
| try: | |||
| @@ -36,7 +36,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.md import partition_md | |||
| elements = partition_md(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_md(filename=self._file_path) | |||
| from unstructured.chunking.title import chunk_by_title | |||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) | |||
| documents = [] | |||
| @@ -26,7 +26,7 @@ class UnstructuredMsgExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.msg import partition_msg | |||
| elements = partition_msg(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_msg(filename=self._file_path) | |||
| from unstructured.chunking.title import chunk_by_title | |||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) | |||
| documents = [] | |||
| @@ -24,9 +24,9 @@ class UnstructuredPPTExtractor(BaseExtractor): | |||
| self._api_url = api_url | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.ppt import partition_ppt | |||
| from unstructured.partition.api import partition_via_api | |||
| elements = partition_ppt(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_via_api(filename=self._file_path, api_url=self._api_url) | |||
| text_by_page = {} | |||
| for element in elements: | |||
| page = element.metadata.page_number | |||
| @@ -26,7 +26,7 @@ class UnstructuredPPTXExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.pptx import partition_pptx | |||
| elements = partition_pptx(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_pptx(filename=self._file_path) | |||
| text_by_page = {} | |||
| for element in elements: | |||
| page = element.metadata.page_number | |||
| @@ -26,7 +26,7 @@ class UnstructuredTextExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.text import partition_text | |||
| elements = partition_text(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_text(filename=self._file_path) | |||
| from unstructured.chunking.title import chunk_by_title | |||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) | |||
| documents = [] | |||
| @@ -26,7 +26,7 @@ class UnstructuredXmlExtractor(BaseExtractor): | |||
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.xml import partition_xml | |||
| elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url) | |||
| elements = partition_xml(filename=self._file_path, xml_keep_tags=True) | |||
| from unstructured.chunking.title import chunk_by_title | |||
| chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) | |||
| documents = [] | |||