| @@ -144,6 +144,7 @@ NOTION_INTERNAL_SECRET=you-internal-secret | |||
| ETL_TYPE=dify | |||
| UNSTRUCTURED_API_URL= | |||
| UNSTRUCTURED_API_KEY= | |||
| SSRF_PROXY_HTTP_URL= | |||
| SSRF_PROXY_HTTPS_URL= | |||
| @@ -365,6 +365,7 @@ class Config: | |||
| self.ETL_TYPE = get_env('ETL_TYPE') | |||
| self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL') | |||
| self.UNSTRUCTURED_API_KEY = get_env('UNSTRUCTURED_API_KEY') | |||
| self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED') | |||
| self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO') | |||
| @@ -96,6 +96,7 @@ class ExtractProcessor: | |||
| file_extension = input_file.suffix.lower() | |||
| etl_type = current_app.config['ETL_TYPE'] | |||
| unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] | |||
| unstructured_api_key = current_app.config['UNSTRUCTURED_API_KEY'] | |||
| if etl_type == 'Unstructured': | |||
| if file_extension == '.xlsx' or file_extension == '.xls': | |||
| extractor = ExcelExtractor(file_path) | |||
| @@ -115,7 +116,7 @@ class ExtractProcessor: | |||
| elif file_extension == '.eml': | |||
| extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url) | |||
| elif file_extension == '.ppt': | |||
| extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url) | |||
| extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key) | |||
| elif file_extension == '.pptx': | |||
| extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url) | |||
| elif file_extension == '.xml': | |||
| @@ -17,16 +17,18 @@ class UnstructuredPPTExtractor(BaseExtractor): | |||
def __init__(
    self,
    file_path: str,
    api_url: str,
    api_key: str
):
    """Initialize the extractor with the source file and API settings.

    Args:
        file_path: Path of the file to extract; stored as ``_file_path``.
        api_url: URL of the Unstructured API; stored as ``_api_url``.
        api_key: Key sent along with requests to that API; stored as
            ``_api_key``.
    """
    # Exactly one ``api_url`` parameter: the pre-change line without the
    # trailing comma was a leftover and made the signature invalid.
    self._file_path = file_path
    self._api_url = api_url
    self._api_key = api_key
| def extract(self) -> list[Document]: | |||
| from unstructured.partition.api import partition_via_api | |||
| elements = partition_via_api(filename=self._file_path, api_url=self._api_url) | |||
| elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) | |||
| text_by_page = {} | |||
| for element in elements: | |||
| page = element.metadata.page_number | |||