| if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true': | if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true': | ||||
| from gevent import monkey | from gevent import monkey | ||||
| monkey.patch_all() | monkey.patch_all() | ||||
| # if os.environ.get("VECTOR_STORE") == 'milvus': | # if os.environ.get("VECTOR_STORE") == 'milvus': | ||||
| import grpc.experimental.gevent | import grpc.experimental.gevent | ||||
| grpc.experimental.gevent.init_gevent() | grpc.experimental.gevent.init_gevent() | ||||
| import langchain | import langchain | ||||
| langchain.verbose = True | langchain.verbose = True | ||||
| import json | import json | ||||
| # DO NOT REMOVE BELOW | # DO NOT REMOVE BELOW | ||||
| from events import event_handlers | from events import event_handlers | ||||
| from models import account, dataset, model, source, task, tool, tools, web | from models import account, dataset, model, source, task, tool, tools, web | ||||
| # DO NOT REMOVE ABOVE | # DO NOT REMOVE ABOVE | ||||
| # fix windows platform | # fix windows platform | ||||
| if os.name == "nt": | if os.name == "nt": | ||||
| os.system('tzutil /s "UTC"') | |||||
| os.system('tzutil /s "UTC"') | |||||
| else: | else: | ||||
| os.environ['TZ'] = 'UTC' | os.environ['TZ'] = 'UTC' | ||||
| time.tzset() | time.tzset() | ||||
class DifyApp(Flask):
    """Flask subclass used as the application type; it adds no behaviour of its own."""
| # ------------- | # ------------- | ||||
| # Configuration | # Configuration | ||||
| # ------------- | # ------------- | ||||
| config_type = os.getenv('EDITION', default='SELF_HOSTED') # ce edition first | config_type = os.getenv('EDITION', default='SELF_HOSTED') # ce edition first | ||||
| # ---------------------------- | # ---------------------------- | ||||
| # Application Factory Function | # Application Factory Function | ||||
| # ---------------------------- | # ---------------------------- | ||||
| app = create_app() | app = create_app() | ||||
| celery = app.extensions["celery"] | celery = app.extensions["celery"] | ||||
| if app.config['TESTING']: | if app.config['TESTING']: | ||||
| print("App is running in TESTING mode") | print("App is running in TESTING mode") | ||||
| from typing import Optional | from typing import Optional | ||||
| import pandas as pd | import pandas as pd | ||||
| import xlrd | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| self._autodetect_encoding = autodetect_encoding | self._autodetect_encoding = autodetect_encoding | ||||
def extract(self) -> list[Document]:
    """Parse the Excel file at ``self._file_path`` into documents.

    The parser is chosen from the file suffix, compared case-insensitively
    so that ``REPORT.XLS`` is handled the same way as ``report.xls``:

    * ``.xls``  -> ``self._extract4xls()``
    * ``.xlsx`` -> ``self._extract4xlsx()``

    :return: the list of documents produced by the selected parser.
    :raises ValueError: if the path ends with neither ``.xls`` nor ``.xlsx``
        (previously this case silently returned ``None``).
    """
    lowered_path = self._file_path.lower()
    if lowered_path.endswith('.xls'):
        return self._extract4xls()
    if lowered_path.endswith('.xlsx'):
        return self._extract4xlsx()
    raise ValueError(f'Unsupported Excel file extension: {self._file_path}')
def _extract4xls(self) -> list[Document]:
    """Parse a legacy ``.xls`` workbook with xlrd into one document per data row.

    For every sheet, the first non-blank row is taken as the header row.
    Each following non-blank row becomes a ``Document`` whose content is the
    ``header:value`` pairs of that row joined by newlines, and whose metadata
    records the source file path. Blank rows are skipped.

    :return: documents for all data rows of all sheets, in sheet/row order.
    """
    wb = xlrd.open_workbook(filename=self._file_path)
    documents = []
    # loop over all sheets
    for sheet in wb.sheets():
        # The header must be tracked per sheet. Resetting it inside the row
        # loop made every row look like a header, so no document was ever
        # produced.
        row_header = None
        for row in sheet.get_rows():
            if self.is_blank_row(row):
                continue
            if row_header is None:
                row_header = row
                continue
            item_arr = [
                f'{header_cell.value}:{str(cell.value)}'
                for header_cell, cell in zip(row_header, row)
            ]
            item_str = "\n".join(item_arr)
            document = Document(page_content=item_str, metadata={'source': self._file_path})
            documents.append(document)
    return documents
| def _extract4xlsx(self) -> list[Document]: | |||||
| """Load from file path using Pandas.""" | """Load from file path using Pandas.""" | ||||
| data = [] | data = [] | ||||
| # 使用 Pandas 读取 Excel 文件的每个工作表 | |||||
| # Read each worksheet of an Excel file using Pandas | |||||
| xls = pd.ExcelFile(self._file_path) | xls = pd.ExcelFile(self._file_path) | ||||
| for sheet_name in xls.sheet_names: | for sheet_name in xls.sheet_names: | ||||
| df = pd.read_excel(xls, sheet_name=sheet_name) | df = pd.read_excel(xls, sheet_name=sheet_name) | ||||
| item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)) | item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)) | ||||
| document = Document(page_content=item, metadata={'source': self._file_path}) | document = Document(page_content=item, metadata={'source': self._file_path}) | ||||
| data.append(document) | data.append(document) | ||||
| return data | return data | ||||
| @staticmethod | |||||
| def is_blank_row(row): | |||||
| """ | |||||
| Determine whether the specified line is a blank line. | |||||
| :param row: row object。 | |||||
| :return: Returns True if the row is blank, False otherwise. | |||||
| """ | |||||
| # Iterates through the cells and returns False if a non-empty cell is found | |||||
| for cell in row: | |||||
| if cell.value is not None and cell.value != '': | |||||
| return False | |||||
| return True |
| etl_type = current_app.config['ETL_TYPE'] | etl_type = current_app.config['ETL_TYPE'] | ||||
| unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] | unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] | ||||
| if etl_type == 'Unstructured': | if etl_type == 'Unstructured': | ||||
| if file_extension == '.xlsx': | |||||
| if file_extension == '.xlsx' or file_extension == '.xls': | |||||
| extractor = ExcelExtractor(file_path) | extractor = ExcelExtractor(file_path) | ||||
| elif file_extension == '.pdf': | elif file_extension == '.pdf': | ||||
| extractor = PdfExtractor(file_path) | extractor = PdfExtractor(file_path) | ||||
| extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \ | extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \ | ||||
| else TextExtractor(file_path, autodetect_encoding=True) | else TextExtractor(file_path, autodetect_encoding=True) | ||||
| else: | else: | ||||
| if file_extension == '.xlsx': | |||||
| if file_extension == '.xlsx' or file_extension == '.xls': | |||||
| extractor = ExcelExtractor(file_path) | extractor = ExcelExtractor(file_path) | ||||
| elif file_extension == '.pdf': | elif file_extension == '.pdf': | ||||
| extractor = PdfExtractor(file_path) | extractor = PdfExtractor(file_path) |
| azure-storage-blob==12.9.0 | azure-storage-blob==12.9.0 | ||||
| azure-identity==1.15.0 | azure-identity==1.15.0 | ||||
| lxml==5.1.0 | lxml==5.1.0 | ||||
| xlrd~=2.0.1 |
| IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg'] | IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg'] | ||||
| IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS]) | IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS]) | ||||
| ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] | |||||
| UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', | |||||
| ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv'] | |||||
| UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', | |||||
| 'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub'] | 'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub'] | ||||
| PREVIEW_WORDS_LIMIT = 3000 | PREVIEW_WORDS_LIMIT = 3000 | ||||