@@ -4,12 +4,15 @@ from werkzeug.exceptions import Unauthorized
if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
    from gevent import monkey
    monkey.patch_all()
    # if os.environ.get("VECTOR_STORE") == 'milvus':
    import grpc.experimental.gevent
    grpc.experimental.gevent.init_gevent()

import langchain
langchain.verbose = True
import json
@@ -44,6 +47,7 @@ from services.account_service import AccountService
# DO NOT REMOVE BELOW
from events import event_handlers
from models import account, dataset, model, source, task, tool, tools, web
# DO NOT REMOVE ABOVE
@@ -51,7 +55,7 @@ warnings.simplefilter("ignore", ResourceWarning)
# fix windows platform
if os.name == "nt":
    os.system('tzutil /s "UTC"')
else:
    os.environ['TZ'] = 'UTC'
    time.tzset()
@@ -60,6 +64,7 @@ else:
class DifyApp(Flask):
    pass


# -------------
# Configuration
# -------------
@@ -67,6 +72,7 @@ class DifyApp(Flask):
config_type = os.getenv('EDITION', default='SELF_HOSTED')  # ce edition first

# ----------------------------
# Application Factory Function
# ----------------------------
@@ -192,7 +198,6 @@ def register_blueprints(app):
app = create_app()
celery = app.extensions["celery"]

if app.config['TESTING']:
    print("App is running in TESTING mode")
@@ -2,6 +2,7 @@
from typing import Optional

import pandas as pd
import xlrd

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -27,10 +28,37 @@ class ExcelExtractor(BaseExtractor):
        self._autodetect_encoding = autodetect_encoding

    def extract(self) -> list[Document]:
        """Parse the Excel file."""
        if self._file_path.endswith('.xls'):
            return self._extract4xls()
        elif self._file_path.endswith('.xlsx'):
            return self._extract4xlsx()

    def _extract4xls(self) -> list[Document]:
        wb = xlrd.open_workbook(filename=self._file_path)
        documents = []
        # loop over all sheets
        for sheet in wb.sheets():
            # the first non-blank row of each sheet is treated as the header
            row_header = None
            for row_index, row in enumerate(sheet.get_rows(), start=1):
                if self.is_blank_row(row):
                    continue
                if row_header is None:
                    row_header = row
                    continue
                item_arr = []
                for index, cell in enumerate(row):
                    txt_value = str(cell.value)
                    item_arr.append(f'{row_header[index].value}:{txt_value}')
                item_str = "\n".join(item_arr)
                document = Document(page_content=item_str, metadata={'source': self._file_path})
                documents.append(document)
        return documents
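
For illustration only (not part of the patch), here is a minimal sketch of the page_content that _extract4xls builds for one data row: each cell is paired with the corresponding header cell and the pairs are joined with newlines. The header and rows below are hypothetical stand-ins for xlrd cells.

# Hypothetical values standing in for xlrd header/row cells.
header = ['name', 'price']
rows = [['apple', 3], ['pear', 5]]
for row in rows:
    item_arr = [f'{h}:{v}' for h, v in zip(header, row)]
    print('\n'.join(item_arr))
# Output for the first row (one Document's page_content):
# name:apple
# price:3
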
    def _extract4xlsx(self) -> list[Document]:
        """Load from file path using Pandas."""
        data = []

        # Read each worksheet of the Excel file using Pandas
        xls = pd.ExcelFile(self._file_path)
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)
@@ -43,5 +71,18 @@ class ExcelExtractor(BaseExtractor):
                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
                document = Document(page_content=item, metadata={'source': self._file_path})
                data.append(document)
        return data
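
For comparison (again, not part of the patch), a small sketch of the page_content the pandas branch produces: each row is flattened into key:value pairs joined by semicolons, with NaN cells skipped. The DataFrame below is a hypothetical stand-in for one worksheet.

import pandas as pd

# Hypothetical stand-in for one worksheet loaded in _extract4xlsx.
df = pd.DataFrame({'name': ['apple', 'pear'], 'price': [3, None]})
for _, row in df.iterrows():
    print(';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)))
# name:apple;price:3.0
# name:pear
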
    @staticmethod
    def is_blank_row(row):
        """
        Determine whether the given row is blank.

        :param row: row object.
        :return: True if the row is blank, False otherwise.
        """
        # Iterate over the cells and return False as soon as a non-empty cell is found
        for cell in row:
            if cell.value is not None and cell.value != '':
                return False
        return True
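
A minimal usage sketch of the extended extractor; the import path is assumed from the neighbouring imports (core.rag.extractor) and the file path is hypothetical. extract() dispatches on the extension: xlrd for .xls, pandas for .xlsx.

from core.rag.extractor.excel_extractor import ExcelExtractor

extractor = ExcelExtractor('/tmp/products.xls')  # hypothetical path; an '.xlsx' file works the same way
for doc in extractor.extract():
    print(doc.metadata['source'])
    print(doc.page_content)
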
@@ -84,7 +84,7 @@ class ExtractProcessor:
                etl_type = current_app.config['ETL_TYPE']
                unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
                if etl_type == 'Unstructured':
                    if file_extension == '.xlsx':
                    if file_extension == '.xlsx' or file_extension == '.xls':
                        extractor = ExcelExtractor(file_path)
                    elif file_extension == '.pdf':
                        extractor = PdfExtractor(file_path)
| @@ -114,7 +114,7 @@ class ExtractProcessor: | |||
| extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \ | |||
| else TextExtractor(file_path, autodetect_encoding=True) | |||
| else: | |||
| if file_extension == '.xlsx': | |||
| if file_extension == '.xlsx' or file_extension == '.xls': | |||
| extractor = ExcelExtractor(file_path) | |||
| elif file_extension == '.pdf': | |||
| extractor = PdfExtractor(file_path) | |||
@@ -82,3 +82,4 @@ qrcode~=7.4.2
azure-storage-blob==12.9.0
azure-identity==1.15.0
lxml==5.1.0
xlrd~=2.0.1
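
The new dependency is pinned as xlrd~=2.0.1. xlrd 2.x only reads legacy .xls (BIFF) workbooks, which is why .xlsx files stay on the pandas path above. A quick post-install sanity check (assuming the pin resolves to a 2.x release):

import importlib.metadata

# Confirm the installed xlrd matches the ~=2.0.1 pin.
print(importlib.metadata.version('xlrd'))  # e.g. 2.0.1
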
@@ -20,9 +20,10 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])

ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv']
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls',
                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
PREVIEW_WORDS_LIMIT = 3000