| @@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource): | |||
| notion_workspace_id=workspace_id, | |||
| notion_obj_id=page_id, | |||
| notion_page_type=page_type, | |||
| notion_access_token=data_source_binding.access_token | |||
| notion_access_token=data_source_binding.access_token, | |||
| tenant_id=current_user.current_tenant_id | |||
| ) | |||
| text_docs = extractor.extract() | |||
| @@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource): | |||
| notion_info={ | |||
| "notion_workspace_id": workspace_id, | |||
| "notion_obj_id": page['page_id'], | |||
| "notion_page_type": page['type'] | |||
| "notion_page_type": page['type'], | |||
| "tenant_id": current_user.current_tenant_id | |||
| }, | |||
| document_model=args['doc_form'] | |||
| ) | |||
| @@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource): | |||
| notion_info={ | |||
| "notion_workspace_id": workspace_id, | |||
| "notion_obj_id": page['page_id'], | |||
| "notion_page_type": page['type'] | |||
| "notion_page_type": page['type'], | |||
| "tenant_id": current_user.current_tenant_id | |||
| }, | |||
| document_model=args['doc_form'] | |||
| ) | |||
| @@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource): | |||
| notion_info={ | |||
| "notion_workspace_id": data_source_info['notion_workspace_id'], | |||
| "notion_obj_id": data_source_info['notion_page_id'], | |||
| "notion_page_type": data_source_info['type'] | |||
| "notion_page_type": data_source_info['type'], | |||
| "tenant_id": current_user.current_tenant_id | |||
| }, | |||
| document_model=document.doc_form | |||
| ) | |||
| @@ -366,7 +366,8 @@ class IndexingRunner: | |||
| "notion_workspace_id": data_source_info['notion_workspace_id'], | |||
| "notion_obj_id": data_source_info['notion_page_id'], | |||
| "notion_page_type": data_source_info['type'], | |||
| "document": dataset_document | |||
| "document": dataset_document, | |||
| "tenant_id": dataset_document.tenant_id | |||
| }, | |||
| document_model=dataset_document.doc_form | |||
| ) | |||
| @@ -39,7 +39,8 @@ class RetrievalService: | |||
| 'flask_app': current_app._get_current_object(), | |||
| 'dataset_id': dataset_id, | |||
| 'query': query, | |||
| 'top_k': top_k | |||
| 'top_k': top_k, | |||
| 'all_documents': all_documents | |||
| }) | |||
| threads.append(keyword_thread) | |||
| keyword_thread.start() | |||
| @@ -12,6 +12,7 @@ class NotionInfo(BaseModel): | |||
| notion_obj_id: str | |||
| notion_page_type: str | |||
| document: Document = None | |||
| tenant_id: str | |||
| class Config: | |||
| arbitrary_types_allowed = True | |||
| @@ -132,7 +132,8 @@ class ExtractProcessor: | |||
| notion_workspace_id=extract_setting.notion_info.notion_workspace_id, | |||
| notion_obj_id=extract_setting.notion_info.notion_obj_id, | |||
| notion_page_type=extract_setting.notion_info.notion_page_type, | |||
| document_model=extract_setting.notion_info.document | |||
| document_model=extract_setting.notion_info.document, | |||
| tenant_id=extract_setting.notion_info.tenant_id, | |||
| ) | |||
| return extractor.extract() | |||
| else: | |||
| @@ -1,13 +1,14 @@ | |||
| """Abstract interface for document loader implementations.""" | |||
| from typing import Optional | |||
| from bs4 import BeautifulSoup | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.extractor.helpers import detect_file_encodings | |||
| from core.rag.models.document import Document | |||
| class HtmlExtractor(BaseExtractor): | |||
| """Load html files. | |||
| """ | |||
| Load html files. | |||
| Args: | |||
| @@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor): | |||
| """ | |||
| def __init__( | |||
| self, | |||
| file_path: str, | |||
| encoding: Optional[str] = None, | |||
| autodetect_encoding: bool = False, | |||
| source_column: Optional[str] = None, | |||
| csv_args: Optional[dict] = None, | |||
| self, | |||
| file_path: str | |||
| ): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._encoding = encoding | |||
| self._autodetect_encoding = autodetect_encoding | |||
| self.source_column = source_column | |||
| self.csv_args = csv_args or {} | |||
| def extract(self) -> list[Document]: | |||
| """Load data into document objects.""" | |||
| try: | |||
| with open(self._file_path, newline="", encoding=self._encoding) as csvfile: | |||
| docs = self._read_from_file(csvfile) | |||
| except UnicodeDecodeError as e: | |||
| if self._autodetect_encoding: | |||
| detected_encodings = detect_file_encodings(self._file_path) | |||
| for encoding in detected_encodings: | |||
| try: | |||
| with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile: | |||
| docs = self._read_from_file(csvfile) | |||
| break | |||
| except UnicodeDecodeError: | |||
| continue | |||
| else: | |||
| raise RuntimeError(f"Error loading {self._file_path}") from e | |||
| return docs | |||
| return [Document(page_content=self._load_as_text())] | |||
| def _read_from_file(self, csvfile) -> list[Document]: | |||
| docs = [] | |||
| csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore | |||
| for i, row in enumerate(csv_reader): | |||
| content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items()) | |||
| try: | |||
| source = ( | |||
| row[self.source_column] | |||
| if self.source_column is not None | |||
| else '' | |||
| ) | |||
| except KeyError: | |||
| raise ValueError( | |||
| f"Source column '{self.source_column}' not found in CSV file." | |||
| ) | |||
| metadata = {"source": source, "row": i} | |||
| doc = Document(page_content=content, metadata=metadata) | |||
| docs.append(doc) | |||
| def _load_as_text(self) -> str: | |||
| with open(self._file_path, "rb") as fp: | |||
| soup = BeautifulSoup(fp, 'html.parser') | |||
| text = soup.get_text() | |||
| text = text.strip() if text else '' | |||
| return docs | |||
| return text | |||
| @@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor): | |||
| notion_workspace_id: str, | |||
| notion_obj_id: str, | |||
| notion_page_type: str, | |||
| tenant_id: str, | |||
| document_model: Optional[DocumentModel] = None, | |||
| notion_access_token: Optional[str] = None | |||
| notion_access_token: Optional[str] = None, | |||
| ): | |||
| self._notion_access_token = None | |||
| self._document_model = document_model | |||
| @@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str): | |||
| notion_workspace_id=workspace_id, | |||
| notion_obj_id=page_id, | |||
| notion_page_type=page_type, | |||
| notion_access_token=data_source_binding.access_token | |||
| notion_access_token=data_source_binding.access_token, | |||
| tenant_id=document.tenant_id | |||
| ) | |||
| last_edited_time = loader.get_notion_last_edited_time() | |||