| notion_workspace_id=workspace_id, | notion_workspace_id=workspace_id, | ||||
| notion_obj_id=page_id, | notion_obj_id=page_id, | ||||
| notion_page_type=page_type, | notion_page_type=page_type, | ||||
| notion_access_token=data_source_binding.access_token | |||||
| notion_access_token=data_source_binding.access_token, | |||||
| tenant_id=current_user.current_tenant_id | |||||
| ) | ) | ||||
| text_docs = extractor.extract() | text_docs = extractor.extract() | ||||
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": workspace_id, | "notion_workspace_id": workspace_id, | ||||
| "notion_obj_id": page['page_id'], | "notion_obj_id": page['page_id'], | ||||
| "notion_page_type": page['type'] | |||||
| "notion_page_type": page['type'], | |||||
| "tenant_id": current_user.current_tenant_id | |||||
| }, | }, | ||||
| document_model=args['doc_form'] | document_model=args['doc_form'] | ||||
| ) | ) |
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": workspace_id, | "notion_workspace_id": workspace_id, | ||||
| "notion_obj_id": page['page_id'], | "notion_obj_id": page['page_id'], | ||||
| "notion_page_type": page['type'] | |||||
| "notion_page_type": page['type'], | |||||
| "tenant_id": current_user.current_tenant_id | |||||
| }, | }, | ||||
| document_model=args['doc_form'] | document_model=args['doc_form'] | ||||
| ) | ) |
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": data_source_info['notion_workspace_id'], | "notion_workspace_id": data_source_info['notion_workspace_id'], | ||||
| "notion_obj_id": data_source_info['notion_page_id'], | "notion_obj_id": data_source_info['notion_page_id'], | ||||
| "notion_page_type": data_source_info['type'] | |||||
| "notion_page_type": data_source_info['type'], | |||||
| "tenant_id": current_user.current_tenant_id | |||||
| }, | }, | ||||
| document_model=document.doc_form | document_model=document.doc_form | ||||
| ) | ) |
| "notion_workspace_id": data_source_info['notion_workspace_id'], | "notion_workspace_id": data_source_info['notion_workspace_id'], | ||||
| "notion_obj_id": data_source_info['notion_page_id'], | "notion_obj_id": data_source_info['notion_page_id'], | ||||
| "notion_page_type": data_source_info['type'], | "notion_page_type": data_source_info['type'], | ||||
| "document": dataset_document | |||||
| "document": dataset_document, | |||||
| "tenant_id": dataset_document.tenant_id | |||||
| }, | }, | ||||
| document_model=dataset_document.doc_form | document_model=dataset_document.doc_form | ||||
| ) | ) |
| 'flask_app': current_app._get_current_object(), | 'flask_app': current_app._get_current_object(), | ||||
| 'dataset_id': dataset_id, | 'dataset_id': dataset_id, | ||||
| 'query': query, | 'query': query, | ||||
| 'top_k': top_k | |||||
| 'top_k': top_k, | |||||
| 'all_documents': all_documents | |||||
| }) | }) | ||||
| threads.append(keyword_thread) | threads.append(keyword_thread) | ||||
| keyword_thread.start() | keyword_thread.start() |
| notion_obj_id: str | notion_obj_id: str | ||||
| notion_page_type: str | notion_page_type: str | ||||
| document: Document = None | document: Document = None | ||||
| tenant_id: str | |||||
| class Config: | class Config: | ||||
| arbitrary_types_allowed = True | arbitrary_types_allowed = True |
| notion_workspace_id=extract_setting.notion_info.notion_workspace_id, | notion_workspace_id=extract_setting.notion_info.notion_workspace_id, | ||||
| notion_obj_id=extract_setting.notion_info.notion_obj_id, | notion_obj_id=extract_setting.notion_info.notion_obj_id, | ||||
| notion_page_type=extract_setting.notion_info.notion_page_type, | notion_page_type=extract_setting.notion_info.notion_page_type, | ||||
| document_model=extract_setting.notion_info.document | |||||
| document_model=extract_setting.notion_info.document, | |||||
| tenant_id=extract_setting.notion_info.tenant_id, | |||||
| ) | ) | ||||
| return extractor.extract() | return extractor.extract() | ||||
| else: | else: |
| """Abstract interface for document loader implementations.""" | """Abstract interface for document loader implementations.""" | ||||
| from typing import Optional | |||||
| from bs4 import BeautifulSoup | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.extractor.helpers import detect_file_encodings | |||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| class HtmlExtractor(BaseExtractor): | class HtmlExtractor(BaseExtractor): | ||||
| """Load html files. | |||||
| """ | |||||
| Load html files. | |||||
| Args: | Args: | ||||
| """ | """ | ||||
| def __init__( | def __init__( | ||||
| self, | |||||
| file_path: str, | |||||
| encoding: Optional[str] = None, | |||||
| autodetect_encoding: bool = False, | |||||
| source_column: Optional[str] = None, | |||||
| csv_args: Optional[dict] = None, | |||||
| self, | |||||
| file_path: str | |||||
| ): | ): | ||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._encoding = encoding | |||||
| self._autodetect_encoding = autodetect_encoding | |||||
| self.source_column = source_column | |||||
| self.csv_args = csv_args or {} | |||||
| def extract(self) -> list[Document]: | def extract(self) -> list[Document]: | ||||
| """Load data into document objects.""" | |||||
| try: | |||||
| with open(self._file_path, newline="", encoding=self._encoding) as csvfile: | |||||
| docs = self._read_from_file(csvfile) | |||||
| except UnicodeDecodeError as e: | |||||
| if self._autodetect_encoding: | |||||
| detected_encodings = detect_file_encodings(self._file_path) | |||||
| for encoding in detected_encodings: | |||||
| try: | |||||
| with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile: | |||||
| docs = self._read_from_file(csvfile) | |||||
| break | |||||
| except UnicodeDecodeError: | |||||
| continue | |||||
| else: | |||||
| raise RuntimeError(f"Error loading {self._file_path}") from e | |||||
| return docs | |||||
| return [Document(page_content=self._load_as_text())] | |||||
| def _read_from_file(self, csvfile) -> list[Document]: | |||||
| docs = [] | |||||
| csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore | |||||
| for i, row in enumerate(csv_reader): | |||||
| content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items()) | |||||
| try: | |||||
| source = ( | |||||
| row[self.source_column] | |||||
| if self.source_column is not None | |||||
| else '' | |||||
| ) | |||||
| except KeyError: | |||||
| raise ValueError( | |||||
| f"Source column '{self.source_column}' not found in CSV file." | |||||
| ) | |||||
| metadata = {"source": source, "row": i} | |||||
| doc = Document(page_content=content, metadata=metadata) | |||||
| docs.append(doc) | |||||
| def _load_as_text(self) -> str: | |||||
| with open(self._file_path, "rb") as fp: | |||||
| soup = BeautifulSoup(fp, 'html.parser') | |||||
| text = soup.get_text() | |||||
| text = text.strip() if text else '' | |||||
| return docs | |||||
| return text |
| notion_workspace_id: str, | notion_workspace_id: str, | ||||
| notion_obj_id: str, | notion_obj_id: str, | ||||
| notion_page_type: str, | notion_page_type: str, | ||||
| tenant_id: str, | |||||
| document_model: Optional[DocumentModel] = None, | document_model: Optional[DocumentModel] = None, | ||||
| notion_access_token: Optional[str] = None | |||||
| notion_access_token: Optional[str] = None, | |||||
| ): | ): | ||||
| self._notion_access_token = None | self._notion_access_token = None | ||||
| self._document_model = document_model | self._document_model = document_model |
| notion_workspace_id=workspace_id, | notion_workspace_id=workspace_id, | ||||
| notion_obj_id=page_id, | notion_obj_id=page_id, | ||||
| notion_page_type=page_type, | notion_page_type=page_type, | ||||
| notion_access_token=data_source_binding.access_token | |||||
| notion_access_token=data_source_binding.access_token, | |||||
| tenant_id=document.tenant_id | |||||
| ) | ) | ||||
| last_edited_time = loader.get_notion_last_edited_time() | last_edited_time = loader.get_notion_last_edited_time() |