import asyncio
import concurrent
import datetime
import json
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from typing import Optional, List, cast

import openai
from billiard.pool import Pool
from flask import current_app, Flask
from flask_login import current_user
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

from core.data_loader.file_extractor import FileExtractor
from core.data_loader.loader.notion import NotionLoader
from core.docstore.dataset_docstore import DatesetDocumentStore
from core.embedding.cached_embedding import CacheEmbedding
from core.generator.llm_generator import LLMGenerator
from core.index.index import IndexBuilder
from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
from core.index.vector_index.vector_index import VectorIndex
from core.llm.error import ProviderTokenNotInitError
from core.llm.llm_builder import LLMBuilder
from core.llm.streamable_open_ai import StreamableOpenAI
            model_name='gpt-3.5-turbo',
            max_tokens=2000
        )
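
        # Format documents in parallel. Handling the list in slices of 10 bounds the
        # number of threads alive at any one time: each batch is started and joined
        # before the next one begins. Every worker appends its output to the shared
        # `split_documents` list, which is safe in CPython because list.append is
        # thread-safe under the GIL.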
        for i in range(0, len(documents), 10):
            threads = []
            sub_documents = documents[i:i + 10]
            for doc in sub_documents:
                document_format_thread = threading.Thread(target=self.format_document, kwargs={
                    'llm': llm, 'document_node': doc, 'split_documents': split_documents,
                    'document_form': document_form})
                threads.append(document_format_thread)
                document_format_thread.start()
            for thread in threads:
                thread.join()
        all_documents.extend(split_documents)

        return all_documents
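
    # Called from the worker threads above with a single document node.
    # `split_documents` is the shared list supplied by the caller; nodes whose
    # page_content is missing or blank produce no output.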
    def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
        format_documents = []
        if document_node.page_content is None or not document_node.page_content.strip():
            return format_documents