| @@ -1,4 +1,3 @@ | |||
| import asyncio | |||
| import concurrent | |||
| import datetime | |||
| import json | |||
| @@ -8,25 +7,17 @@ import threading | |||
| import time | |||
| import uuid | |||
| from concurrent.futures import ThreadPoolExecutor | |||
| from multiprocessing import Process | |||
| from typing import Optional, List, cast | |||
| import openai | |||
| from billiard.pool import Pool | |||
| from flask import current_app, Flask | |||
| from flask_login import current_user | |||
| from langchain.embeddings import OpenAIEmbeddings | |||
| from langchain.schema import Document | |||
| from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter | |||
| from core.data_loader.file_extractor import FileExtractor | |||
| from core.data_loader.loader.notion import NotionLoader | |||
| from core.docstore.dataset_docstore import DatesetDocumentStore | |||
| from core.embedding.cached_embedding import CacheEmbedding | |||
| from core.generator.llm_generator import LLMGenerator | |||
| from core.index.index import IndexBuilder | |||
| from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig | |||
| from core.index.vector_index.vector_index import VectorIndex | |||
| from core.llm.error import ProviderTokenNotInitError | |||
| from core.llm.llm_builder import LLMBuilder | |||
| from core.llm.streamable_open_ai import StreamableOpenAI | |||
| @@ -516,20 +507,23 @@ class IndexingRunner: | |||
| model_name='gpt-3.5-turbo', | |||
| max_tokens=2000 | |||
| ) | |||
| threads = [] | |||
| for doc in documents: | |||
| document_format_thread = threading.Thread(target=self.format_document, kwargs={ | |||
| 'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form}) | |||
| threads.append(document_format_thread) | |||
| document_format_thread.start() | |||
| for thread in threads: | |||
| thread.join() | |||
| for i in range(0, len(documents), 10): | |||
| threads = [] | |||
| sub_documents = documents[i:i + 10] | |||
| for doc in sub_documents: | |||
| document_format_thread = threading.Thread(target=self.format_document, kwargs={ | |||
| 'llm': llm, 'document_node': doc, 'split_documents': split_documents, | |||
| 'document_form': document_form}) | |||
| threads.append(document_format_thread) | |||
| document_format_thread.start() | |||
| for thread in threads: | |||
| thread.join() | |||
| all_documents.extend(split_documents) | |||
| return all_documents | |||
| def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str): | |||
| print(document_node.page_content) | |||
| def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str): | |||
| format_documents = [] | |||
| if document_node.page_content is None or not document_node.page_content.strip(): | |||
| return format_documents | |||