import asyncio
import concurrent
import datetime
import json
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from typing import Optional, List, cast

import openai
from billiard.pool import Pool
from flask import current_app, Flask
from flask_login import current_user
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

from core.data_loader.file_extractor import FileExtractor
from core.data_loader.loader.notion import NotionLoader
from core.docstore.dataset_docstore import DatesetDocumentStore
from core.embedding.cached_embedding import CacheEmbedding
from core.generator.llm_generator import LLMGenerator
from core.index.index import IndexBuilder
from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
from core.index.vector_index.vector_index import VectorIndex
from core.llm.error import ProviderTokenNotInitError
from core.llm.llm_builder import LLMBuilder
from core.llm.streamable_open_ai import StreamableOpenAI
            model_name='gpt-3.5-turbo',
            max_tokens=2000
        )
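
        # Format documents in parallel. Handling the list in slices of 10 bounds the
        # number of threads alive at any one time: each batch is started and joined
        # before the next one begins. Every worker appends its output to the shared
        # `split_documents` list, which is safe in CPython because list.append is
        # thread-safe under the GIL.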
        for i in range(0, len(documents), 10):
            threads = []
            sub_documents = documents[i:i + 10]
            for doc in sub_documents:
                document_format_thread = threading.Thread(target=self.format_document, kwargs={
                    'llm': llm, 'document_node': doc, 'split_documents': split_documents,
                    'document_form': document_form})
                threads.append(document_format_thread)
                document_format_thread.start()
            for thread in threads:
                thread.join()
        all_documents.extend(split_documents)

        return all_documents
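
    # Called from the worker threads above with a single document node.
    # `split_documents` is the shared list supplied by the caller; nodes whose
    # page_content is missing or blank produce no output.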
    def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
        format_documents = []
        if document_node.page_content is None or not document_node.page_content.strip():
            return format_documents