### What problem does this PR solve?

#3646

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
@@ -65,7 +65,3 @@ class Crawler(ComponentBase, ABC):
         elif self._param.extract_type == 'content':
             result.extracted_content
         return result.markdown
```
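Note that `result.extracted_content` above is a bare attribute access: the value is read and immediately discarded. A hedged sketch of a return-based dispatch, with `CrawlResult` as a hypothetical stand-in for the crawler library's result object (not the project's actual class):

```python
# Illustrative only. CrawlResult is a hypothetical stand-in for the
# object the crawler library returns; field names mirror the diff.
class CrawlResult:
    cleaned_html = "<p>hello</p>"
    markdown = "hello"
    extracted_content = "hello"

def pick_output(result: CrawlResult, extract_type: str) -> str:
    if extract_type == 'html':
        return result.cleaned_html
    elif extract_type == 'content':
        # Returning is essential: `result.extracted_content` on its own
        # line evaluates the attribute and throws the value away.
        return result.extracted_content
    return result.markdown

print(pick_output(CrawlResult(), 'content'))  # hello
```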
```diff
@@ -64,27 +64,27 @@ def build_knowledge_graph_chunks(tenant_id: str, chunks: list[str], callback, en
     BATCH_SIZE=4
     texts, graphs = [], []
     cnt = 0
-    threads = []
     max_workers = int(os.environ.get('GRAPH_EXTRACTOR_MAX_WORKERS', 50))
-    exe = ThreadPoolExecutor(max_workers=max_workers)
-    for i in range(len(chunks)):
-        tkn_cnt = num_tokens_from_string(chunks[i])
-        if cnt+tkn_cnt >= left_token_count and texts:
-            for b in range(0, len(texts), BATCH_SIZE):
-                threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))
-            texts = []
-            cnt = 0
-        texts.append(chunks[i])
-        cnt += tkn_cnt
-    if texts:
-        for b in range(0, len(texts), BATCH_SIZE):
-            threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))
-    callback(0.5, "Extracting entities.")
-    graphs = []
-    for i, _ in enumerate(threads):
-        graphs.append(_.result().output)
-        callback(0.5 + 0.1*i/len(threads), f"Entities extraction progress ... {i+1}/{len(threads)}")
+    with ThreadPoolExecutor(max_workers=max_workers) as exe:
+        threads = []
+        for i in range(len(chunks)):
+            tkn_cnt = num_tokens_from_string(chunks[i])
+            if cnt+tkn_cnt >= left_token_count and texts:
+                for b in range(0, len(texts), BATCH_SIZE):
+                    threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))
+                texts = []
+                cnt = 0
+            texts.append(chunks[i])
+            cnt += tkn_cnt
+        if texts:
+            for b in range(0, len(texts), BATCH_SIZE):
+                threads.append(exe.submit(ext, ["\n".join(texts[b:b+BATCH_SIZE])], {"entity_types": entity_types}, callback))
+        callback(0.5, "Extracting entities.")
+        graphs = []
+        for i, _ in enumerate(threads):
+            graphs.append(_.result().output)
+            callback(0.5 + 0.1*i/len(threads), f"Entities extraction progress ... {i+1}/{len(threads)}")

     graph = reduce(graph_merge, graphs) if graphs else nx.Graph()
     er = EntityResolution(llm_bdl)
```
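The substance of this hunk is the move from a bare `ThreadPoolExecutor` to a `with` block: leaving the block calls `shutdown(wait=True)`, so submitted tasks finish and the worker threads are joined, whereas the replaced `exe = ThreadPoolExecutor(...)` form never shuts the pool down and leaks idle workers on every call. A minimal, self-contained sketch of the pattern (names are illustrative, not the project's):

```python
from concurrent.futures import ThreadPoolExecutor

def square_all(values):
    # Exiting the with-block implies exe.shutdown(wait=True): all
    # futures complete and the pool's threads are reclaimed.
    with ThreadPoolExecutor(max_workers=4) as exe:
        futures = [exe.submit(lambda v: v * v, v) for v in values]
        return [f.result() for f in futures]

print(square_all([1, 2, 3]))  # [1, 4, 9]
```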
```diff
@@ -88,26 +88,26 @@ class MindMapExtractor:
             prompt_variables = {}
         try:
-            max_workers = int(os.environ.get('MINDMAP_EXTRACTOR_MAX_WORKERS', 12))
-            exe = ThreadPoolExecutor(max_workers=max_workers)
-            threads = []
-            token_count = max(self._llm.max_length * 0.8, self._llm.max_length - 512)
-            texts = []
-            res = []
-            cnt = 0
-            for i in range(len(sections)):
-                section_cnt = num_tokens_from_string(sections[i])
-                if cnt + section_cnt >= token_count and texts:
-                    threads.append(exe.submit(self._process_document, "".join(texts), prompt_variables))
-                    texts = []
-                    cnt = 0
-                texts.append(sections[i])
-                cnt += section_cnt
-            if texts:
-                threads.append(exe.submit(self._process_document, "".join(texts), prompt_variables))
-            for i, _ in enumerate(threads):
-                res.append(_.result())
+            max_workers = int(os.environ.get('MINDMAP_EXTRACTOR_MAX_WORKERS', 12))
+            with ThreadPoolExecutor(max_workers=max_workers) as exe:
+                threads = []
+                token_count = max(self._llm.max_length * 0.8, self._llm.max_length - 512)
+                texts = []
+                res = []
+                cnt = 0
+                for i in range(len(sections)):
+                    section_cnt = num_tokens_from_string(sections[i])
+                    if cnt + section_cnt >= token_count and texts:
+                        threads.append(exe.submit(self._process_document, "".join(texts), prompt_variables))
+                        texts = []
+                        cnt = 0
+                    texts.append(sections[i])
+                    cnt += section_cnt
+                if texts:
+                    threads.append(exe.submit(self._process_document, "".join(texts), prompt_variables))
+                for i, _ in enumerate(threads):
+                    res.append(_.result())

         if not res:
             return MindMapResult(output={"id": "root", "children": []})
```
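The same context-manager fix applies here; around it sits a token-budget batcher that packs sections until the budget is hit, then submits the batch. A runnable sketch of that pattern, with a whitespace splitter standing in for `num_tokens_from_string` (an assumption, not the project's tokenizer):

```python
from concurrent.futures import ThreadPoolExecutor

# Stand-in token counter; the real code counts model tokens.
def count_tokens(text: str) -> int:
    return len(text.split())

def batch_and_submit(sections, token_budget, process):
    with ThreadPoolExecutor(max_workers=4) as exe:
        threads, texts, cnt = [], [], 0
        for section in sections:
            section_cnt = count_tokens(section)
            # Budget exceeded: submit the accumulated batch, start fresh.
            if cnt + section_cnt >= token_budget and texts:
                threads.append(exe.submit(process, "".join(texts)))
                texts, cnt = [], 0
            texts.append(section)
            cnt += section_cnt
        if texts:  # flush the final partial batch
            threads.append(exe.submit(process, "".join(texts)))
        return [t.result() for t in threads]

# Two batches of ~3 tokens each; `len` stands in for the real worker.
print(batch_and_submit(["a b c ", "d e ", "f "], 4, len))  # [6, 6]
```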
```diff
@@ -366,7 +366,7 @@ class OllamaChat(Base):
                 keep_alive=-1
             )
             ans = response["message"]["content"].strip()
-            return ans, response["eval_count"] + response.get("prompt_eval_count", 0)
+            return ans, response.get("eval_count", 0) + response.get("prompt_eval_count", 0)
         except Exception as e:
             return "**ERROR**: " + str(e), 0
```
```diff
@@ -492,6 +492,7 @@ def report_status():
             logging.exception("report_status got exception")
         time.sleep(30)

+
 def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapshot, snapshot_id: int, dump_full: bool):
     msg = ""
     if dump_full:
```
```diff
@@ -508,6 +509,7 @@ def analyze_heap(snapshot1: tracemalloc.Snapshot, snapshot2: tracemalloc.Snapshot, snapshot_id: int, dump_full: bool):
         msg += '\n'.join(stat.traceback.format())
     logging.info(msg)

+
 def main():
     settings.init_settings()
     background_thread = threading.Thread(target=report_status)
```
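For context, `analyze_heap` formats statistics from two `tracemalloc` snapshots (the hunk shows it walking `stat.traceback.format()`). A hedged sketch of the standard-library comparison it presumably builds on:

```python
import tracemalloc

tracemalloc.start()
snapshot1 = tracemalloc.take_snapshot()
data = [bytes(1000) for _ in range(1000)]  # allocate roughly 1 MB
snapshot2 = tracemalloc.take_snapshot()

# compare_to() yields per-line allocation deltas between snapshots;
# each StatisticDiff carries the traceback used for the log message.
for stat in snapshot2.compare_to(snapshot1, 'lineno')[:3]:
    print(stat)
print(len(data))  # keep the allocation alive past the second snapshot
```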