### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

tags/v0.9.0
| @@ -142,7 +142,7 @@ class DocumentService(CommonService): | |||
| @classmethod | |||
| @DB.connection_context() | |||
| def get_unfinished_docs(cls): | |||
| fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg] | |||
| fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run] | |||
| docs = cls.model.select(*fields) \ | |||
| .where( | |||
| cls.model.status == StatusEnum.VALID.value, | |||
| @@ -311,7 +311,7 @@ class DocumentService(CommonService): | |||
| prg = 0 | |||
| finished = True | |||
| bad = 0 | |||
| status = TaskStatus.RUNNING.value | |||
| status = d["run"]#TaskStatus.RUNNING.value | |||
| for t in tsks: | |||
| if 0 <= t.progress < 1: | |||
| finished = False | |||
| @@ -92,12 +92,6 @@ | |||
| "max_tokens": 32768, | |||
| "model_type": "chat" | |||
| }, | |||
| { | |||
| "llm_name": "qwen-max-1201", | |||
| "tags": "LLM,CHAT,6K", | |||
| "max_tokens": 5899, | |||
| "model_type": "chat" | |||
| }, | |||
| { | |||
| "llm_name": "text-embedding-v2", | |||
| "tags": "TEXT EMBEDDING,2K", | |||
| @@ -22,7 +22,6 @@ MIND_MAP_EXTRACTION_PROMPT = """ | |||
| 3. If the subject matter is really complex, split them into sub-sections. | |||
| - Output requirement: | |||
| - Always try to maximize the number of sub-sections. | |||
| - In language of | |||
| - MUST IN FORMAT OF MARKDOWN | |||
| @@ -13,7 +13,8 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, | |||
| eng = lang.lower() == "english" | |||
| parser_config["layout_recognize"] = False | |||
| sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config) | |||
| sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, | |||
| parser_config=parser_config, callback=callback) | |||
| chunks = build_knowlege_graph_chunks(tenant_id, sections, callback, | |||
| parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]) | |||
| ) | |||
| @@ -27,4 +28,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, | |||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||
| chunks.extend(tokenize_chunks(sections, doc, eng)) | |||
| return chunks | |||
| return chunks | |||