### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

tags/v0.9.0
| @classmethod | @classmethod | ||||
| @DB.connection_context() | @DB.connection_context() | ||||
| def get_unfinished_docs(cls): | def get_unfinished_docs(cls): | ||||
| fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg] | |||||
| fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run] | |||||
| docs = cls.model.select(*fields) \ | docs = cls.model.select(*fields) \ | ||||
| .where( | .where( | ||||
| cls.model.status == StatusEnum.VALID.value, | cls.model.status == StatusEnum.VALID.value, | ||||
| prg = 0 | prg = 0 | ||||
| finished = True | finished = True | ||||
| bad = 0 | bad = 0 | ||||
| status = TaskStatus.RUNNING.value | |||||
| status = d["run"]#TaskStatus.RUNNING.value | |||||
| for t in tsks: | for t in tsks: | ||||
| if 0 <= t.progress < 1: | if 0 <= t.progress < 1: | ||||
| finished = False | finished = False |
| "max_tokens": 32768, | "max_tokens": 32768, | ||||
| "model_type": "chat" | "model_type": "chat" | ||||
| }, | }, | ||||
| { | |||||
| "llm_name": "qwen-max-1201", | |||||
| "tags": "LLM,CHAT,6K", | |||||
| "max_tokens": 5899, | |||||
| "model_type": "chat" | |||||
| }, | |||||
| { | { | ||||
| "llm_name": "text-embedding-v2", | "llm_name": "text-embedding-v2", | ||||
| "tags": "TEXT EMBEDDING,2K", | "tags": "TEXT EMBEDDING,2K", |
| 3. If the subject matter is really complex, split them into sub-sections. | 3. If the subject matter is really complex, split them into sub-sections. | ||||
| - Output requirement: | - Output requirement: | ||||
| - Always try to maximize the number of sub-sections. | |||||
| - In language of | - In language of | ||||
| - MUST IN FORMAT OF MARKDOWN | - MUST IN FORMAT OF MARKDOWN | ||||
| eng = lang.lower() == "english" | eng = lang.lower() == "english" | ||||
| parser_config["layout_recognize"] = False | parser_config["layout_recognize"] = False | ||||
| sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config) | |||||
| sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, | |||||
| parser_config=parser_config, callback=callback) | |||||
| chunks = build_knowlege_graph_chunks(tenant_id, sections, callback, | chunks = build_knowlege_graph_chunks(tenant_id, sections, callback, | ||||
| parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]) | parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]) | ||||
| ) | ) | ||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | ||||
| chunks.extend(tokenize_chunks(sections, doc, eng)) | chunks.extend(tokenize_chunks(sections, doc, eng)) | ||||
| return chunks | |||||
| return chunks |