瀏覽代碼

refine mindmap prompt (#1808)

### What problem does this PR solve?



### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.9.0
Kevin Hu 1 年之前
父節點
當前提交
a5c03ccd4c
沒有連結到貢獻者的電子郵件帳戶。
共有 4 個檔案被更改,包括 5 行新增、11 行刪除
  1. 2
    2
      api/db/services/document_service.py
  2. 0
    6
      conf/llm_factories.json
  3. 0
    1
      graphrag/mind_map_prompt.py
  4. 3
    2
      rag/app/knowledge_graph.py

+ 2
- 2
api/db/services/document_service.py 查看文件

@classmethod @classmethod
@DB.connection_context() @DB.connection_context()
def get_unfinished_docs(cls): def get_unfinished_docs(cls):
fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg]
fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, cls.model.run]
docs = cls.model.select(*fields) \ docs = cls.model.select(*fields) \
.where( .where(
cls.model.status == StatusEnum.VALID.value, cls.model.status == StatusEnum.VALID.value,
prg = 0 prg = 0
finished = True finished = True
bad = 0 bad = 0
status = TaskStatus.RUNNING.value
status = d["run"]#TaskStatus.RUNNING.value
for t in tsks: for t in tsks:
if 0 <= t.progress < 1: if 0 <= t.progress < 1:
finished = False finished = False

+ 0
- 6
conf/llm_factories.json 查看文件

"max_tokens": 32768, "max_tokens": 32768,
"model_type": "chat" "model_type": "chat"
}, },
{
"llm_name": "qwen-max-1201",
"tags": "LLM,CHAT,6K",
"max_tokens": 5899,
"model_type": "chat"
},
{ {
"llm_name": "text-embedding-v2", "llm_name": "text-embedding-v2",
"tags": "TEXT EMBEDDING,2K", "tags": "TEXT EMBEDDING,2K",

+ 0
- 1
graphrag/mind_map_prompt.py 查看文件

3. If the subject matter is really complex, split them into sub-sections. 3. If the subject matter is really complex, split them into sub-sections.


- Output requirement: - Output requirement:
- Always try to maximize the number of sub-sections.
- In language of - In language of
- MUST IN FORMAT OF MARKDOWN - MUST IN FORMAT OF MARKDOWN

+ 3
- 2
rag/app/knowledge_graph.py 查看文件

eng = lang.lower() == "english" eng = lang.lower() == "english"


parser_config["layout_recognize"] = False parser_config["layout_recognize"] = False
sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config)
sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
parser_config=parser_config, callback=callback)
chunks = build_knowlege_graph_chunks(tenant_id, sections, callback, chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]) parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
) )
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
chunks.extend(tokenize_chunks(sections, doc, eng)) chunks.extend(tokenize_chunks(sections, doc, eng))


return chunks
return chunks

Loading…
取消
儲存