Sfoglia il codice sorgente

refactor auto keywords and auto question (#2990)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
tags/v0.13.0
Kevin Hu 1 anno fa
parent
commit
7f81fc8f9b
Nessun account collegato all'indirizzo email del committer
1 ha cambiato i file con 20 aggiunte e 17 eliminazioni
  1. 20
    17
      rag/svr/task_executor.py

+ 20
- 17
rag/svr/task_executor.py Vedi File

d["_id"] = md5.hexdigest()
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()

if row["parser_config"].get("auto_keywords", 0):
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
row["parser_config"]["auto_keywords"]).split(",")
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))

if row["parser_config"].get("auto_questions", 0):
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
qst = rag_tokenizer.tokenize(qst)
if "content_ltks" in ck:
ck["content_ltks"] += " " + qst
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)

if not d.get("image"):
docs.append(d)
continue
docs.append(d)
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))


if row["parser_config"].get("auto_keywords", 0):
callback(msg="Start to generate keywords for every chunk ...")
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
for d in docs:
d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
row["parser_config"]["auto_keywords"]).split(",")
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))

if row["parser_config"].get("auto_questions", 0):
callback(msg="Start to generate questions for every chunk ...")
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
for d in docs:
qst = question_proposal(chat_mdl, d["content_with_weight"], row["parser_config"]["auto_questions"])
d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
qst = rag_tokenizer.tokenize(qst)
if "content_ltks" in d:
d["content_ltks"] += " " + qst
if "content_sm_ltks" in d:
d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)

return docs





Loading…
Annulla
Salva