ソースを参照

Feat: add image preview to retrieval test. (#7610)

### What problem does this PR solve?

#7608

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.19.0
Kevin Hu 5ヶ月前
コミット
321a280031
コミッターのメールアドレスに関連付けられたアカウントが存在しません
7個のファイルの変更21行の追加7行の削除
  1. 3
    1
      conf/infinity_mapping.json
  2. 3
    1
      rag/app/manual.py
  3. 2
    1
      rag/app/picture.py
  4. 1
    0
      rag/app/presentation.py
  5. 6
    2
      rag/app/qa.py
  6. 4
    1
      rag/nlp/__init__.py
  7. 2
    1
      rag/nlp/search.py

+ 3
- 1
conf/infinity_mapping.json ファイルの表示

"entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"n_hop_with_weight": {"type": "varchar", "default": ""}, "n_hop_with_weight": {"type": "varchar", "default": ""},
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},

"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
} }

+ 3
- 1
rag/app/manual.py ファイルの表示

      res = tokenize_table(tbls, doc, eng)
      for text, image in ti_list:
          d = copy.deepcopy(doc)
-         d['image'] = image
+         if image:
+             d['image'] = image
+             d["doc_type_kwd"] = "image"
          tokenize(d, text, eng)
          res.append(d)
      return res

+ 2
- 1
rag/app/picture.py ファイルの表示

      doc = {
          "docnm_kwd": filename,
          "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-         "image": img
+         "image": img,
+         "doc_type_kwd": "image"
      }
      bxs = ocr(np.array(img))
      txt = "\n".join([t[0] for _, t in bxs if t[0]])

+ 1
- 0
rag/app/presentation.py ファイルの表示

          d = copy.deepcopy(doc)
          pn += from_page
          d["image"] = img
+         d["doc_type_kwd"] = "image"
          d["page_num_int"] = [pn + 1]
          d["top_int"] = [0]
          d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]

+ 6
- 2
rag/app/qa.py ファイルの表示

                  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
      d["content_ltks"] = rag_tokenizer.tokenize(q)
      d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-     d["image"] = image
+     if image:
+         d["image"] = image
+         d["doc_type_kwd"] = "image"
      add_positions(d, poss)
      return d


                  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
      d["content_ltks"] = rag_tokenizer.tokenize(q)
      d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-     d["image"] = image
+     if image:
+         d["image"] = image
+         d["doc_type_kwd"] = "image"
      if row_num >= 0:
          d["top_int"] = [row_num]
      return d

+ 4
- 1
rag/nlp/__init__.py ファイルの表示

d["content_with_weight"] = rows d["content_with_weight"] = rows
if img: if img:
d["image"] = img d["image"] = img
d["doc_type_kwd"] = "image"
if poss: if poss:
add_positions(d, poss) add_positions(d, poss)
res.append(d) res.append(d)
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
r = de.join(rows[i:i + batch_size]) r = de.join(rows[i:i + batch_size])
tokenize(d, r, eng) tokenize(d, r, eng)
d["image"] = img
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
add_positions(d, poss) add_positions(d, poss)
res.append(d) res.append(d)
return res return res

+ 2
- 1
rag/nlp/search.py ファイルの表示

src = req.get("fields", src = req.get("fields",
["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
"question_kwd", "question_tks",
"question_kwd", "question_tks", "doc_type_kwd",
"available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
kwds = set([]) kwds = set([])


"term_similarity": tsim[i], "term_similarity": tsim[i],
"vector": chunk.get(vector_column, zero_vector), "vector": chunk.get(vector_column, zero_vector),
"positions": position_int, "positions": position_int,
"doc_type_kwd": chunk.get("doc_type_kwd", "")
} }
if highlight and sres.highlight: if highlight and sres.highlight:
if id in sres.highlight: if id in sres.highlight:

読み込み中…
キャンセル
保存