### What problem does this PR solve?

#7608

Adds a new `doc_type_kwd` keyword field: chunks derived from images are tagged with `doc_type_kwd = "image"` by the parsers, the field is registered in the store mapping, and it is returned with search results.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
| "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "n_hop_with_weight": {"type": "varchar", "default": ""}, | "n_hop_with_weight": {"type": "varchar", "default": ""}, | ||||
| "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} | |||||
| "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | |||||
| "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} | |||||
| } | } |
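With the new `doc_type_kwd` column registered above, image-derived chunks can carry an explicit type marker, while ordinary text chunks fall back to the column default (an empty string). The sketch below only illustrates the resulting chunk shape; the document names and field values are invented and not taken from this PR.

```python
# Illustrative only: an image-derived chunk vs. a plain text chunk once
# doc_type_kwd exists. All values here are made-up examples.
image_chunk = {
    "docnm_kwd": "diagram.png",                   # hypothetical source file name
    "content_with_weight": "text recovered from the picture by OCR",
    "image": "<thumbnail image object>",          # placeholder for the stored image
    "doc_type_kwd": "image",                      # the new marker set by the parsers
}

text_chunk = {
    "docnm_kwd": "report.docx",                   # hypothetical source file name
    "content_with_weight": "an ordinary text paragraph",
    # no doc_type_kwd here: the column default "" applies when indexed
}
```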
```diff
     res = tokenize_table(tbls, doc, eng)
     for text, image in ti_list:
         d = copy.deepcopy(doc)
-        d['image'] = image
+        if image:
+            d['image'] = image
+            d["doc_type_kwd"] = "image"
         tokenize(d, text, eng)
         res.append(d)
     return res
```
```diff
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "image": img
+        "image": img,
+        "doc_type_kwd": "image"
     }
     bxs = ocr(np.array(img))
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
```
```diff
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
+            d["doc_type_kwd"] = "image"
             d["page_num_int"] = [pn + 1]
             d["top_int"] = [0]
             d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
```
```diff
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     add_positions(d, poss)
     return d
```
```diff
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     if row_num >= 0:
         d["top_int"] = [row_num]
     return d
```
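The same guarded assignment (attach the image and set the type marker only when an image is actually present) recurs in several hunks above and below. Purely to illustrate that pattern, a hypothetical helper like the one below could express it in one place; no such helper exists in this PR or in the codebase.

```python
def tag_image(d, image):
    """Hypothetical helper mirroring the repeated pattern in this PR:
    attach an image to a chunk and mark the chunk as image-derived."""
    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"
    return d
```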
| d["content_with_weight"] = rows | d["content_with_weight"] = rows | ||||
| if img: | if img: | ||||
| d["image"] = img | d["image"] = img | ||||
| d["doc_type_kwd"] = "image" | |||||
| if poss: | if poss: | ||||
| add_positions(d, poss) | add_positions(d, poss) | ||||
| res.append(d) | res.append(d) | ||||
```diff
             d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
-            d["image"] = img
+            if img:
+                d["image"] = img
+                d["doc_type_kwd"] = "image"
             add_positions(d, poss)
             res.append(d)
     return res
```
| src = req.get("fields", | src = req.get("fields", | ||||
| ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", | ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", | ||||
| "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", | "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", | ||||
| "question_kwd", "question_tks", | |||||
| "question_kwd", "question_tks", "doc_type_kwd", | |||||
| "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) | "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) | ||||
| kwds = set([]) | kwds = set([]) | ||||
| "term_similarity": tsim[i], | "term_similarity": tsim[i], | ||||
| "vector": chunk.get(vector_column, zero_vector), | "vector": chunk.get(vector_column, zero_vector), | ||||
| "positions": position_int, | "positions": position_int, | ||||
| "doc_type_kwd": chunk.get("doc_type_kwd", "") | |||||
| } | } | ||||
| if highlight and sres.highlight: | if highlight and sres.highlight: | ||||
| if id in sres.highlight: | if id in sres.highlight: |
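Since `doc_type_kwd` is now part of the default field list and of every formatted search hit, callers can separate image chunks from text chunks directly from the retrieval output. A minimal sketch, assuming `chunks` is the list of result dicts assembled in the loop above (the filtering itself is not part of this PR):

```python
# Split retrieval results on the new marker; chunks without it are plain text.
image_chunks = [c for c in chunks if c.get("doc_type_kwd") == "image"]
text_chunks = [c for c in chunks if c.get("doc_type_kwd") != "image"]

for c in image_chunks:
    # e.g. render a thumbnail or route the chunk to a vision-capable prompt
    print(c["doc_type_kwd"], c.get("positions"))
```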