### What problem does this PR solve?

#7608

Adds a new `doc_type_kwd` keyword field: chunks derived from images are tagged with `doc_type_kwd = "image"` by the parsers, the field is registered in the store mapping, and it is returned with search results.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
| "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "n_hop_with_weight": {"type": "varchar", "default": ""}, | "n_hop_with_weight": {"type": "varchar", "default": ""}, | ||||
| "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} | |||||
| "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | |||||
| "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} | |||||
| } | } |
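With the new `doc_type_kwd` column registered above, image-derived chunks can carry an explicit type marker, while ordinary text chunks fall back to the column default (an empty string). The sketch below only illustrates the resulting chunk shape; the document names and field values are invented and not taken from this PR.

```python
# Illustrative only: an image-derived chunk vs. a plain text chunk once
# doc_type_kwd exists. All values here are made-up examples.
image_chunk = {
    "docnm_kwd": "diagram.png",                   # hypothetical source file name
    "content_with_weight": "text recovered from the picture by OCR",
    "image": "<thumbnail image object>",          # placeholder for the stored image
    "doc_type_kwd": "image",                      # the new marker set by the parsers
}

text_chunk = {
    "docnm_kwd": "report.docx",                   # hypothetical source file name
    "content_with_weight": "an ordinary text paragraph",
    # no doc_type_kwd here: the column default "" applies when indexed
}
```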
```diff
     res = tokenize_table(tbls, doc, eng)
     for text, image in ti_list:
         d = copy.deepcopy(doc)
-        d['image'] = image
+        if image:
+            d['image'] = image
+            d["doc_type_kwd"] = "image"
         tokenize(d, text, eng)
         res.append(d)
     return res
```
```diff
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "image": img
+        "image": img,
+        "doc_type_kwd": "image"
     }
     bxs = ocr(np.array(img))
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
```
```diff
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
+            d["doc_type_kwd"] = "image"
             d["page_num_int"] = [pn + 1]
             d["top_int"] = [0]
             d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
```
```diff
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     add_positions(d, poss)
     return d
```
```diff
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     if row_num >= 0:
         d["top_int"] = [row_num]
     return d
```
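The same guarded assignment (attach the image and set the type marker only when an image is actually present) recurs in several hunks above and below. Purely to illustrate that pattern, a hypothetical helper like the one below could express it in one place; no such helper exists in this PR or in the codebase.

```python
def tag_image(d, image):
    """Hypothetical helper mirroring the repeated pattern in this PR:
    attach an image to a chunk and mark the chunk as image-derived."""
    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"
    return d
```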
| d["content_with_weight"] = rows | d["content_with_weight"] = rows | ||||
| if img: | if img: | ||||
| d["image"] = img | d["image"] = img | ||||
| d["doc_type_kwd"] = "image" | |||||
| if poss: | if poss: | ||||
| add_positions(d, poss) | add_positions(d, poss) | ||||
| res.append(d) | res.append(d) | ||||
```diff
             d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
-            d["image"] = img
+            if img:
+                d["image"] = img
+                d["doc_type_kwd"] = "image"
             add_positions(d, poss)
             res.append(d)
     return res
```
| src = req.get("fields", | src = req.get("fields", | ||||
| ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", | ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", | ||||
| "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", | "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", | ||||
| "question_kwd", "question_tks", | |||||
| "question_kwd", "question_tks", "doc_type_kwd", | |||||
| "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) | "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) | ||||
| kwds = set([]) | kwds = set([]) | ||||
| "term_similarity": tsim[i], | "term_similarity": tsim[i], | ||||
| "vector": chunk.get(vector_column, zero_vector), | "vector": chunk.get(vector_column, zero_vector), | ||||
| "positions": position_int, | "positions": position_int, | ||||
| "doc_type_kwd": chunk.get("doc_type_kwd", "") | |||||
| } | } | ||||
| if highlight and sres.highlight: | if highlight and sres.highlight: | ||||
| if id in sres.highlight: | if id in sres.highlight: |
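Since `doc_type_kwd` is now part of the default field list and of every formatted search hit, callers can separate image chunks from text chunks directly from the retrieval output. A minimal sketch, assuming `chunks` is the list of result dicts assembled in the loop above (the filtering itself is not part of this PR):

```python
# Split retrieval results on the new marker; chunks without it are plain text.
image_chunks = [c for c in chunks if c.get("doc_type_kwd") == "image"]
text_chunks = [c for c in chunks if c.get("doc_type_kwd") != "image"]

for c in image_chunks:
    # e.g. render a thumbnail or route the chunk to a vision-capable prompt
    print(c["doc_type_kwd"], c.get("positions"))
```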