### What problem does this PR solve?

Fix #6177.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

(tag: v0.18.0)
| from api.utils import get_uuid | from api.utils import get_uuid | ||||
| from graphrag.query_analyze_prompt import PROMPTS | from graphrag.query_analyze_prompt import PROMPTS | ||||
| from graphrag.utils import get_entity_type2sampels, get_llm_cache, set_llm_cache, get_relation | from graphrag.utils import get_entity_type2sampels, get_llm_cache, set_llm_cache, get_relation | ||||
| from rag.utils import num_tokens_from_string | |||||
| from rag.utils import num_tokens_from_string, get_float | |||||
| from rag.utils.doc_store_conn import OrderByExpr | from rag.utils.doc_store_conn import OrderByExpr | ||||
| from rag.nlp.search import Dealer, index_name | from rag.nlp.search import Dealer, index_name | ||||
| for f in flds: | for f in flds: | ||||
| if f in ent and ent[f] is None: | if f in ent and ent[f] is None: | ||||
| del ent[f] | del ent[f] | ||||
| if float(ent.get("_score", 0)) < sim_thr: | |||||
| if get_float(ent.get("_score", 0)) < sim_thr: | |||||
| continue | continue | ||||
| if isinstance(ent["entity_kwd"], list): | if isinstance(ent["entity_kwd"], list): | ||||
| ent["entity_kwd"] = ent["entity_kwd"][0] | ent["entity_kwd"] = ent["entity_kwd"][0] | ||||
| res[ent["entity_kwd"]] = { | res[ent["entity_kwd"]] = { | ||||
| "sim": float(ent.get("_score", 0)), | |||||
| "pagerank": float(ent.get("rank_flt", 0)), | |||||
| "sim": get_float(ent.get("_score", 0)), | |||||
| "pagerank": get_float(ent.get("rank_flt", 0)), | |||||
| "n_hop_ents": json.loads(ent.get("n_hop_with_weight", "[]")), | "n_hop_ents": json.loads(ent.get("n_hop_with_weight", "[]")), | ||||
| "description": ent.get("content_with_weight", "{}") | "description": ent.get("content_with_weight", "{}") | ||||
| } | } | ||||
| es_res = self.dataStore.getFields(es_res, ["content_with_weight", "_score", "from_entity_kwd", "to_entity_kwd", | es_res = self.dataStore.getFields(es_res, ["content_with_weight", "_score", "from_entity_kwd", "to_entity_kwd", | ||||
| "weight_int"]) | "weight_int"]) | ||||
| for _, ent in es_res.items(): | for _, ent in es_res.items(): | ||||
| if float(ent["_score"]) < sim_thr: | |||||
| if get_float(ent["_score"]) < sim_thr: | |||||
| continue | continue | ||||
| f, t = sorted([ent["from_entity_kwd"], ent["to_entity_kwd"]]) | f, t = sorted([ent["from_entity_kwd"], ent["to_entity_kwd"]]) | ||||
| if isinstance(f, list): | if isinstance(f, list): | ||||
| if isinstance(t, list): | if isinstance(t, list): | ||||
| t = t[0] | t = t[0] | ||||
| res[(f, t)] = { | res[(f, t)] = { | ||||
| "sim": float(ent["_score"]), | |||||
| "pagerank": float(ent.get("weight_int", 0)), | |||||
| "sim": get_float(ent["_score"]), | |||||
| "pagerank": get_float(ent.get("weight_int", 0)), | |||||
| "description": ent["content_with_weight"] | "description": ent["content_with_weight"] | ||||
| } | } | ||||
| return res | return res |
| from PIL import Image | from PIL import Image | ||||
| from markdown import markdown | from markdown import markdown | ||||
| from rag.utils import get_float | |||||
| class Excel(ExcelParser): | class Excel(ExcelParser): | ||||
| def __call__(self, fnm, binary=None, callback=None): | def __call__(self, fnm, binary=None, callback=None): | ||||
| section, line_tag = box['text'], self._line_tag(box, zoomin) | section, line_tag = box['text'], self._line_tag(box, zoomin) | ||||
| has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list) | has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list) | ||||
| last_box, last_index, last_bull = box, index, has_bull | last_box, last_index, last_bull = box, index, has_bull | ||||
| line_pn = float(line_tag.lstrip('@@').split('\t')[0]) | |||||
| line_top = float(line_tag.rstrip('##').split('\t')[3]) | |||||
| line_pn = get_float(line_tag.lstrip('@@').split('\t')[0]) | |||||
| line_top = get_float(line_tag.rstrip('##').split('\t')[3]) | |||||
| tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index) | ||||
| if not has_bull: # No question bullet | if not has_bull: # No question bullet | ||||
| if not last_q: | if not last_q: |
| from dataclasses import dataclass | from dataclasses import dataclass | ||||
| from rag.settings import TAG_FLD, PAGERANK_FLD | from rag.settings import TAG_FLD, PAGERANK_FLD | ||||
| from rag.utils import rmSpace | |||||
| from rag.utils import rmSpace, get_float | |||||
| from rag.nlp import rag_tokenizer, query | from rag.nlp import rag_tokenizer, query | ||||
| import numpy as np | import numpy as np | ||||
| from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr | from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr | ||||
| if len(shape) > 1: | if len(shape) > 1: | ||||
| raise Exception( | raise Exception( | ||||
| f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).") | f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).") | ||||
| embedding_data = [float(v) for v in qv] | |||||
| embedding_data = [get_float(v) for v in qv] | |||||
| vector_column_name = f"q_{len(embedding_data)}_vec" | vector_column_name = f"q_{len(embedding_data)}_vec" | ||||
| return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity}) | return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity}) | ||||
@staticmethod
def trans2floats(txt):
    """Parse a tab-delimited string of numbers into a list of floats.

    Unparseable fields become ``-inf`` via :func:`get_float` rather
    than raising, so a single bad field cannot abort the caller.
    """
    return list(map(get_float, txt.split("\t")))
| def insert_citations(self, answer, chunks, chunk_v, | def insert_citations(self, answer, chunks, chunk_v, | ||||
| embd_mdl, tkweight=0.1, vtweight=0.9): | embd_mdl, tkweight=0.1, vtweight=0.9): | ||||
| for chunk_id in sres.ids: | for chunk_id in sres.ids: | ||||
| vector = sres.field[chunk_id].get(vector_column, zero_vector) | vector = sres.field[chunk_id].get(vector_column, zero_vector) | ||||
| if isinstance(vector, str): | if isinstance(vector, str): | ||||
| vector = [float(v) for v in vector.split("\t")] | |||||
| vector = [get_float(v) for v in vector.split("\t")] | |||||
| ins_embd.append(vector) | ins_embd.append(vector) | ||||
| if not ins_embd: | if not ins_embd: | ||||
| return [], [], [] | return [], [], [] |
| import tiktoken | import tiktoken | ||||
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| def singleton(cls, *args, **kw): | def singleton(cls, *args, **kw): | ||||
| instances = {} | instances = {} | ||||
def truncate(string: str, max_len: int) -> str:
    """Clip *string* to at most *max_len* tokens.

    Tokenization uses the module-level ``encoder`` (tiktoken), so the
    limit is measured in model tokens, not characters; the surviving
    token prefix is decoded back into text.
    """
    tokens = encoder.encode(string)
    return encoder.decode(tokens[:max_len])
| def get_float(v: str | None): | |||||
| if v is None: | |||||
| return float('-inf') | |||||
| try: | |||||
| return float(v) | |||||
| except Exception: | |||||
| return float('-inf') |
| from elastic_transport import ConnectionTimeout | from elastic_transport import ConnectionTimeout | ||||
| from rag import settings | from rag import settings | ||||
| from rag.settings import TAG_FLD, PAGERANK_FLD | from rag.settings import TAG_FLD, PAGERANK_FLD | ||||
| from rag.utils import singleton | |||||
| from rag.utils import singleton, get_float | |||||
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \ | from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \ | ||||
| FusionExpr | FusionExpr | ||||
| MatchDenseExpr) and isinstance( | MatchDenseExpr) and isinstance( | ||||
| matchExprs[2], FusionExpr) | matchExprs[2], FusionExpr) | ||||
| weights = m.fusion_params["weights"] | weights = m.fusion_params["weights"] | ||||
| vector_similarity_weight = float(weights.split(",")[1]) | |||||
| vector_similarity_weight = get_float(weights.split(",")[1]) | |||||
| for m in matchExprs: | for m in matchExprs: | ||||
| if isinstance(m, MatchTextExpr): | if isinstance(m, MatchTextExpr): | ||||
| minimum_should_match = m.extra_options.get("minimum_should_match", 0.0) | minimum_should_match = m.extra_options.get("minimum_should_match", 0.0) |