You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

infinity_conn.py 27KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import re
  19. import json
  20. import time
  21. import copy
  22. import infinity
  23. from infinity.common import ConflictType, InfinityException, SortType
  24. from infinity.index import IndexInfo, IndexType
  25. from infinity.connection_pool import ConnectionPool
  26. from infinity.errors import ErrorCode
  27. from rag import settings
  28. from rag.settings import PAGERANK_FLD
  29. from rag.utils import singleton
  30. import pandas as pd
  31. from api.utils.file_utils import get_project_base_directory
  32. from rag.utils.doc_store_conn import (
  33. DocStoreConnection,
  34. MatchExpr,
  35. MatchTextExpr,
  36. MatchDenseExpr,
  37. FusionExpr,
  38. OrderByExpr,
  39. )
  40. logger = logging.getLogger('ragflow.infinity_conn')
  41. def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
  42. assert "_id" not in condition
  43. clmns = {}
  44. if table_instance:
  45. for n, ty, de, _ in table_instance.show_columns().rows():
  46. clmns[n] = (ty, de)
  47. def exists(cln):
  48. nonlocal clmns
  49. assert cln in clmns, f"'{cln}' should be in '{clmns}'."
  50. ty, de = clmns[cln]
  51. if ty.lower().find("cha"):
  52. if not de:
  53. de = ""
  54. return f" {cln}!='{de}' "
  55. return f"{cln}!={de}"
  56. cond = list()
  57. for k, v in condition.items():
  58. if not isinstance(k, str) or k in ["kb_id"] or not v:
  59. continue
  60. if isinstance(v, list):
  61. inCond = list()
  62. for item in v:
  63. if isinstance(item, str):
  64. inCond.append(f"'{item}'")
  65. else:
  66. inCond.append(str(item))
  67. if inCond:
  68. strInCond = ", ".join(inCond)
  69. strInCond = f"{k} IN ({strInCond})"
  70. cond.append(strInCond)
  71. elif k == "must_not":
  72. if isinstance(v, dict):
  73. for kk, vv in v.items():
  74. if kk == "exists":
  75. cond.append("NOT (%s)" % exists(vv))
  76. elif isinstance(v, str):
  77. cond.append(f"{k}='{v}'")
  78. elif k == "exists":
  79. cond.append(exists(v))
  80. else:
  81. cond.append(f"{k}={str(v)}")
  82. return " AND ".join(cond) if cond else "1=1"
  83. def concat_dataframes(df_list: list[pd.DataFrame], selectFields: list[str]) -> pd.DataFrame:
  84. df_list2 = [df for df in df_list if not df.empty]
  85. if df_list2:
  86. return pd.concat(df_list2, axis=0).reset_index(drop=True)
  87. schema = []
  88. for field_name in selectFields:
  89. if field_name == 'score()': # Workaround: fix schema is changed to score()
  90. schema.append('SCORE')
  91. elif field_name == 'similarity()': # Workaround: fix schema is changed to similarity()
  92. schema.append('SIMILARITY')
  93. else:
  94. schema.append(field_name)
  95. return pd.DataFrame(columns=schema)
  96. @singleton
  97. class InfinityConnection(DocStoreConnection):
  98. def __init__(self):
  99. self.dbName = settings.INFINITY.get("db_name", "default_db")
  100. infinity_uri = settings.INFINITY["uri"]
  101. if ":" in infinity_uri:
  102. host, port = infinity_uri.split(":")
  103. infinity_uri = infinity.common.NetworkAddress(host, int(port))
  104. self.connPool = None
  105. logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
  106. for _ in range(24):
  107. try:
  108. connPool = ConnectionPool(infinity_uri)
  109. inf_conn = connPool.get_conn()
  110. res = inf_conn.show_current_node()
  111. if res.error_code == ErrorCode.OK and res.server_status in ["started", "alive"]:
  112. self._migrate_db(inf_conn)
  113. self.connPool = connPool
  114. connPool.release_conn(inf_conn)
  115. break
  116. connPool.release_conn(inf_conn)
  117. logger.warn(f"Infinity status: {res.server_status}. Waiting Infinity {infinity_uri} to be healthy.")
  118. time.sleep(5)
  119. except Exception as e:
  120. logger.warning(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
  121. time.sleep(5)
  122. if self.connPool is None:
  123. msg = f"Infinity {infinity_uri} is unhealthy in 120s."
  124. logger.error(msg)
  125. raise Exception(msg)
  126. logger.info(f"Infinity {infinity_uri} is healthy.")
  127. def _migrate_db(self, inf_conn):
  128. inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
  129. fp_mapping = os.path.join(
  130. get_project_base_directory(), "conf", "infinity_mapping.json"
  131. )
  132. if not os.path.exists(fp_mapping):
  133. raise Exception(f"Mapping file not found at {fp_mapping}")
  134. schema = json.load(open(fp_mapping))
  135. table_names = inf_db.list_tables().table_names
  136. for table_name in table_names:
  137. inf_table = inf_db.get_table(table_name)
  138. index_names = inf_table.list_indexes().index_names
  139. if "q_vec_idx" not in index_names:
  140. # Skip tables not created by me
  141. continue
  142. column_names = inf_table.show_columns()["name"]
  143. column_names = set(column_names)
  144. for field_name, field_info in schema.items():
  145. if field_name in column_names:
  146. continue
  147. res = inf_table.add_columns({field_name: field_info})
  148. assert res.error_code == infinity.ErrorCode.OK
  149. logger.info(
  150. f"INFINITY added following column to table {table_name}: {field_name} {field_info}"
  151. )
  152. if field_info["type"] != "varchar" or "analyzer" not in field_info:
  153. continue
  154. inf_table.create_index(
  155. f"text_idx_{field_name}",
  156. IndexInfo(
  157. field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
  158. ),
  159. ConflictType.Ignore,
  160. )
  161. """
  162. Database operations
  163. """
  164. def dbType(self) -> str:
  165. return "infinity"
  166. def health(self) -> dict:
  167. """
  168. Return the health status of the database.
  169. """
  170. inf_conn = self.connPool.get_conn()
  171. res = inf_conn.show_current_node()
  172. self.connPool.release_conn(inf_conn)
  173. res2 = {
  174. "type": "infinity",
  175. "status": "green" if res.error_code == 0 and res.server_status in ["started", "alive"] else "red",
  176. "error": res.error_msg,
  177. }
  178. return res2
  179. """
  180. Table operations
  181. """
  182. def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
  183. table_name = f"{indexName}_{knowledgebaseId}"
  184. inf_conn = self.connPool.get_conn()
  185. inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
  186. fp_mapping = os.path.join(
  187. get_project_base_directory(), "conf", "infinity_mapping.json"
  188. )
  189. if not os.path.exists(fp_mapping):
  190. raise Exception(f"Mapping file not found at {fp_mapping}")
  191. schema = json.load(open(fp_mapping))
  192. vector_name = f"q_{vectorSize}_vec"
  193. schema[vector_name] = {"type": f"vector,{vectorSize},float"}
  194. inf_table = inf_db.create_table(
  195. table_name,
  196. schema,
  197. ConflictType.Ignore,
  198. )
  199. inf_table.create_index(
  200. "q_vec_idx",
  201. IndexInfo(
  202. vector_name,
  203. IndexType.Hnsw,
  204. {
  205. "M": "16",
  206. "ef_construction": "50",
  207. "metric": "cosine",
  208. "encode": "lvq",
  209. },
  210. ),
  211. ConflictType.Ignore,
  212. )
  213. for field_name, field_info in schema.items():
  214. if field_info["type"] != "varchar" or "analyzer" not in field_info:
  215. continue
  216. inf_table.create_index(
  217. f"text_idx_{field_name}",
  218. IndexInfo(
  219. field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
  220. ),
  221. ConflictType.Ignore,
  222. )
  223. self.connPool.release_conn(inf_conn)
  224. logger.info(
  225. f"INFINITY created table {table_name}, vector size {vectorSize}"
  226. )
  227. def deleteIdx(self, indexName: str, knowledgebaseId: str):
  228. table_name = f"{indexName}_{knowledgebaseId}"
  229. inf_conn = self.connPool.get_conn()
  230. db_instance = inf_conn.get_database(self.dbName)
  231. db_instance.drop_table(table_name, ConflictType.Ignore)
  232. self.connPool.release_conn(inf_conn)
  233. logger.info(f"INFINITY dropped table {table_name}")
  234. def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
  235. table_name = f"{indexName}_{knowledgebaseId}"
  236. try:
  237. inf_conn = self.connPool.get_conn()
  238. db_instance = inf_conn.get_database(self.dbName)
  239. _ = db_instance.get_table(table_name)
  240. self.connPool.release_conn(inf_conn)
  241. return True
  242. except Exception as e:
  243. logger.warning(f"INFINITY indexExist {str(e)}")
  244. return False
  245. """
  246. CRUD operations
  247. """
  248. def search(
  249. self, selectFields: list[str],
  250. highlightFields: list[str],
  251. condition: dict,
  252. matchExprs: list[MatchExpr],
  253. orderBy: OrderByExpr,
  254. offset: int,
  255. limit: int,
  256. indexNames: str | list[str],
  257. knowledgebaseIds: list[str],
  258. aggFields: list[str] = [],
  259. rank_feature: dict | None = None
  260. ) -> tuple[pd.DataFrame, int]:
  261. """
  262. TODO: Infinity doesn't provide highlight
  263. """
  264. if isinstance(indexNames, str):
  265. indexNames = indexNames.split(",")
  266. assert isinstance(indexNames, list) and len(indexNames) > 0
  267. inf_conn = self.connPool.get_conn()
  268. db_instance = inf_conn.get_database(self.dbName)
  269. df_list = list()
  270. table_list = list()
  271. output = selectFields.copy()
  272. for essential_field in ["id"]:
  273. if essential_field not in output:
  274. output.append(essential_field)
  275. score_func = ""
  276. score_column = ""
  277. for matchExpr in matchExprs:
  278. if isinstance(matchExpr, MatchTextExpr):
  279. score_func = "score()"
  280. score_column = "SCORE"
  281. break
  282. if not score_func:
  283. for matchExpr in matchExprs:
  284. if isinstance(matchExpr, MatchDenseExpr):
  285. score_func = "similarity()"
  286. score_column = "SIMILARITY"
  287. break
  288. if matchExprs:
  289. if score_func not in output:
  290. output.append(score_func)
  291. if PAGERANK_FLD not in output:
  292. output.append(PAGERANK_FLD)
  293. output = [f for f in output if f != "_score"]
  294. # Prepare expressions common to all tables
  295. filter_cond = None
  296. filter_fulltext = ""
  297. if condition:
  298. for indexName in indexNames:
  299. table_name = f"{indexName}_{knowledgebaseIds[0]}"
  300. filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
  301. break
  302. for matchExpr in matchExprs:
  303. if isinstance(matchExpr, MatchTextExpr):
  304. if filter_cond and "filter" not in matchExpr.extra_options:
  305. matchExpr.extra_options.update({"filter": filter_cond})
  306. fields = ",".join(matchExpr.fields)
  307. filter_fulltext = f"filter_fulltext('{fields}', '{matchExpr.matching_text}')"
  308. if filter_cond:
  309. filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
  310. minimum_should_match = matchExpr.extra_options.get("minimum_should_match", 0.0)
  311. if isinstance(minimum_should_match, float):
  312. str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
  313. matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
  314. for k, v in matchExpr.extra_options.items():
  315. if not isinstance(v, str):
  316. matchExpr.extra_options[k] = str(v)
  317. logger.debug(f"INFINITY search MatchTextExpr: {json.dumps(matchExpr.__dict__)}")
  318. elif isinstance(matchExpr, MatchDenseExpr):
  319. if filter_fulltext and "filter" not in matchExpr.extra_options:
  320. matchExpr.extra_options.update({"filter": filter_fulltext})
  321. for k, v in matchExpr.extra_options.items():
  322. if not isinstance(v, str):
  323. matchExpr.extra_options[k] = str(v)
  324. similarity = matchExpr.extra_options.get("similarity")
  325. if similarity:
  326. matchExpr.extra_options["threshold"] = similarity
  327. del matchExpr.extra_options["similarity"]
  328. logger.debug(f"INFINITY search MatchDenseExpr: {json.dumps(matchExpr.__dict__)}")
  329. elif isinstance(matchExpr, FusionExpr):
  330. logger.debug(f"INFINITY search FusionExpr: {json.dumps(matchExpr.__dict__)}")
  331. order_by_expr_list = list()
  332. if orderBy.fields:
  333. for order_field in orderBy.fields:
  334. if order_field[1] == 0:
  335. order_by_expr_list.append((order_field[0], SortType.Asc))
  336. else:
  337. order_by_expr_list.append((order_field[0], SortType.Desc))
  338. total_hits_count = 0
  339. # Scatter search tables and gather the results
  340. for indexName in indexNames:
  341. for knowledgebaseId in knowledgebaseIds:
  342. table_name = f"{indexName}_{knowledgebaseId}"
  343. try:
  344. table_instance = db_instance.get_table(table_name)
  345. except Exception:
  346. continue
  347. table_list.append(table_name)
  348. builder = table_instance.output(output)
  349. if len(matchExprs) > 0:
  350. for matchExpr in matchExprs:
  351. if isinstance(matchExpr, MatchTextExpr):
  352. fields = ",".join(matchExpr.fields)
  353. builder = builder.match_text(
  354. fields,
  355. matchExpr.matching_text,
  356. matchExpr.topn,
  357. matchExpr.extra_options.copy(),
  358. )
  359. elif isinstance(matchExpr, MatchDenseExpr):
  360. builder = builder.match_dense(
  361. matchExpr.vector_column_name,
  362. matchExpr.embedding_data,
  363. matchExpr.embedding_data_type,
  364. matchExpr.distance_type,
  365. matchExpr.topn,
  366. matchExpr.extra_options.copy(),
  367. )
  368. elif isinstance(matchExpr, FusionExpr):
  369. builder = builder.fusion(
  370. matchExpr.method, matchExpr.topn, matchExpr.fusion_params
  371. )
  372. else:
  373. if len(filter_cond) > 0:
  374. builder.filter(filter_cond)
  375. if orderBy.fields:
  376. builder.sort(order_by_expr_list)
  377. builder.offset(offset).limit(limit)
  378. kb_res, extra_result = builder.option({"total_hits_count": True}).to_df()
  379. if extra_result:
  380. total_hits_count += int(extra_result["total_hits_count"])
  381. logger.debug(f"INFINITY search table: {str(table_name)}, result: {str(kb_res)}")
  382. df_list.append(kb_res)
  383. self.connPool.release_conn(inf_conn)
  384. res = concat_dataframes(df_list, output)
  385. if matchExprs:
  386. res['Sum'] = res[score_column] + res[PAGERANK_FLD]
  387. res = res.sort_values(by='Sum', ascending=False).reset_index(drop=True).drop(columns=['Sum'])
  388. res = res.head(limit)
  389. logger.debug(f"INFINITY search final result: {str(res)}")
  390. return res, total_hits_count
  391. def get(
  392. self, chunkId: str, indexName: str, knowledgebaseIds: list[str]
  393. ) -> dict | None:
  394. inf_conn = self.connPool.get_conn()
  395. db_instance = inf_conn.get_database(self.dbName)
  396. df_list = list()
  397. assert isinstance(knowledgebaseIds, list)
  398. table_list = list()
  399. for knowledgebaseId in knowledgebaseIds:
  400. table_name = f"{indexName}_{knowledgebaseId}"
  401. table_list.append(table_name)
  402. table_instance = None
  403. try:
  404. table_instance = db_instance.get_table(table_name)
  405. except Exception:
  406. logger.warning(
  407. f"Table not found: {table_name}, this knowledge base isn't created in Infinity. Maybe it is created in other document engine.")
  408. continue
  409. kb_res, _ = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_df()
  410. logger.debug(f"INFINITY get table: {str(table_list)}, result: {str(kb_res)}")
  411. df_list.append(kb_res)
  412. self.connPool.release_conn(inf_conn)
  413. res = concat_dataframes(df_list, ["id"])
  414. res_fields = self.getFields(res, res.columns.tolist())
  415. return res_fields.get(chunkId, None)
  416. def insert(
  417. self, documents: list[dict], indexName: str, knowledgebaseId: str = None
  418. ) -> list[str]:
  419. inf_conn = self.connPool.get_conn()
  420. db_instance = inf_conn.get_database(self.dbName)
  421. table_name = f"{indexName}_{knowledgebaseId}"
  422. try:
  423. table_instance = db_instance.get_table(table_name)
  424. except InfinityException as e:
  425. # src/common/status.cppm, kTableNotExist = 3022
  426. if e.error_code != ErrorCode.TABLE_NOT_EXIST:
  427. raise
  428. vector_size = 0
  429. patt = re.compile(r"q_(?P<vector_size>\d+)_vec")
  430. for k in documents[0].keys():
  431. m = patt.match(k)
  432. if m:
  433. vector_size = int(m.group("vector_size"))
  434. break
  435. if vector_size == 0:
  436. raise ValueError("Cannot infer vector size from documents")
  437. self.createIdx(indexName, knowledgebaseId, vector_size)
  438. table_instance = db_instance.get_table(table_name)
  439. # embedding fields can't have a default value....
  440. embedding_clmns = []
  441. clmns = table_instance.show_columns().rows()
  442. for n, ty, _, _ in clmns:
  443. r = re.search(r"Embedding\([a-z]+,([0-9]+)\)", ty)
  444. if not r:
  445. continue
  446. embedding_clmns.append((n, int(r.group(1))))
  447. docs = copy.deepcopy(documents)
  448. for d in docs:
  449. assert "_id" not in d
  450. assert "id" in d
  451. for k, v in d.items():
  452. if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
  453. assert isinstance(v, list)
  454. d[k] = "###".join(v)
  455. elif re.search(r"_feas$", k):
  456. d[k] = json.dumps(v)
  457. elif k == 'kb_id':
  458. if isinstance(d[k], list):
  459. d[k] = d[k][0] # since d[k] is a list, but we need a str
  460. elif k == "position_int":
  461. assert isinstance(v, list)
  462. arr = [num for row in v for num in row]
  463. d[k] = "_".join(f"{num:08x}" for num in arr)
  464. elif k in ["page_num_int", "top_int"]:
  465. assert isinstance(v, list)
  466. d[k] = "_".join(f"{num:08x}" for num in v)
  467. for n, vs in embedding_clmns:
  468. if n in d:
  469. continue
  470. d[n] = [0] * vs
  471. ids = ["'{}'".format(d["id"]) for d in docs]
  472. str_ids = ", ".join(ids)
  473. str_filter = f"id IN ({str_ids})"
  474. table_instance.delete(str_filter)
  475. # for doc in documents:
  476. # logger.info(f"insert position_int: {doc['position_int']}")
  477. # logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
  478. table_instance.insert(docs)
  479. self.connPool.release_conn(inf_conn)
  480. logger.debug(f"INFINITY inserted into {table_name} {str_ids}.")
  481. return []
  482. def update(
  483. self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
  484. ) -> bool:
  485. # if 'position_int' in newValue:
  486. # logger.info(f"update position_int: {newValue['position_int']}")
  487. inf_conn = self.connPool.get_conn()
  488. db_instance = inf_conn.get_database(self.dbName)
  489. table_name = f"{indexName}_{knowledgebaseId}"
  490. table_instance = db_instance.get_table(table_name)
  491. #if "exists" in condition:
  492. # del condition["exists"]
  493. filter = equivalent_condition_to_str(condition, table_instance)
  494. for k, v in list(newValue.items()):
  495. if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
  496. assert isinstance(v, list)
  497. newValue[k] = "###".join(v)
  498. elif re.search(r"_feas$", k):
  499. newValue[k] = json.dumps(v)
  500. elif k.endswith("_kwd") and isinstance(v, list):
  501. newValue[k] = " ".join(v)
  502. elif k == 'kb_id':
  503. if isinstance(newValue[k], list):
  504. newValue[k] = newValue[k][0] # since d[k] is a list, but we need a str
  505. elif k == "position_int":
  506. assert isinstance(v, list)
  507. arr = [num for row in v for num in row]
  508. newValue[k] = "_".join(f"{num:08x}" for num in arr)
  509. elif k in ["page_num_int", "top_int"]:
  510. assert isinstance(v, list)
  511. newValue[k] = "_".join(f"{num:08x}" for num in v)
  512. elif k == "remove":
  513. del newValue[k]
  514. if v in [PAGERANK_FLD]:
  515. newValue[v] = 0
  516. logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
  517. table_instance.update(filter, newValue)
  518. self.connPool.release_conn(inf_conn)
  519. return True
  520. def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
  521. inf_conn = self.connPool.get_conn()
  522. db_instance = inf_conn.get_database(self.dbName)
  523. table_name = f"{indexName}_{knowledgebaseId}"
  524. try:
  525. table_instance = db_instance.get_table(table_name)
  526. except Exception:
  527. logger.warning(
  528. f"Skipped deleting from table {table_name} since the table doesn't exist."
  529. )
  530. return 0
  531. filter = equivalent_condition_to_str(condition, table_instance)
  532. logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
  533. res = table_instance.delete(filter)
  534. self.connPool.release_conn(inf_conn)
  535. return res.deleted_rows
  536. """
  537. Helper functions for search result
  538. """
  539. def getTotal(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> int:
  540. if isinstance(res, tuple):
  541. return res[1]
  542. return len(res)
  543. def getChunkIds(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> list[str]:
  544. if isinstance(res, tuple):
  545. res = res[0]
  546. return list(res["id"])
  547. def getFields(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fields: list[str]) -> dict[str, dict]:
  548. if isinstance(res, tuple):
  549. res = res[0]
  550. if not fields:
  551. return {}
  552. fieldsAll = fields.copy()
  553. fieldsAll.append('id')
  554. column_map = {col.lower(): col for col in res.columns}
  555. matched_columns = {column_map[col.lower()]:col for col in set(fieldsAll) if col.lower() in column_map}
  556. none_columns = [col for col in set(fieldsAll) if col.lower() not in column_map]
  557. res2 = res[matched_columns.keys()]
  558. res2 = res2.rename(columns=matched_columns)
  559. res2.drop_duplicates(subset=['id'], inplace=True)
  560. for column in res2.columns:
  561. k = column.lower()
  562. if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd", "source_id"]:
  563. res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
  564. elif k == "position_int":
  565. def to_position_int(v):
  566. if v:
  567. arr = [int(hex_val, 16) for hex_val in v.split('_')]
  568. v = [arr[i:i + 5] for i in range(0, len(arr), 5)]
  569. else:
  570. v = []
  571. return v
  572. res2[column] = res2[column].apply(to_position_int)
  573. elif k in ["page_num_int", "top_int"]:
  574. res2[column] = res2[column].apply(lambda v:[int(hex_val, 16) for hex_val in v.split('_')] if v else [])
  575. else:
  576. pass
  577. for column in none_columns:
  578. res2[column] = None
  579. return res2.set_index("id").to_dict(orient="index")
  580. def getHighlight(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, keywords: list[str], fieldnm: str):
  581. if isinstance(res, tuple):
  582. res = res[0]
  583. ans = {}
  584. num_rows = len(res)
  585. column_id = res["id"]
  586. if fieldnm not in res:
  587. return {}
  588. for i in range(num_rows):
  589. id = column_id[i]
  590. txt = res[fieldnm][i]
  591. txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
  592. txts = []
  593. for t in re.split(r"[.?!;\n]", txt):
  594. for w in keywords:
  595. t = re.sub(
  596. r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"
  597. % re.escape(w),
  598. r"\1<em>\2</em>\3",
  599. t,
  600. flags=re.IGNORECASE | re.MULTILINE,
  601. )
  602. if not re.search(
  603. r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE
  604. ):
  605. continue
  606. txts.append(t)
  607. ans[id] = "...".join(txts)
  608. return ans
  609. def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
  610. """
  611. TODO: Infinity doesn't provide aggregation
  612. """
  613. return list()
  614. """
  615. SQL
  616. """
  617. def sql(sql: str, fetch_size: int, format: str):
  618. raise NotImplementedError("Not implemented")