Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

infinity_conn.py 27KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import re
  19. import json
  20. import time
  21. import copy
  22. import infinity
  23. from infinity.common import ConflictType, InfinityException, SortType
  24. from infinity.index import IndexInfo, IndexType
  25. from infinity.connection_pool import ConnectionPool
  26. from infinity.errors import ErrorCode
  27. from rag import settings
  28. from rag.settings import PAGERANK_FLD
  29. from rag.utils import singleton
  30. import pandas as pd
  31. from api.utils.file_utils import get_project_base_directory
  32. from rag.utils.doc_store_conn import (
  33. DocStoreConnection,
  34. MatchExpr,
  35. MatchTextExpr,
  36. MatchDenseExpr,
  37. FusionExpr,
  38. OrderByExpr,
  39. )
  40. logger = logging.getLogger('ragflow.infinity_conn')
  41. def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
  42. assert "_id" not in condition
  43. clmns = {}
  44. if table_instance:
  45. for n, ty, de, _ in table_instance.show_columns().rows():
  46. clmns[n] = (ty, de)
  47. def exists(cln):
  48. nonlocal clmns
  49. assert cln in clmns, f"'{cln}' should be in '{clmns}'."
  50. ty, de = clmns[cln]
  51. if ty.lower().find("cha"):
  52. if not de:
  53. de = ""
  54. return f" {cln}!='{de}' "
  55. return f"{cln}!={de}"
  56. cond = list()
  57. for k, v in condition.items():
  58. if not isinstance(k, str) or k in ["kb_id"] or not v:
  59. continue
  60. if isinstance(v, list):
  61. inCond = list()
  62. for item in v:
  63. if isinstance(item, str):
  64. inCond.append(f"'{item}'")
  65. else:
  66. inCond.append(str(item))
  67. if inCond:
  68. strInCond = ", ".join(inCond)
  69. strInCond = f"{k} IN ({strInCond})"
  70. cond.append(strInCond)
  71. elif k == "must_not":
  72. if isinstance(v, dict):
  73. for kk, vv in v.items():
  74. if kk == "exists":
  75. cond.append("NOT (%s)" % exists(vv))
  76. elif isinstance(v, str):
  77. cond.append(f"{k}='{v}'")
  78. elif k == "exists":
  79. cond.append(exists(v))
  80. else:
  81. cond.append(f"{k}={str(v)}")
  82. return " AND ".join(cond) if cond else "1=1"
  def concat_dataframes(df_list: list[pd.DataFrame], selectFields: list[str]) -> pd.DataFrame:
      """Stack the non-empty frames of ``df_list`` into one frame with a fresh index.

      When every frame is empty (or the list itself is empty) an empty frame is
      returned whose columns follow ``selectFields``, with ``score()`` and
      ``similarity()`` spelled as ``SCORE`` and ``SIMILARITY``.
      """
      populated = [frame for frame in df_list if not frame.empty]
      if populated:
          return pd.concat(populated, axis=0).reset_index(drop=True)
      # Workaround: the result schema names these two function columns in upper case.
      result_names = {"score()": "SCORE", "similarity()": "SIMILARITY"}
      return pd.DataFrame(columns=[result_names.get(name, name) for name in selectFields])
  96. @singleton
  97. class InfinityConnection(DocStoreConnection):
  def __init__(self):
      """Connect to Infinity, waiting until the node reports itself healthy.

      Reads ``db_name`` and ``uri`` from ``settings.INFINITY``. The node is
      probed up to 24 times with a 5 second pause between attempts; on the first
      healthy probe the schema migration is run and the pool is kept.

      Raises:
          Exception: If no attempt succeeded.
      """
      self.dbName = settings.INFINITY.get("db_name", "default_db")
      infinity_uri = settings.INFINITY["uri"]
      if ":" in infinity_uri:
          host, port = infinity_uri.split(":")
          infinity_uri = infinity.common.NetworkAddress(host, int(port))
      self.connPool = None
      logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
      for _ in range(24):
          try:
              connPool = ConnectionPool(infinity_uri)
              inf_conn = connPool.get_conn()
              try:
                  res = inf_conn.show_current_node()
                  if res.error_code == ErrorCode.OK and res.server_status in ["started", "alive"]:
                      self._migrate_db(inf_conn)
                      self.connPool = connPool
                      break
              finally:
                  # Hand the probe connection back even if the status check or the
                  # migration raised, so a retry does not strand it.
                  connPool.release_conn(inf_conn)
              logger.warning(f"Infinity status: {res.server_status}. Waiting Infinity {infinity_uri} to be healthy.")
              time.sleep(5)
          except Exception as e:
              logger.warning(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
              time.sleep(5)
      if self.connPool is None:
          msg = f"Infinity {infinity_uri} is unhealthy in 120s."
          logger.error(msg)
          raise Exception(msg)
      logger.info(f"Infinity {infinity_uri} is healthy.")
  def _migrate_db(self, inf_conn):
      """Add columns from the mapping file that existing tables are missing.

      Loads ``conf/infinity_mapping.json`` under the project base directory and,
      for every table that has a ``q_vec_idx`` index, adds each mapped column
      the table lacks. A full-text index is also created for a newly added
      ``varchar`` column that declares an ``analyzer``.

      Args:
          inf_conn: An open Infinity connection.

      Raises:
          Exception: If the mapping file does not exist.
          AssertionError: If adding a column does not return ``ErrorCode.OK``.
      """
      inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
      fp_mapping = os.path.join(
          get_project_base_directory(), "conf", "infinity_mapping.json"
      )
      if not os.path.exists(fp_mapping):
          raise Exception(f"Mapping file not found at {fp_mapping}")
      # Context manager so the file handle is closed deterministically.
      with open(fp_mapping) as mapping_file:
          schema = json.load(mapping_file)
      table_names = inf_db.list_tables().table_names
      for table_name in table_names:
          inf_table = inf_db.get_table(table_name)
          index_names = inf_table.list_indexes().index_names
          if "q_vec_idx" not in index_names:
              # Skip tables not created by me
              continue
          column_names = set(inf_table.show_columns()["name"])
          for field_name, field_info in schema.items():
              if field_name in column_names:
                  continue
              res = inf_table.add_columns({field_name: field_info})
              assert res.error_code == infinity.ErrorCode.OK
              logger.info(
                  f"INFINITY added following column to table {table_name}: {field_name} {field_info}"
              )
              if field_info["type"] != "varchar" or "analyzer" not in field_info:
                  continue
              inf_table.create_index(
                  f"text_idx_{field_name}",
                  IndexInfo(
                      field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
                  ),
                  ConflictType.Ignore,
              )
  def field_keyword(self, field_name: str):
      """Tell whether ``field_name`` is treated as a keyword-list field.

      ``source_id`` and every ``*_kwd`` field qualify, except ``docnm_kwd``
      (always a plain string, not a list) and ``knowledge_graph_kwd``.
      """
      if field_name == "source_id":
          return True
      if field_name in ("docnm_kwd", "knowledge_graph_kwd"):
          return False
      return field_name.endswith("_kwd")
  166. """
  167. Database operations
  168. """
  def dbType(self) -> str:
      """Return the identifier of this document-store backend."""
      engine_name = "infinity"
      return engine_name
  def health(self) -> dict:
      """Return the health status of the database.

      Returns:
          A dict with ``type`` (always ``"infinity"``), ``status`` (``"green"``
          when the node answers with error code 0 and a ``started``/``alive``
          status, otherwise ``"red"``) and ``error`` (the node's error message).
      """
      inf_conn = self.connPool.get_conn()
      try:
          res = inf_conn.show_current_node()
      finally:
          # Return the connection to the pool even when the probe raises.
          self.connPool.release_conn(inf_conn)
      is_up = res.error_code == 0 and res.server_status in ["started", "alive"]
      return {
          "type": "infinity",
          "status": "green" if is_up else "red",
          "error": res.error_msg,
      }
  184. """
  185. Table operations
  186. """
  def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
      """Create the table ``{indexName}_{knowledgebaseId}`` together with its indexes.

      The column schema comes from ``conf/infinity_mapping.json`` plus one
      vector column ``q_{vectorSize}_vec``. An HNSW index named ``q_vec_idx``
      is created on the vector column, and a full-text index on every
      ``varchar`` column that declares an ``analyzer``. Existing objects are
      left alone (``ConflictType.Ignore``).

      Raises:
          Exception: If the mapping file does not exist.
      """
      table_name = f"{indexName}_{knowledgebaseId}"
      inf_conn = self.connPool.get_conn()
      try:
          inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
          fp_mapping = os.path.join(
              get_project_base_directory(), "conf", "infinity_mapping.json"
          )
          if not os.path.exists(fp_mapping):
              raise Exception(f"Mapping file not found at {fp_mapping}")
          # Context manager so the file handle is closed deterministically.
          with open(fp_mapping) as mapping_file:
              schema = json.load(mapping_file)
          vector_name = f"q_{vectorSize}_vec"
          schema[vector_name] = {"type": f"vector,{vectorSize},float"}
          inf_table = inf_db.create_table(
              table_name,
              schema,
              ConflictType.Ignore,
          )
          inf_table.create_index(
              "q_vec_idx",
              IndexInfo(
                  vector_name,
                  IndexType.Hnsw,
                  {
                      "M": "16",
                      "ef_construction": "50",
                      "metric": "cosine",
                      "encode": "lvq",
                  },
              ),
              ConflictType.Ignore,
          )
          for field_name, field_info in schema.items():
              if field_info["type"] != "varchar" or "analyzer" not in field_info:
                  continue
              inf_table.create_index(
                  f"text_idx_{field_name}",
                  IndexInfo(
                      field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
                  ),
                  ConflictType.Ignore,
              )
      finally:
          # Always give the connection back, including on the error paths above.
          self.connPool.release_conn(inf_conn)
      logger.info(
          f"INFINITY created table {table_name}, vector size {vectorSize}"
      )
  def deleteIdx(self, indexName: str, knowledgebaseId: str):
      """Drop the table ``{indexName}_{knowledgebaseId}``; a missing table is ignored."""
      table_name = f"{indexName}_{knowledgebaseId}"
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          db_instance.drop_table(table_name, ConflictType.Ignore)
      finally:
          # Release the connection even if the drop fails.
          self.connPool.release_conn(inf_conn)
      logger.info(f"INFINITY dropped table {table_name}")
  def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
      """Report whether the table ``{indexName}_{knowledgebaseId}`` can be opened.

      Any exception raised while looking the table up is logged as a warning
      and reported as ``False``.
      """
      table_name = f"{indexName}_{knowledgebaseId}"
      try:
          inf_conn = self.connPool.get_conn()
          try:
              db_instance = inf_conn.get_database(self.dbName)
              _ = db_instance.get_table(table_name)
          finally:
              # The lookup raises for a missing table; without this the
              # connection would be stranded on every negative answer.
              self.connPool.release_conn(inf_conn)
          return True
      except Exception as e:
          logger.warning(f"INFINITY indexExist {str(e)}")
      return False
  250. """
  251. CRUD operations
  252. """
  def search(
      self, selectFields: list[str],
      highlightFields: list[str],
      condition: dict,
      matchExprs: list[MatchExpr],
      orderBy: OrderByExpr,
      offset: int,
      limit: int,
      indexNames: str | list[str],
      knowledgebaseIds: list[str],
      aggFields: list[str] | None = None,
      rank_feature: dict | None = None
  ) -> tuple[pd.DataFrame, int]:
      """Run one query against every ``{indexName}_{knowledgebaseId}`` table and merge the hits.

      TODO: Infinity doesn't provide highlight

      Args:
          selectFields: Columns to return; ``id`` is always added.
          highlightFields: Not used by this implementation.
          condition: Equality-style filter, converted with
              ``equivalent_condition_to_str`` against the first table.
          matchExprs: Text / dense / fusion expressions applied in order. Their
              ``extra_options`` dicts are normalised in place (values turned
              into strings, ``similarity`` renamed to ``threshold``).
          orderBy: Sort specification; a direction of 0 means ascending.
          offset: Offset applied to each table query.
          limit: Limit applied to each table query and to the merged result
              when match expressions are present.
          indexNames: Index name list, or a comma-separated string of names.
          knowledgebaseIds: Knowledge base ids combined with each index name.
          aggFields: Not used by this implementation.
          rank_feature: Not used by this implementation.

      Returns:
          ``(frame, total_hits_count)``. Tables that cannot be opened are skipped.
      """
      if isinstance(indexNames, str):
          indexNames = indexNames.split(",")
      assert isinstance(indexNames, list) and len(indexNames) > 0
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          df_list = []
          output = selectFields.copy()
          for essential_field in ["id"]:
              if essential_field not in output:
                  output.append(essential_field)

          # A text match determines the score column; otherwise a dense match does.
          score_func = ""
          score_column = ""
          for matchExpr in matchExprs:
              if isinstance(matchExpr, MatchTextExpr):
                  score_func = "score()"
                  score_column = "SCORE"
                  break
          if not score_func:
              for matchExpr in matchExprs:
                  if isinstance(matchExpr, MatchDenseExpr):
                      score_func = "similarity()"
                      score_column = "SIMILARITY"
                      break
          if matchExprs:
              if score_func not in output:
                  output.append(score_func)
              if PAGERANK_FLD not in output:
                  output.append(PAGERANK_FLD)
          output = [f for f in output if f != "_score"]

          # Prepare expressions common to all tables
          filter_cond = None
          filter_fulltext = ""
          if condition:
              table_name = f"{indexNames[0]}_{knowledgebaseIds[0]}"
              filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
          for matchExpr in matchExprs:
              if isinstance(matchExpr, MatchTextExpr):
                  if filter_cond and "filter" not in matchExpr.extra_options:
                      matchExpr.extra_options.update({"filter": filter_cond})
                  fields = ",".join(matchExpr.fields)
                  filter_fulltext = f"filter_fulltext('{fields}', '{matchExpr.matching_text}')"
                  if filter_cond:
                      filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
                  minimum_should_match = matchExpr.extra_options.get("minimum_should_match", 0.0)
                  if isinstance(minimum_should_match, float):
                      str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                      matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
                  for k, v in matchExpr.extra_options.items():
                      if not isinstance(v, str):
                          matchExpr.extra_options[k] = str(v)
                  logger.debug(f"INFINITY search MatchTextExpr: {json.dumps(matchExpr.__dict__)}")
              elif isinstance(matchExpr, MatchDenseExpr):
                  if filter_fulltext and "filter" not in matchExpr.extra_options:
                      matchExpr.extra_options.update({"filter": filter_fulltext})
                  for k, v in matchExpr.extra_options.items():
                      if not isinstance(v, str):
                          matchExpr.extra_options[k] = str(v)
                  similarity = matchExpr.extra_options.get("similarity")
                  if similarity:
                      matchExpr.extra_options["threshold"] = similarity
                      del matchExpr.extra_options["similarity"]
                  logger.debug(f"INFINITY search MatchDenseExpr: {json.dumps(matchExpr.__dict__)}")
              elif isinstance(matchExpr, FusionExpr):
                  logger.debug(f"INFINITY search FusionExpr: {json.dumps(matchExpr.__dict__)}")

          order_by_expr_list = []
          if orderBy.fields:
              for order_field in orderBy.fields:
                  if order_field[1] == 0:
                      order_by_expr_list.append((order_field[0], SortType.Asc))
                  else:
                      order_by_expr_list.append((order_field[0], SortType.Desc))

          total_hits_count = 0
          # Scatter search tables and gather the results
          for indexName in indexNames:
              for knowledgebaseId in knowledgebaseIds:
                  table_name = f"{indexName}_{knowledgebaseId}"
                  try:
                      table_instance = db_instance.get_table(table_name)
                  except Exception:
                      continue
                  builder = table_instance.output(output)
                  if len(matchExprs) > 0:
                      for matchExpr in matchExprs:
                          if isinstance(matchExpr, MatchTextExpr):
                              fields = ",".join(matchExpr.fields)
                              builder = builder.match_text(
                                  fields,
                                  matchExpr.matching_text,
                                  matchExpr.topn,
                                  matchExpr.extra_options.copy(),
                              )
                          elif isinstance(matchExpr, MatchDenseExpr):
                              builder = builder.match_dense(
                                  matchExpr.vector_column_name,
                                  matchExpr.embedding_data,
                                  matchExpr.embedding_data_type,
                                  matchExpr.distance_type,
                                  matchExpr.topn,
                                  matchExpr.extra_options.copy(),
                              )
                          elif isinstance(matchExpr, FusionExpr):
                              builder = builder.fusion(
                                  matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                              )
                  else:
                      # filter_cond stays None when no condition was given;
                      # calling len() on it would raise TypeError.
                      if filter_cond:
                          builder.filter(filter_cond)
                  if orderBy.fields:
                      builder.sort(order_by_expr_list)
                  builder.offset(offset).limit(limit)
                  kb_res, extra_result = builder.option({"total_hits_count": True}).to_df()
                  if extra_result:
                      total_hits_count += int(extra_result["total_hits_count"])
                  logger.debug(f"INFINITY search table: {str(table_name)}, result: {str(kb_res)}")
                  df_list.append(kb_res)
      finally:
          # Release the connection whether the query succeeded or raised.
          self.connPool.release_conn(inf_conn)
      res = concat_dataframes(df_list, output)
      if matchExprs:
          # Re-rank the gathered rows by match score plus page rank.
          res['Sum'] = res[score_column] + res[PAGERANK_FLD]
          res = res.sort_values(by='Sum', ascending=False).reset_index(drop=True).drop(columns=['Sum'])
          res = res.head(limit)
      logger.debug(f"INFINITY search final result: {str(res)}")
      return res, total_hits_count
  def get(
      self, chunkId: str, indexName: str, knowledgebaseIds: list[str]
  ) -> dict | None:
      """Fetch one chunk by id from the tables of the given knowledge bases.

      Each ``{indexName}_{knowledgebaseId}`` table is queried for
      ``id = chunkId``; tables that cannot be opened are logged and skipped.

      Returns:
          The chunk's fields as produced by ``getFields``, or ``None`` when no
          table returned the chunk.
      """
      assert isinstance(knowledgebaseIds, list)
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          df_list = []
          table_list = []
          for knowledgebaseId in knowledgebaseIds:
              table_name = f"{indexName}_{knowledgebaseId}"
              table_list.append(table_name)
              try:
                  table_instance = db_instance.get_table(table_name)
              except Exception:
                  logger.warning(
                      f"Table not found: {table_name}, this knowledge base isn't created in Infinity. Maybe it is created in other document engine.")
                  continue
              kb_res, _ = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_df()
              logger.debug(f"INFINITY get table: {str(table_list)}, result: {str(kb_res)}")
              df_list.append(kb_res)
      finally:
          # Release the connection even if a query raised.
          self.connPool.release_conn(inf_conn)
      res = concat_dataframes(df_list, ["id"])
      res_fields = self.getFields(res, res.columns.tolist())
      return res_fields.get(chunkId, None)
  def insert(
      self, documents: list[dict], indexName: str, knowledgebaseId: str = None
  ) -> list[str]:
      """Insert ``documents`` into ``{indexName}_{knowledgebaseId}``, replacing rows with the same id.

      If the table does not exist it is created first; the vector size is
      taken from the first ``q_<n>_vec`` key of the first document. The input
      is deep-copied and the copies are converted to the stored form:

      * keyword-list fields: list joined with ``###``
      * ``*_feas`` fields: JSON-encoded
      * ``kb_id``: first element when given as a list
      * ``position_int``: rows flattened, values as 8-digit hex joined by ``_``
      * ``page_num_int`` / ``top_int``: values as 8-digit hex joined by ``_``
      * embedding columns absent from a document: filled with zeros

      Returns:
          Always an empty list.

      Raises:
          ValueError: If the table must be created but no vector size can be
              inferred from the documents.
          AssertionError: If a document has ``_id`` or lacks ``id``.
      """
      if not documents:
          # Nothing to write; also avoids indexing documents[0] and building an
          # "id IN ()" filter from an empty list.
          return []
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          table_name = f"{indexName}_{knowledgebaseId}"
          try:
              table_instance = db_instance.get_table(table_name)
          except InfinityException as e:
              # src/common/status.cppm, kTableNotExist = 3022
              if e.error_code != ErrorCode.TABLE_NOT_EXIST:
                  raise
              vector_size = 0
              patt = re.compile(r"q_(?P<vector_size>\d+)_vec")
              for k in documents[0].keys():
                  m = patt.match(k)
                  if m:
                      vector_size = int(m.group("vector_size"))
                      break
              if vector_size == 0:
                  raise ValueError("Cannot infer vector size from documents")
              self.createIdx(indexName, knowledgebaseId, vector_size)
              table_instance = db_instance.get_table(table_name)

          # embedding fields can't have a default value....
          embedding_clmns = []
          for n, ty, _, _ in table_instance.show_columns().rows():
              r = re.search(r"Embedding\([a-z]+,([0-9]+)\)", ty)
              if not r:
                  continue
              embedding_clmns.append((n, int(r.group(1))))

          docs = copy.deepcopy(documents)
          for d in docs:
              assert "_id" not in d
              assert "id" in d
              for k, v in d.items():
                  if self.field_keyword(k):
                      if isinstance(v, list):
                          d[k] = "###".join(v)
                  elif re.search(r"_feas$", k):
                      d[k] = json.dumps(v)
                  elif k == 'kb_id':
                      if isinstance(v, list):
                          d[k] = v[0]  # since d[k] is a list, but we need a str
                  elif k == "position_int":
                      assert isinstance(v, list)
                      arr = [num for row in v for num in row]
                      d[k] = "_".join(f"{num:08x}" for num in arr)
                  elif k in ["page_num_int", "top_int"]:
                      assert isinstance(v, list)
                      d[k] = "_".join(f"{num:08x}" for num in v)
              for n, vs in embedding_clmns:
                  if n in d:
                      continue
                  d[n] = [0] * vs

          # Remove any existing rows with the same ids before inserting.
          str_ids = ", ".join("'{}'".format(d["id"]) for d in docs)
          table_instance.delete(f"id IN ({str_ids})")
          table_instance.insert(docs)
      finally:
          # Release the connection even when conversion or the write raised.
          self.connPool.release_conn(inf_conn)
      logger.debug(f"INFINITY inserted into {table_name} {str_ids}.")
      return []
  def update(
      self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
  ) -> bool:
      """Update the rows of ``{indexName}_{knowledgebaseId}`` that match ``condition``.

      ``newValue`` is converted in place to the stored form (the caller's dict
      is modified): keyword-list fields are joined with ``###``, ``*_feas``
      fields are JSON-encoded, a list ``kb_id`` is reduced to its first
      element, and ``position_int`` / ``page_num_int`` / ``top_int`` become
      ``_``-joined 8-digit hex strings. A ``remove`` entry is dropped, and when
      it names the page-rank field that field is set to 0 instead.

      Returns:
          Always ``True``.
      """
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          table_name = f"{indexName}_{knowledgebaseId}"
          table_instance = db_instance.get_table(table_name)
          filter_str = equivalent_condition_to_str(condition, table_instance)
          # Iterate over a snapshot because the "remove" branch deletes a key.
          for k, v in list(newValue.items()):
              if self.field_keyword(k):
                  if isinstance(v, list):
                      newValue[k] = "###".join(v)
              elif re.search(r"_feas$", k):
                  newValue[k] = json.dumps(v)
              elif k == 'kb_id':
                  if isinstance(v, list):
                      newValue[k] = v[0]  # since d[k] is a list, but we need a str
              elif k == "position_int":
                  assert isinstance(v, list)
                  arr = [num for row in v for num in row]
                  newValue[k] = "_".join(f"{num:08x}" for num in arr)
              elif k in ["page_num_int", "top_int"]:
                  assert isinstance(v, list)
                  newValue[k] = "_".join(f"{num:08x}" for num in v)
              elif k == "remove":
                  del newValue[k]
                  if v in [PAGERANK_FLD]:
                      newValue[v] = 0
          logger.debug(f"INFINITY update table {table_name}, filter {filter_str}, newValue {newValue}.")
          table_instance.update(filter_str, newValue)
      finally:
          # Release the connection even if the lookup or the update raised.
          self.connPool.release_conn(inf_conn)
      return True
  def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
      """Delete the rows of ``{indexName}_{knowledgebaseId}`` that match ``condition``.

      Returns:
          The number of deleted rows, or 0 when the table cannot be opened.
      """
      inf_conn = self.connPool.get_conn()
      try:
          db_instance = inf_conn.get_database(self.dbName)
          table_name = f"{indexName}_{knowledgebaseId}"
          try:
              table_instance = db_instance.get_table(table_name)
          except Exception:
              logger.warning(
                  f"Skipped deleting from table {table_name} since the table doesn't exist."
              )
              return 0
          filter_str = equivalent_condition_to_str(condition, table_instance)
          logger.debug(f"INFINITY delete table {table_name}, filter {filter_str}.")
          res = table_instance.delete(filter_str)
          return res.deleted_rows
      finally:
          # Covers the early "table missing" return as well as failures.
          self.connPool.release_conn(inf_conn)
  547. """
  548. Helper functions for search result
  549. """
  550. def getTotal(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> int:
  551. if isinstance(res, tuple):
  552. return res[1]
  553. return len(res)
  554. def getChunkIds(self, res: tuple[pd.DataFrame, int] | pd.DataFrame) -> list[str]:
  555. if isinstance(res, tuple):
  556. res = res[0]
  557. return list(res["id"])
  558. def getFields(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fields: list[str]) -> dict[str, dict]:
  559. if isinstance(res, tuple):
  560. res = res[0]
  561. if not fields:
  562. return {}
  563. fieldsAll = fields.copy()
  564. fieldsAll.append('id')
  565. column_map = {col.lower(): col for col in res.columns}
  566. matched_columns = {column_map[col.lower()]:col for col in set(fieldsAll) if col.lower() in column_map}
  567. none_columns = [col for col in set(fieldsAll) if col.lower() not in column_map]
  568. res2 = res[matched_columns.keys()]
  569. res2 = res2.rename(columns=matched_columns)
  570. res2.drop_duplicates(subset=['id'], inplace=True)
  571. for column in res2.columns:
  572. k = column.lower()
  573. if self.field_keyword(k):
  574. res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
  575. elif k == "position_int":
  576. def to_position_int(v):
  577. if v:
  578. arr = [int(hex_val, 16) for hex_val in v.split('_')]
  579. v = [arr[i:i + 5] for i in range(0, len(arr), 5)]
  580. else:
  581. v = []
  582. return v
  583. res2[column] = res2[column].apply(to_position_int)
  584. elif k in ["page_num_int", "top_int"]:
  585. res2[column] = res2[column].apply(lambda v:[int(hex_val, 16) for hex_val in v.split('_')] if v else [])
  586. else:
  587. pass
  588. for column in none_columns:
  589. res2[column] = None
  590. return res2.set_index("id").to_dict(orient="index")
  591. def getHighlight(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, keywords: list[str], fieldnm: str):
  592. if isinstance(res, tuple):
  593. res = res[0]
  594. ans = {}
  595. num_rows = len(res)
  596. column_id = res["id"]
  597. if fieldnm not in res:
  598. return {}
  599. for i in range(num_rows):
  600. id = column_id[i]
  601. txt = res[fieldnm][i]
  602. txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
  603. txts = []
  604. for t in re.split(r"[.?!;\n]", txt):
  605. for w in keywords:
  606. t = re.sub(
  607. r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"
  608. % re.escape(w),
  609. r"\1<em>\2</em>\3",
  610. t,
  611. flags=re.IGNORECASE | re.MULTILINE,
  612. )
  613. if not re.search(
  614. r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE
  615. ):
  616. continue
  617. txts.append(t)
  618. ans[id] = "...".join(txts)
  619. return ans
  620. def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
  621. """
  622. TODO: Infinity doesn't provide aggregation
  623. """
  624. return list()
  625. """
  626. SQL
  627. """
  def sql(self, sql: str, fetch_size: int, format: str):
      """Reject SQL queries; this backend does not implement them.

      Declared with ``self`` so that calling it on an instance raises
      ``NotImplementedError`` rather than a ``TypeError`` about the argument count.

      Raises:
          NotImplementedError: Always.
      """
      raise NotImplementedError("Not implemented")