You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

infinity_conn.py 25KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import re
  19. import json
  20. import time
  21. import copy
  22. import infinity
  23. from infinity.common import ConflictType, InfinityException, SortType
  24. from infinity.index import IndexInfo, IndexType
  25. from infinity.connection_pool import ConnectionPool
  26. from infinity.errors import ErrorCode
  27. from rag import settings
  28. from rag.settings import PAGERANK_FLD
  29. from rag.utils import singleton
  30. import polars as pl
  31. from polars.series.series import Series
  32. from api.utils.file_utils import get_project_base_directory
  33. from rag.utils.doc_store_conn import (
  34. DocStoreConnection,
  35. MatchExpr,
  36. MatchTextExpr,
  37. MatchDenseExpr,
  38. FusionExpr,
  39. OrderByExpr,
  40. )
  41. logger = logging.getLogger('ragflow.infinity_conn')
  42. def equivalent_condition_to_str(condition: dict) -> str | None:
  43. assert "_id" not in condition
  44. cond = list()
  45. for k, v in condition.items():
  46. if not isinstance(k, str) or k in ["kb_id"] or not v:
  47. continue
  48. if isinstance(v, list):
  49. inCond = list()
  50. for item in v:
  51. if isinstance(item, str):
  52. inCond.append(f"'{item}'")
  53. else:
  54. inCond.append(str(item))
  55. if inCond:
  56. strInCond = ", ".join(inCond)
  57. strInCond = f"{k} IN ({strInCond})"
  58. cond.append(strInCond)
  59. elif isinstance(v, str):
  60. cond.append(f"{k}='{v}'")
  61. else:
  62. cond.append(f"{k}={str(v)}")
  63. return " AND ".join(cond) if cond else "1=1"
  64. def concat_dataframes(df_list: list[pl.DataFrame], selectFields: list[str]) -> pl.DataFrame:
  65. """
  66. Concatenate multiple dataframes into one.
  67. """
  68. df_list = [df for df in df_list if not df.is_empty()]
  69. if df_list:
  70. return pl.concat(df_list)
  71. schema = dict()
  72. for field_name in selectFields:
  73. if field_name == 'score()': # Workaround: fix schema is changed to score()
  74. schema['SCORE'] = str
  75. else:
  76. schema[field_name] = str
  77. return pl.DataFrame(schema=schema)
  78. @singleton
  79. class InfinityConnection(DocStoreConnection):
  80. def __init__(self):
  81. self.dbName = settings.INFINITY.get("db_name", "default_db")
  82. infinity_uri = settings.INFINITY["uri"]
  83. if ":" in infinity_uri:
  84. host, port = infinity_uri.split(":")
  85. infinity_uri = infinity.common.NetworkAddress(host, int(port))
  86. self.connPool = None
  87. logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
  88. for _ in range(24):
  89. try:
  90. connPool = ConnectionPool(infinity_uri)
  91. inf_conn = connPool.get_conn()
  92. res = inf_conn.show_current_node()
  93. if res.error_code == ErrorCode.OK and res.server_status == "started":
  94. self._migrate_db(inf_conn)
  95. self.connPool = connPool
  96. connPool.release_conn(inf_conn)
  97. break
  98. connPool.release_conn(inf_conn)
  99. logger.warn(f"Infinity status: {res.server_status}. Waiting Infinity {infinity_uri} to be healthy.")
  100. time.sleep(5)
  101. except Exception as e:
  102. logger.warning(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
  103. time.sleep(5)
  104. if self.connPool is None:
  105. msg = f"Infinity {infinity_uri} is unhealthy in 120s."
  106. logger.error(msg)
  107. raise Exception(msg)
  108. logger.info(f"Infinity {infinity_uri} is healthy.")
  109. def _migrate_db(self, inf_conn):
  110. inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
  111. fp_mapping = os.path.join(
  112. get_project_base_directory(), "conf", "infinity_mapping.json"
  113. )
  114. if not os.path.exists(fp_mapping):
  115. raise Exception(f"Mapping file not found at {fp_mapping}")
  116. schema = json.load(open(fp_mapping))
  117. table_names = inf_db.list_tables().table_names
  118. for table_name in table_names:
  119. inf_table = inf_db.get_table(table_name)
  120. index_names = inf_table.list_indexes().index_names
  121. if "q_vec_idx" not in index_names:
  122. # Skip tables not created by me
  123. continue
  124. column_names = inf_table.show_columns()["name"]
  125. column_names = set(column_names)
  126. for field_name, field_info in schema.items():
  127. if field_name in column_names:
  128. continue
  129. res = inf_table.add_columns({field_name: field_info})
  130. assert res.error_code == infinity.ErrorCode.OK
  131. logger.info(
  132. f"INFINITY added following column to table {table_name}: {field_name} {field_info}"
  133. )
  134. if field_info["type"] != "varchar" or "analyzer" not in field_info:
  135. continue
  136. inf_table.create_index(
  137. f"text_idx_{field_name}",
  138. IndexInfo(
  139. field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
  140. ),
  141. ConflictType.Ignore,
  142. )
  143. """
  144. Database operations
  145. """
  146. def dbType(self) -> str:
  147. return "infinity"
  148. def health(self) -> dict:
  149. """
  150. Return the health status of the database.
  151. """
  152. inf_conn = self.connPool.get_conn()
  153. res = inf_conn.show_current_node()
  154. self.connPool.release_conn(inf_conn)
  155. res2 = {
  156. "type": "infinity",
  157. "status": "green" if res.error_code == 0 and res.server_status == "started" else "red",
  158. "error": res.error_msg,
  159. }
  160. return res2
  161. """
  162. Table operations
  163. """
  164. def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
  165. table_name = f"{indexName}_{knowledgebaseId}"
  166. inf_conn = self.connPool.get_conn()
  167. inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
  168. fp_mapping = os.path.join(
  169. get_project_base_directory(), "conf", "infinity_mapping.json"
  170. )
  171. if not os.path.exists(fp_mapping):
  172. raise Exception(f"Mapping file not found at {fp_mapping}")
  173. schema = json.load(open(fp_mapping))
  174. vector_name = f"q_{vectorSize}_vec"
  175. schema[vector_name] = {"type": f"vector,{vectorSize},float"}
  176. inf_table = inf_db.create_table(
  177. table_name,
  178. schema,
  179. ConflictType.Ignore,
  180. )
  181. inf_table.create_index(
  182. "q_vec_idx",
  183. IndexInfo(
  184. vector_name,
  185. IndexType.Hnsw,
  186. {
  187. "M": "16",
  188. "ef_construction": "50",
  189. "metric": "cosine",
  190. "encode": "lvq",
  191. },
  192. ),
  193. ConflictType.Ignore,
  194. )
  195. for field_name, field_info in schema.items():
  196. if field_info["type"] != "varchar" or "analyzer" not in field_info:
  197. continue
  198. inf_table.create_index(
  199. f"text_idx_{field_name}",
  200. IndexInfo(
  201. field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
  202. ),
  203. ConflictType.Ignore,
  204. )
  205. self.connPool.release_conn(inf_conn)
  206. logger.info(
  207. f"INFINITY created table {table_name}, vector size {vectorSize}"
  208. )
  209. def deleteIdx(self, indexName: str, knowledgebaseId: str):
  210. table_name = f"{indexName}_{knowledgebaseId}"
  211. inf_conn = self.connPool.get_conn()
  212. db_instance = inf_conn.get_database(self.dbName)
  213. db_instance.drop_table(table_name, ConflictType.Ignore)
  214. self.connPool.release_conn(inf_conn)
  215. logger.info(f"INFINITY dropped table {table_name}")
  216. def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
  217. table_name = f"{indexName}_{knowledgebaseId}"
  218. try:
  219. inf_conn = self.connPool.get_conn()
  220. db_instance = inf_conn.get_database(self.dbName)
  221. _ = db_instance.get_table(table_name)
  222. self.connPool.release_conn(inf_conn)
  223. return True
  224. except Exception as e:
  225. logger.warning(f"INFINITY indexExist {str(e)}")
  226. return False
  227. """
  228. CRUD operations
  229. """
  230. def search(
  231. self, selectFields: list[str],
  232. highlightFields: list[str],
  233. condition: dict,
  234. matchExprs: list[MatchExpr],
  235. orderBy: OrderByExpr,
  236. offset: int,
  237. limit: int,
  238. indexNames: str | list[str],
  239. knowledgebaseIds: list[str],
  240. aggFields: list[str] = [],
  241. rank_feature: dict | None = None
  242. ) -> list[dict] | pl.DataFrame:
  243. """
  244. TODO: Infinity doesn't provide highlight
  245. """
  246. if isinstance(indexNames, str):
  247. indexNames = indexNames.split(",")
  248. assert isinstance(indexNames, list) and len(indexNames) > 0
  249. inf_conn = self.connPool.get_conn()
  250. db_instance = inf_conn.get_database(self.dbName)
  251. df_list = list()
  252. table_list = list()
  253. for essential_field in ["id"]:
  254. if essential_field not in selectFields:
  255. selectFields.append(essential_field)
  256. score_func = ""
  257. score_column = ""
  258. for matchExpr in matchExprs:
  259. if isinstance(matchExpr, MatchTextExpr):
  260. score_func = "score()"
  261. score_column = "SCORE"
  262. break
  263. if not score_func:
  264. for matchExpr in matchExprs:
  265. if isinstance(matchExpr, MatchDenseExpr):
  266. score_func = "similarity()"
  267. score_column = "SIMILARITY"
  268. break
  269. if matchExprs:
  270. selectFields.append(score_func)
  271. selectFields.append(PAGERANK_FLD)
  272. # Prepare expressions common to all tables
  273. filter_cond = None
  274. filter_fulltext = ""
  275. if condition:
  276. filter_cond = equivalent_condition_to_str(condition)
  277. for matchExpr in matchExprs:
  278. if isinstance(matchExpr, MatchTextExpr):
  279. if filter_cond and "filter" not in matchExpr.extra_options:
  280. matchExpr.extra_options.update({"filter": filter_cond})
  281. fields = ",".join(matchExpr.fields)
  282. filter_fulltext = f"filter_fulltext('{fields}', '{matchExpr.matching_text}')"
  283. if filter_cond:
  284. filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
  285. minimum_should_match = matchExpr.extra_options.get("minimum_should_match", 0.0)
  286. if isinstance(minimum_should_match, float):
  287. str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
  288. matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
  289. for k, v in matchExpr.extra_options.items():
  290. if not isinstance(v, str):
  291. matchExpr.extra_options[k] = str(v)
  292. logger.debug(f"INFINITY search MatchTextExpr: {json.dumps(matchExpr.__dict__)}")
  293. elif isinstance(matchExpr, MatchDenseExpr):
  294. if filter_fulltext and filter_cond and "filter" not in matchExpr.extra_options:
  295. matchExpr.extra_options.update({"filter": filter_fulltext})
  296. for k, v in matchExpr.extra_options.items():
  297. if not isinstance(v, str):
  298. matchExpr.extra_options[k] = str(v)
  299. logger.debug(f"INFINITY search MatchDenseExpr: {json.dumps(matchExpr.__dict__)}")
  300. elif isinstance(matchExpr, FusionExpr):
  301. logger.debug(f"INFINITY search FusionExpr: {json.dumps(matchExpr.__dict__)}")
  302. order_by_expr_list = list()
  303. if orderBy.fields:
  304. for order_field in orderBy.fields:
  305. if order_field[1] == 0:
  306. order_by_expr_list.append((order_field[0], SortType.Asc))
  307. else:
  308. order_by_expr_list.append((order_field[0], SortType.Desc))
  309. total_hits_count = 0
  310. # Scatter search tables and gather the results
  311. for indexName in indexNames:
  312. for knowledgebaseId in knowledgebaseIds:
  313. table_name = f"{indexName}_{knowledgebaseId}"
  314. try:
  315. table_instance = db_instance.get_table(table_name)
  316. except Exception:
  317. continue
  318. table_list.append(table_name)
  319. builder = table_instance.output(selectFields)
  320. if len(matchExprs) > 0:
  321. for matchExpr in matchExprs:
  322. if isinstance(matchExpr, MatchTextExpr):
  323. fields = ",".join(matchExpr.fields)
  324. builder = builder.match_text(
  325. fields,
  326. matchExpr.matching_text,
  327. matchExpr.topn,
  328. matchExpr.extra_options,
  329. )
  330. elif isinstance(matchExpr, MatchDenseExpr):
  331. builder = builder.match_dense(
  332. matchExpr.vector_column_name,
  333. matchExpr.embedding_data,
  334. matchExpr.embedding_data_type,
  335. matchExpr.distance_type,
  336. matchExpr.topn,
  337. matchExpr.extra_options,
  338. )
  339. elif isinstance(matchExpr, FusionExpr):
  340. builder = builder.fusion(
  341. matchExpr.method, matchExpr.topn, matchExpr.fusion_params
  342. )
  343. else:
  344. if len(filter_cond) > 0:
  345. builder.filter(filter_cond)
  346. if orderBy.fields:
  347. builder.sort(order_by_expr_list)
  348. builder.offset(offset).limit(limit)
  349. kb_res, extra_result = builder.option({"total_hits_count": True}).to_pl()
  350. if extra_result:
  351. total_hits_count += int(extra_result["total_hits_count"])
  352. logger.debug(f"INFINITY search table: {str(table_name)}, result: {str(kb_res)}")
  353. df_list.append(kb_res)
  354. self.connPool.release_conn(inf_conn)
  355. res = concat_dataframes(df_list, selectFields)
  356. if matchExprs:
  357. res = res.sort(pl.col(score_column) + pl.col(PAGERANK_FLD), descending=True, maintain_order=True)
  358. if score_column and score_column != "SCORE":
  359. res = res.rename({score_column: "SCORE"})
  360. res = res.limit(limit)
  361. logger.debug(f"INFINITY search final result: {str(res)}")
  362. return res, total_hits_count
  363. def get(
  364. self, chunkId: str, indexName: str, knowledgebaseIds: list[str]
  365. ) -> dict | None:
  366. inf_conn = self.connPool.get_conn()
  367. db_instance = inf_conn.get_database(self.dbName)
  368. df_list = list()
  369. assert isinstance(knowledgebaseIds, list)
  370. table_list = list()
  371. for knowledgebaseId in knowledgebaseIds:
  372. table_name = f"{indexName}_{knowledgebaseId}"
  373. table_list.append(table_name)
  374. table_instance = None
  375. try:
  376. table_instance = db_instance.get_table(table_name)
  377. except Exception:
  378. logger.warning(
  379. f"Table not found: {table_name}, this knowledge base isn't created in Infinity. Maybe it is created in other document engine.")
  380. continue
  381. kb_res, _ = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_pl()
  382. logger.debug(f"INFINITY get table: {str(table_list)}, result: {str(kb_res)}")
  383. df_list.append(kb_res)
  384. self.connPool.release_conn(inf_conn)
  385. res = concat_dataframes(df_list, ["id"])
  386. res_fields = self.getFields(res, res.columns)
  387. return res_fields.get(chunkId, None)
  388. def insert(
  389. self, documents: list[dict], indexName: str, knowledgebaseId: str = None
  390. ) -> list[str]:
  391. inf_conn = self.connPool.get_conn()
  392. db_instance = inf_conn.get_database(self.dbName)
  393. table_name = f"{indexName}_{knowledgebaseId}"
  394. try:
  395. table_instance = db_instance.get_table(table_name)
  396. except InfinityException as e:
  397. # src/common/status.cppm, kTableNotExist = 3022
  398. if e.error_code != ErrorCode.TABLE_NOT_EXIST:
  399. raise
  400. vector_size = 0
  401. patt = re.compile(r"q_(?P<vector_size>\d+)_vec")
  402. for k in documents[0].keys():
  403. m = patt.match(k)
  404. if m:
  405. vector_size = int(m.group("vector_size"))
  406. break
  407. if vector_size == 0:
  408. raise ValueError("Cannot infer vector size from documents")
  409. self.createIdx(indexName, knowledgebaseId, vector_size)
  410. table_instance = db_instance.get_table(table_name)
  411. docs = copy.deepcopy(documents)
  412. for d in docs:
  413. assert "_id" not in d
  414. assert "id" in d
  415. for k, v in d.items():
  416. if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
  417. assert isinstance(v, list)
  418. d[k] = "###".join(v)
  419. elif re.search(r"_feas$", k):
  420. d[k] = json.dumps(v)
  421. elif k == 'kb_id':
  422. if isinstance(d[k], list):
  423. d[k] = d[k][0] # since d[k] is a list, but we need a str
  424. elif k == "position_int":
  425. assert isinstance(v, list)
  426. arr = [num for row in v for num in row]
  427. d[k] = "_".join(f"{num:08x}" for num in arr)
  428. elif k in ["page_num_int", "top_int"]:
  429. assert isinstance(v, list)
  430. d[k] = "_".join(f"{num:08x}" for num in v)
  431. ids = ["'{}'".format(d["id"]) for d in docs]
  432. str_ids = ", ".join(ids)
  433. str_filter = f"id IN ({str_ids})"
  434. table_instance.delete(str_filter)
  435. # for doc in documents:
  436. # logger.info(f"insert position_int: {doc['position_int']}")
  437. # logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
  438. table_instance.insert(docs)
  439. self.connPool.release_conn(inf_conn)
  440. logger.debug(f"INFINITY inserted into {table_name} {str_ids}.")
  441. return []
  442. def update(
  443. self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
  444. ) -> bool:
  445. # if 'position_int' in newValue:
  446. # logger.info(f"update position_int: {newValue['position_int']}")
  447. inf_conn = self.connPool.get_conn()
  448. db_instance = inf_conn.get_database(self.dbName)
  449. table_name = f"{indexName}_{knowledgebaseId}"
  450. table_instance = db_instance.get_table(table_name)
  451. if "exist" in condition:
  452. del condition["exist"]
  453. filter = equivalent_condition_to_str(condition)
  454. for k, v in list(newValue.items()):
  455. if k in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
  456. assert isinstance(v, list)
  457. newValue[k] = "###".join(v)
  458. elif re.search(r"_feas$", k):
  459. newValue[k] = json.dumps(v)
  460. elif k.endswith("_kwd") and isinstance(v, list):
  461. newValue[k] = " ".join(v)
  462. elif k == 'kb_id':
  463. if isinstance(newValue[k], list):
  464. newValue[k] = newValue[k][0] # since d[k] is a list, but we need a str
  465. elif k == "position_int":
  466. assert isinstance(v, list)
  467. arr = [num for row in v for num in row]
  468. newValue[k] = "_".join(f"{num:08x}" for num in arr)
  469. elif k in ["page_num_int", "top_int"]:
  470. assert isinstance(v, list)
  471. newValue[k] = "_".join(f"{num:08x}" for num in v)
  472. elif k == "remove" and v in [PAGERANK_FLD]:
  473. del newValue[k]
  474. newValue[v] = 0
  475. logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
  476. table_instance.update(filter, newValue)
  477. self.connPool.release_conn(inf_conn)
  478. return True
  479. def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
  480. inf_conn = self.connPool.get_conn()
  481. db_instance = inf_conn.get_database(self.dbName)
  482. table_name = f"{indexName}_{knowledgebaseId}"
  483. filter = equivalent_condition_to_str(condition)
  484. try:
  485. table_instance = db_instance.get_table(table_name)
  486. except Exception:
  487. logger.warning(
  488. f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
  489. )
  490. return 0
  491. logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
  492. res = table_instance.delete(filter)
  493. self.connPool.release_conn(inf_conn)
  494. return res.deleted_rows
  495. """
  496. Helper functions for search result
  497. """
  498. def getTotal(self, res: tuple[pl.DataFrame, int] | pl.DataFrame) -> int:
  499. if isinstance(res, tuple):
  500. return res[1]
  501. return len(res)
  502. def getChunkIds(self, res: tuple[pl.DataFrame, int] | pl.DataFrame) -> list[str]:
  503. if isinstance(res, tuple):
  504. res = res[0]
  505. return list(res["id"])
  506. def getFields(self, res: tuple[pl.DataFrame, int] | pl.DataFrame, fields: list[str]) -> list[str, dict]:
  507. if isinstance(res, tuple):
  508. res = res[0]
  509. res_fields = {}
  510. if not fields:
  511. return {}
  512. num_rows = len(res)
  513. column_id = res["id"]
  514. for i in range(num_rows):
  515. id = column_id[i]
  516. m = {"id": id}
  517. for fieldnm in fields:
  518. if fieldnm not in res:
  519. m[fieldnm] = None
  520. continue
  521. v = res[fieldnm][i]
  522. if isinstance(v, Series):
  523. v = list(v)
  524. elif fieldnm in ["important_kwd", "question_kwd", "entities_kwd", "tag_kwd"]:
  525. assert isinstance(v, str)
  526. v = [kwd for kwd in v.split("###") if kwd]
  527. elif fieldnm == "position_int":
  528. assert isinstance(v, str)
  529. if v:
  530. arr = [int(hex_val, 16) for hex_val in v.split('_')]
  531. v = [arr[i:i + 5] for i in range(0, len(arr), 5)]
  532. else:
  533. v = []
  534. elif fieldnm in ["page_num_int", "top_int"]:
  535. assert isinstance(v, str)
  536. if v:
  537. v = [int(hex_val, 16) for hex_val in v.split('_')]
  538. else:
  539. v = []
  540. else:
  541. if not isinstance(v, str):
  542. v = str(v)
  543. # if fieldnm.endswith("_tks"):
  544. # v = rmSpace(v)
  545. m[fieldnm] = v
  546. res_fields[id] = m
  547. return res_fields
  548. def getHighlight(self, res: tuple[pl.DataFrame, int] | pl.DataFrame, keywords: list[str], fieldnm: str):
  549. if isinstance(res, tuple):
  550. res = res[0]
  551. ans = {}
  552. num_rows = len(res)
  553. column_id = res["id"]
  554. for i in range(num_rows):
  555. id = column_id[i]
  556. txt = res[fieldnm][i]
  557. txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
  558. txts = []
  559. for t in re.split(r"[.?!;\n]", txt):
  560. for w in keywords:
  561. t = re.sub(
  562. r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"
  563. % re.escape(w),
  564. r"\1<em>\2</em>\3",
  565. t,
  566. flags=re.IGNORECASE | re.MULTILINE,
  567. )
  568. if not re.search(
  569. r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE
  570. ):
  571. continue
  572. txts.append(t)
  573. ans[id] = "...".join(txts)
  574. return ans
  575. def getAggregation(self, res: tuple[pl.DataFrame, int] | pl.DataFrame, fieldnm: str):
  576. """
  577. TODO: Infinity doesn't provide aggregation
  578. """
  579. return list()
  580. """
  581. SQL
  582. """
  583. def sql(sql: str, fetch_size: int, format: str):
  584. raise NotImplementedError("Not implemented")