### What problem does this PR solve?

Adds `rank_feature` support for the Infinity doc engine.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
 # limitations under the License.
 #
 import json
+import os
 from flask import request
 from flask_login import login_required, current_user
@@
             return get_data_error_result(
                 message="Can't find this knowledgebase!")
+        if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity":
+            return get_json_result(
+                data=False,
+                message='The chunking method Tag is not supported by Infinity yet.',
+                code=settings.RetCode.OPERATING_ERROR
+            )
         if req["name"].lower() != kb.name.lower() \
                 and len(
             KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1:
             return get_data_error_result()
         if kb.pagerank != req.get("pagerank", 0):
+            if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch":
+                return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch")
             if req.get("pagerank", 0) > 0:
                 settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]},
                                              search.index_name(kb.tenant_id), kb.id)
```
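For reviewers tracing the two new guards, here is a standalone restatement of the same checks as a pure function. The function name is hypothetical, and the `pagerank` condition is simplified (the PR only rejects a *changed* pagerank value, and only when the doc engine is not Elasticsearch):

```python
import os

def kb_update_error(req: dict, doc_engine: str | None = None) -> str | None:
    """Illustrative sketch of the guards above; returns an error message or None."""
    engine = doc_engine or os.environ.get("DOC_ENGINE", "elasticsearch")
    if req.get("parser_id", "") == "tag" and engine == "infinity":
        return "The chunking method Tag is not supported by Infinity yet."
    if req.get("pagerank", 0) and engine != "elasticsearch":
        return "'pagerank' can only be set when doc_engine is elasticsearch"
    return None

assert kb_update_error({"parser_id": "tag"}, "infinity") is not None
assert kb_update_error({"pagerank": 10}, "infinity") is not None
assert kb_update_error({"pagerank": 10}, "elasticsearch") is None
```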
| "knowledge_graph_kwd": {"type": "varchar", "default": ""}, | "knowledge_graph_kwd": {"type": "varchar", "default": ""}, | ||||
| "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "pagerank_fea": {"type": "integer", "default": 0}, | "pagerank_fea": {"type": "integer", "default": 0}, | ||||
| "tag_feas": {"type": "varchar", "default": ""}, | |||||
| "tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"}, | |||||
| "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | ||||
| "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, | "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, |
```diff
   container_name: ragflow-infinity
   profiles:
     - infinity
-  image: infiniflow/infinity:v0.6.0-dev4
+  image: infiniflow/infinity:v0.6.0-dev5
   volumes:
     - infinity_data:/var/infinity
     - ./infinity_conf.toml:/infinity_conf.toml
```
```diff
 log_file_rotate_count = 10
 # trace/debug/info/warning/error/critical 6 log levels, default: info
-log_level = "info"
+log_level = "trace"
@@
 [storage]
 persistence_dir = "/var/infinity/persistence"
 buffer_manager_size = "8GB"
 lru_num = 7
 temp_dir = "/var/infinity/tmp"
-result_cache = "on"
+result_cache = "off"
 memindex_memory_quota = "1GB"
@@
 [wal]
```
```diff
 infinity:
   image:
     repository: infiniflow/infinity
-    tag: v0.6.0-dev4
+    tag: v0.6.0-dev5
   storage:
     className:
     capacity: 5Gi
```
```diff
             keywords.append(f"{tk}^{w}")
         return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                             {"minimum_should_match": min(3, len(keywords) / 10)})
+                             {"minimum_should_match": min(3, len(keywords) // 10)})
```
```diff
             q_vec = matchDense.embedding_data
             src.append(f"q_{len(q_vec)}_vec")
-            fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
+            fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
             matchExprs = [matchText, matchDense, fusionExpr]
             res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
```
```diff
 from infinity.connection_pool import ConnectionPool
 from infinity.errors import ErrorCode
 from rag import settings
-from rag.settings import PAGERANK_FLD
+from rag.settings import PAGERANK_FLD, TAG_FLD
 from rag.utils import singleton
 import pandas as pd
 from api.utils.file_utils import get_project_base_directory
@@
         df_list = list()
         table_list = list()
         output = selectFields.copy()
-        for essential_field in ["id"]:
+        for essential_field in ["id"] + aggFields:
             if essential_field not in output:
                 output.append(essential_field)
         score_func = ""
         if PAGERANK_FLD not in output:
             output.append(PAGERANK_FLD)
         output = [f for f in output if f != "_score"]
+        if limit <= 0:
+            # ElasticSearch default limit is 10000
+            limit = 10000
```
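Previously a non-positive limit reached Infinity unchanged and the request failed (see the `page_size=0` test cases at the end of this diff, which used to expect error `3013` on Infinity). The fallback mirrors Elasticsearch, whose default `index.max_result_window` caps results at 10,000 hits:

```python
def effective_limit(limit: int) -> int:
    # Sketch of the new behaviour: non-positive limits fall back to
    # Elasticsearch's default result window instead of erroring on Infinity.
    return limit if limit > 0 else 10000

assert effective_limit(0) == 10000
assert effective_limit(7) == 7
```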
```diff
         # Prepare expressions common to all tables
         filter_cond = None
         filter_fulltext = ""
         if condition:
+            table_found = False
             for indexName in indexNames:
-                table_name = f"{indexName}_{knowledgebaseIds[0]}"
-                filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
-                break
+                for kb_id in knowledgebaseIds:
+                    table_name = f"{indexName}_{kb_id}"
+                    try:
+                        filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
+                        table_found = True
+                        break
+                    except Exception:
+                        pass
+                if table_found:
+                    break
+            if not table_found:
+                logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
+                return pd.DataFrame(), 0
@@
         for matchExpr in matchExprs:
             if isinstance(matchExpr, MatchTextExpr):
                 if isinstance(minimum_should_match, float):
                     str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                     matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
+                # Add rank_feature support
+                if rank_feature and "rank_features" not in matchExpr.extra_options:
+                    # Convert the rank_feature dict to Infinity's rank_features string format:
+                    # "field^feature_name^weight,field^feature_name^weight"
+                    rank_features_list = []
+                    for feature_name, weight in rank_feature.items():
+                        # Use TAG_FLD as the field containing rank features
+                        rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
+                    if rank_features_list:
+                        matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)
                 for k, v in matchExpr.extra_options.items():
                     if not isinstance(v, str):
                         matchExpr.extra_options[k] = str(v)
```
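To make the new option concrete: assuming `TAG_FLD` (imported from `rag.settings`) names the `tag_feas` column from the mapping change above, a `rank_feature` dict is flattened into the comma-joined `field^feature^weight` string described in the comment:

```python
TAG_FLD = "tag_feas"  # assumption: the constant imported from rag.settings

rank_feature = {"finance": 5, "sports": 1}
rank_features = ",".join(
    f"{TAG_FLD}^{name}^{weight}" for name, weight in rank_feature.items()
)
print(rank_features)  # tag_feas^finance^5,tag_feas^sports^1
```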
```diff
                     matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                 )
             else:
-                if len(filter_cond) > 0:
+                if filter_cond and len(filter_cond) > 0:
                     builder.filter(filter_cond)
             if orderBy.fields:
                 builder.sort(order_by_expr_list)
@@
             k = column.lower()
             if field_keyword(k):
                 res2[column] = res2[column].apply(lambda v: [kwd for kwd in v.split("###") if kwd])
+            elif re.search(r"_feas$", k):
+                res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
             elif k == "position_int":
                 def to_position_int(v):
                     if v:
```
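The new `*_feas` branch decodes the JSON stored in those columns back into dicts. A toy run:

```python
import json
import pandas as pd

res2 = pd.DataFrame({"tag_feas": ['{"finance": 5}', "", '{"sports": 1, "finance": 2}']})
# Mirrors the new branch: JSON strings become dicts, empty cells become {}.
res2["tag_feas"] = res2["tag_feas"].apply(lambda v: json.loads(v) if v else {})
print(res2["tag_feas"].tolist())
# [{'finance': 5}, {}, {'sports': 1, 'finance': 2}]
```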
```diff
     def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
         """
-        TODO: Infinity doesn't provide aggregation
+        Manual aggregation for tag fields, since Infinity doesn't provide native aggregation.
         """
-        return list()
+        from collections import Counter
+        # Extract the DataFrame from the (DataFrame, total) tuple if needed
+        if isinstance(res, tuple):
+            df, _ = res
+        else:
+            df = res
+        if df.empty or fieldnm not in df.columns:
+            return []
+        # Aggregate tag counts
+        tag_counter = Counter()
+        for value in df[fieldnm]:
+            # Skip None/NaN and empty values (a bare pd.isna would be ambiguous on lists)
+            if value is None or (isinstance(value, float) and pd.isna(value)) or not value:
+                continue
+            if isinstance(value, str):
+                # Split on "###" for the tag_kwd field; fall back to commas otherwise
+                if fieldnm == "tag_kwd" and "###" in value:
+                    tags = [tag.strip() for tag in value.split("###") if tag.strip()]
+                else:
+                    tags = [tag.strip() for tag in value.split(",") if tag.strip()]
+                for tag in tags:
+                    tag_counter[tag] += 1
+            elif isinstance(value, list):
+                # The column may already have been split into a list of keywords
+                for tag in value:
+                    if tag and isinstance(tag, str):
+                        tag_counter[tag.strip()] += 1
+        # Return [tag, count] pairs, sorted by count descending
+        return [[tag, count] for tag, count in tag_counter.most_common()]

     """
     SQL
```
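A standalone trace of the manual aggregation on toy data (the real method also accepts the `(DataFrame, total)` tuple that `search` returns; `tag_kwd` values use the backend's `###` separator):

```python
import pandas as pd
from collections import Counter

# Toy tag_kwd column in the "###"-separated format used above.
df = pd.DataFrame({"tag_kwd": ["ai###rag", "rag", "", "ai###ai"]})

tag_counter = Counter()
for value in df["tag_kwd"]:
    if not value:
        continue
    for tag in value.split("###"):
        if tag.strip():
            tag_counter[tag.strip()] += 1

print([[t, c] for t, c in tag_counter.most_common()])  # [['ai', 3], ['rag', 2]]
```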
| "params, expected_code, expected_page_size, expected_message", | "params, expected_code, expected_page_size, expected_message", | ||||
| [ | [ | ||||
| ({"page_size": None}, 0, 5, ""), | ({"page_size": None}, 0, 5, ""), | ||||
| pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 0, 5, ""), | |||||
| pytest.param({"page_size": 0}, 100, 0, ""), | |||||
| ({"page_size": 1}, 0, 1, ""), | ({"page_size": 1}, 0, 1, ""), | ||||
| ({"page_size": 6}, 0, 5, ""), | ({"page_size": 6}, 0, 5, ""), | ||||
| ({"page_size": "1"}, 0, 1, ""), | ({"page_size": "1"}, 0, 1, ""), |
| "params, expected_code, expected_page_size, expected_message", | "params, expected_code, expected_page_size, expected_message", | ||||
| [ | [ | ||||
| ({"page_size": None}, 0, 5, ""), | ({"page_size": None}, 0, 5, ""), | ||||
| pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 0, 5, ""), | |||||
| ({"page_size": 1}, 0, 1, ""), | ({"page_size": 1}, 0, 1, ""), | ||||
| ({"page_size": 6}, 0, 5, ""), | ({"page_size": 6}, 0, 5, ""), | ||||
| ({"page_size": "1"}, 0, 1, ""), | ({"page_size": "1"}, 0, 1, ""), |
| "params, expected_page_size, expected_message", | "params, expected_page_size, expected_message", | ||||
| [ | [ | ||||
| ({"page_size": None}, 5, ""), | ({"page_size": None}, 5, ""), | ||||
| pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"page_size": 0}, 5, ""), | |||||
| ({"page_size": 1}, 1, ""), | ({"page_size": 1}, 1, ""), | ||||
| ({"page_size": 6}, 5, ""), | ({"page_size": 6}, 5, ""), | ||||
| ({"page_size": "1"}, 1, ""), | ({"page_size": "1"}, 1, ""), |
| "params, expected_code, expected_page_size, expected_message", | "params, expected_code, expected_page_size, expected_message", | ||||
| [ | [ | ||||
| ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""), | ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""), | ||||
| pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), | |||||
| pytest.param({"size": 0}, 0, 5, ""), | |||||
| ({"size": 1}, 0, 1, ""), | ({"size": 1}, 0, 1, ""), | ||||
| ({"size": 6}, 0, 5, ""), | ({"size": 6}, 0, 5, ""), | ||||
| ({"size": "1"}, 0, 1, ""), | ({"size": "1"}, 0, 1, ""), |