### What problem does this PR solve?

Added Infinity rank_feature support.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 import json
-import os
 from flask import request
 from flask_login import login_required, current_user
@@ -106,13 +105,6 @@ def update():
             return get_data_error_result(
                 message="Can't find this knowledgebase!")
-        if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity":
-            return get_json_result(
-                data=False,
-                message='The chunking method Tag has not been supported by Infinity yet.',
-                code=settings.RetCode.OPERATING_ERROR
-            )
        if req["name"].lower() != kb.name.lower() \
                and len(
            KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1:
@@ -124,9 +116,6 @@ def update():
             return get_data_error_result()
         if kb.pagerank != req.get("pagerank", 0):
-            if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch":
-                return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch")
             if req.get("pagerank", 0) > 0:
                 settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]},
                                              search.index_name(kb.tenant_id), kb.id)
```
```diff
@@ -30,7 +30,7 @@
     "knowledge_graph_kwd": {"type": "varchar", "default": ""},
     "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "pagerank_fea": {"type": "integer", "default": 0},
-    "tag_feas": {"type": "varchar", "default": ""},
+    "tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},
     "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
```
```diff
@@ -77,7 +77,7 @@ services:
     container_name: ragflow-infinity
     profiles:
       - infinity
-    image: infiniflow/infinity:v0.6.0-dev4
+    image: infiniflow/infinity:v0.6.0-dev5
     volumes:
       - infinity_data:/var/infinity
       - ./infinity_conf.toml:/infinity_conf.toml
```
```diff
@@ -17,7 +17,7 @@ log_file_max_size = "100MB"
 log_file_rotate_count = 10
 # trace/debug/info/warning/error/critical 6 log levels, default: info
-log_level = "info"
+log_level = "trace"
 [storage]
 persistence_dir = "/var/infinity/persistence"
@@ -47,7 +47,7 @@ mem_index_capacity = 65536
 buffer_manager_size = "8GB"
 lru_num = 7
 temp_dir = "/var/infinity/tmp"
-result_cache = "on"
+result_cache = "off"
 memindex_memory_quota = "1GB"
 [wal]
```
```diff
@@ -113,7 +113,7 @@ ragflow:
 infinity:
   image:
     repository: infiniflow/infinity
-    tag: v0.6.0-dev4
+    tag: v0.6.0-dev5
   storage:
     className:
     capacity: 5Gi
```
```diff
@@ -274,4 +274,4 @@ class FulltextQueryer:
             keywords.append(f"{tk}^{w}")
         return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                             {"minimum_should_match": min(3, len(keywords) / 10)})
+                             {"minimum_should_match": min(3, len(keywords) // 10)})
```
```diff
@@ -111,7 +111,7 @@ class Dealer:
             q_vec = matchDense.embedding_data
             src.append(f"q_{len(q_vec)}_vec")
-            fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
+            fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
             matchExprs = [matchText, matchDense, fusionExpr]
             res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
```
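For reference, `weighted_sum` fusion blends the full-text and dense scores using these weights. The space was removed from the `weights` string presumably because Infinity parses it as a comma-separated list without trimming the tokens; that parsing behavior is an assumption, not something this diff states. A sketch of the blend, with the score normalization also assumed:

```python
def weighted_sum(text_score: float, dense_score: float,
                 weights: str = "0.05,0.95") -> float:
    # Parse the comma-separated weights exactly as passed to FusionExpr.
    w_text, w_dense = (float(w) for w in weights.split(","))
    # Assumed combination: scores are pre-normalized to comparable ranges.
    return w_text * text_score + w_dense * dense_score

print(weighted_sum(0.8, 0.6))  # 0.05*0.8 + 0.95*0.6 = 0.61
```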
```diff
@@ -26,7 +26,7 @@ from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
 from infinity.errors import ErrorCode
 from rag import settings
-from rag.settings import PAGERANK_FLD
+from rag.settings import PAGERANK_FLD, TAG_FLD
 from rag.utils import singleton
 import pandas as pd
 from api.utils.file_utils import get_project_base_directory
@@ -311,7 +311,7 @@ class InfinityConnection(DocStoreConnection):
         df_list = list()
         table_list = list()
         output = selectFields.copy()
-        for essential_field in ["id"]:
+        for essential_field in ["id"] + aggFields:
             if essential_field not in output:
                 output.append(essential_field)
         score_func = ""
```
```diff
@@ -333,15 +333,29 @@ class InfinityConnection(DocStoreConnection):
         if PAGERANK_FLD not in output:
             output.append(PAGERANK_FLD)
         output = [f for f in output if f != "_score"]
         if limit <= 0:
             # ElasticSearch default limit is 10000
             limit = 10000
         # Prepare expressions common to all tables
         filter_cond = None
         filter_fulltext = ""
         if condition:
+            table_found = False
             for indexName in indexNames:
-                table_name = f"{indexName}_{knowledgebaseIds[0]}"
-                filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
-                break
+                for kb_id in knowledgebaseIds:
+                    table_name = f"{indexName}_{kb_id}"
+                    try:
+                        filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
+                        table_found = True
+                        break
+                    except Exception:
+                        pass
+                if table_found:
+                    break
+            if not table_found:
+                logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
+                return pd.DataFrame(), 0
         for matchExpr in matchExprs:
             if isinstance(matchExpr, MatchTextExpr):
```
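The loop above replaces a blind lookup against `knowledgebaseIds[0]` with a probe over every `{indexName}_{kb_id}` table, so a missing table for the first knowledge base no longer fails the whole search. The same probe-until-found pattern in isolation, with `db.get_table` standing in for Infinity's table lookup (which raises when the table does not exist):

```python
def first_existing_table(db, index_names, kb_ids):
    # Try every index/kb combination; Infinity raises if a table is absent.
    for index_name in index_names:
        for kb_id in kb_ids:
            try:
                return db.get_table(f"{index_name}_{kb_id}")
            except Exception:
                continue  # probe the next candidate table
    return None  # the diff returns an empty DataFrame in this case
```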
```diff
@@ -355,6 +369,18 @@ class InfinityConnection(DocStoreConnection):
                 if isinstance(minimum_should_match, float):
                     str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                     matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
+                # Add rank_feature support
+                if rank_feature and "rank_features" not in matchExpr.extra_options:
+                    # Convert rank_feature dict to Infinity's rank_features string format
+                    # Format: "field^feature_name^weight,field^feature_name^weight"
+                    rank_features_list = []
+                    for feature_name, weight in rank_feature.items():
+                        # Use TAG_FLD as the field containing rank features
+                        rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
+                    if rank_features_list:
+                        matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)
                 for k, v in matchExpr.extra_options.items():
                     if not isinstance(v, str):
                         matchExpr.extra_options[k] = str(v)
@@ -416,7 +442,7 @@ class InfinityConnection(DocStoreConnection):
                     matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                 )
             else:
-                if len(filter_cond) > 0:
+                if filter_cond and len(filter_cond) > 0:
                     builder.filter(filter_cond)
                 if orderBy.fields:
                     builder.sort(order_by_expr_list)
```
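Putting the new block together: a caller-supplied `rank_feature` dict becomes a single `rank_features` string on the match expression's extra options. A standalone sketch of that conversion, assuming `TAG_FLD` resolves to the `tag_feas` column from the mapping change above:

```python
TAG_FLD = "tag_feas"  # assumed value of rag.settings.TAG_FLD

def to_rank_features(rank_feature: dict) -> str:
    # "field^feature_name^weight" entries joined by commas,
    # matching the format documented in the diff above.
    return ",".join(f"{TAG_FLD}^{name}^{weight}"
                    for name, weight in rank_feature.items())

print(to_rank_features({"finance": 10, "sports": 5}))
# tag_feas^finance^10,tag_feas^sports^5
```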
```diff
@@ -662,6 +688,8 @@ class InfinityConnection(DocStoreConnection):
             k = column.lower()
             if field_keyword(k):
                 res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
+            elif re.search(r"_feas$", k):
+                res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
             elif k == "position_int":
                 def to_position_int(v):
                     if v:
```
```diff
@@ -712,9 +740,46 @@ class InfinityConnection(DocStoreConnection):
     def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
         """
-        TODO: Infinity doesn't provide aggregation
+        Manual aggregation for tag fields since Infinity doesn't provide native aggregation
         """
-        return list()
+        from collections import Counter
+        # Extract DataFrame from result
+        if isinstance(res, tuple):
+            df, _ = res
+        else:
+            df = res
+        if df.empty or fieldnm not in df.columns:
+            return []
+        # Aggregate tag counts
+        tag_counter = Counter()
+        for value in df[fieldnm]:
+            if pd.isna(value) or not value:
+                continue
+            # Handle different tag formats
+            if isinstance(value, str):
+                # Split by ### for tag_kwd field or comma for other formats
+                if fieldnm == "tag_kwd" and "###" in value:
+                    tags = [tag.strip() for tag in value.split("###") if tag.strip()]
+                else:
+                    # Try comma separation as fallback
+                    tags = [tag.strip() for tag in value.split(",") if tag.strip()]
+                for tag in tags:
+                    if tag:  # Only count non-empty tags
+                        tag_counter[tag] += 1
+            elif isinstance(value, list):
+                # Handle list format
+                for tag in value:
+                    if tag and isinstance(tag, str):
+                        tag_counter[tag.strip()] += 1
+        # Return as list of [tag, count] pairs, sorted by count descending
+        return [[tag, count] for tag, count in tag_counter.most_common()]
     """
     SQL
```
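A hedged usage sketch of the new fallback: given a search result DataFrame, the method tallies tags client-side and returns `[tag, count]` pairs sorted by count descending. The column contents here are illustrative:

```python
import pandas as pd

df = pd.DataFrame({"tag_kwd": ["ai###nlp", "ai", "", "ai###search"]})

# conn = InfinityConnection(...)  # as constructed elsewhere in RAGFlow
# conn.getAggregation((df, len(df)), "tag_kwd")
# -> [["ai", 3], ["nlp", 1], ["search", 1]]
```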
```diff
@@ -68,8 +68,8 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
+            pytest.param({"page_size": 0}, 100, 0, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
```
```diff
@@ -69,8 +69,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
```
```diff
@@ -50,8 +50,7 @@ class TestChunksList:
         "params, expected_page_size, expected_message",
         [
             ({"page_size": None}, 5, ""),
-            pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 5, ""),
             ({"page_size": 1}, 1, ""),
             ({"page_size": 6}, 5, ""),
             ({"page_size": "1"}, 1, ""),
```
```diff
@@ -68,8 +68,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""),
-            pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"size": 0}, 0, 5, ""),
             ({"size": 1}, 0, 1, ""),
             ({"size": 6}, 0, 5, ""),
             ({"size": "1"}, 0, 1, ""),
```