
Added infinity rank_feature support (#9044)

### What problem does this PR solve?

Added infinity rank_feature support

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Tag: v0.20.0
Author: Zhichang Yu · 3 months ago
Parent commit: 342a04ec8a

api/apps/kb_app.py (+11 -0)

  # limitations under the License.
  #
  import json
+ import os

  from flask import request
  from flask_login import login_required, current_user

          return get_data_error_result(
              message="Can't find this knowledgebase!")

+         if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity":
+             return get_json_result(
+                 data=False,
+                 message='The chunking method Tag has not been supported by Infinity yet.',
+                 code=settings.RetCode.OPERATING_ERROR
+             )
+
          if req["name"].lower() != kb.name.lower() \
                  and len(
              KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1:
              return get_data_error_result()

          if kb.pagerank != req.get("pagerank", 0):
+             if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch":
+                 return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch")
              if req.get("pagerank", 0) > 0:
                  settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]},
                                               search.index_name(kb.tenant_id), kb.id)
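Both new guards key off the `DOC_ENGINE` environment variable. A minimal standalone sketch of the intended behavior (the `validate_kb_update` helper and its return convention are illustrative, not part of the PR):

```python
import os

def validate_kb_update(req: dict, current_pagerank: int = 0) -> str | None:
    # Mirrors the two guards added in kb_app.py: the Tag chunking method is
    # rejected on Infinity, and pagerank changes are rejected unless the
    # doc engine is Elasticsearch. Returns an error message, or None if OK.
    doc_engine = os.environ.get("DOC_ENGINE", "elasticsearch")
    if req.get("parser_id", "") == "tag" and doc_engine == "infinity":
        return "The chunking method Tag has not been supported by Infinity yet."
    if current_pagerank != req.get("pagerank", 0) and doc_engine != "elasticsearch":
        return "'pagerank' can only be set when doc_engine is elasticsearch"
    return None

os.environ["DOC_ENGINE"] = "infinity"
print(validate_kb_update({"parser_id": "tag"}))    # rejected: Tag on Infinity
print(validate_kb_update({"pagerank": 5}))         # rejected: pagerank off Elasticsearch
print(validate_kb_update({"parser_id": "naive"}))  # None (accepted)
```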

conf/infinity_mapping.json (+1 -1)

"knowledge_graph_kwd": {"type": "varchar", "default": ""}, "knowledge_graph_kwd": {"type": "varchar", "default": ""},
"entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"pagerank_fea": {"type": "integer", "default": 0}, "pagerank_fea": {"type": "integer", "default": 0},
"tag_feas": {"type": "varchar", "default": ""},
"tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},


"from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
"to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},

docker/docker-compose-base.yml (+1 -1)

    container_name: ragflow-infinity
    profiles:
      - infinity
-   image: infiniflow/infinity:v0.6.0-dev4
+   image: infiniflow/infinity:v0.6.0-dev5
    volumes:
      - infinity_data:/var/infinity
      - ./infinity_conf.toml:/infinity_conf.toml

docker/infinity_conf.toml (+2 -2)

  log_file_rotate_count = 10

  # trace/debug/info/warning/error/critical 6 log levels, default: info
- log_level = "info"
+ log_level = "trace"

  [storage]
  persistence_dir = "/var/infinity/persistence"
  buffer_manager_size = "8GB"
  lru_num = 7
  temp_dir = "/var/infinity/tmp"
- result_cache = "on"
+ result_cache = "off"
  memindex_memory_quota = "1GB"

  [wal]

helm/values.yaml (+1 -1)

  infinity:
    image:
      repository: infiniflow/infinity
-     tag: v0.6.0-dev4
+     tag: v0.6.0-dev5
    storage:
      className:
      capacity: 5Gi

rag/nlp/query.py (+1 -1)

              keywords.append(f"{tk}^{w}")

          return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                              {"minimum_should_match": min(3, len(keywords) / 10)})
+                              {"minimum_should_match": min(3, len(keywords) // 10)})

rag/nlp/search.py (+1 -1)

              q_vec = matchDense.embedding_data
              src.append(f"q_{len(q_vec)}_vec")

-             fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"})
+             fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
              matchExprs = [matchText, matchDense, fusionExpr]

          res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
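Dropping the space in `"0.05, 0.95"` suggests the `weights` string is tokenized verbatim on commas by its consumer. A hedged sketch of such strict parsing (an assumption about the fusion-parameter parser, not code from Infinity):

```python
def parse_weights(weights: str) -> list[float]:
    # Assumes each comma-separated token must be a bare decimal with no
    # surrounding whitespace, which would explain the one-character fix.
    parts = weights.split(",")
    if any(p != p.strip() for p in parts):
        raise ValueError(f"whitespace in weights string: {weights!r}")
    return [float(p) for p in parts]

print(parse_weights("0.05,0.95"))  # [0.05, 0.95]
# parse_weights("0.05, 0.95")      # would raise ValueError
```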

rag/utils/infinity_conn.py (+73 -8)

  from infinity.connection_pool import ConnectionPool
  from infinity.errors import ErrorCode
  from rag import settings
- from rag.settings import PAGERANK_FLD
+ from rag.settings import PAGERANK_FLD, TAG_FLD
  from rag.utils import singleton
  import pandas as pd
  from api.utils.file_utils import get_project_base_directory

          df_list = list()
          table_list = list()
          output = selectFields.copy()
-         for essential_field in ["id"]:
+         for essential_field in ["id"] + aggFields:
              if essential_field not in output:
                  output.append(essential_field)
          score_func = ""
          if PAGERANK_FLD not in output:
              output.append(PAGERANK_FLD)
          output = [f for f in output if f != "_score"]
+         if limit <= 0:
+             # ElasticSearch default limit is 10000
+             limit = 10000


          # Prepare expressions common to all tables
          filter_cond = None
          filter_fulltext = ""
          if condition:
+             table_found = False
              for indexName in indexNames:
-                 table_name = f"{indexName}_{knowledgebaseIds[0]}"
-                 filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
-                 break
+                 for kb_id in knowledgebaseIds:
+                     table_name = f"{indexName}_{kb_id}"
+                     try:
+                         filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
+                         table_found = True
+                         break
+                     except Exception:
+                         pass
+                 if table_found:
+                     break
+             if not table_found:
+                 logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
+                 return pd.DataFrame(), 0
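The replacement loop is a "first table that exists wins" scan over every (index, knowledge base) pair, instead of assuming the table for `knowledgebaseIds[0]` exists. A standalone sketch of the pattern (names are illustrative; `get_table` stands in for `db_instance.get_table`, which raises when a table is missing):

```python
def find_first_table(index_names, kb_ids, get_table):
    # Return the first table handle that can actually be opened,
    # or None so the caller can log an error and return empty results.
    for index_name in index_names:
        for kb_id in kb_ids:
            try:
                return get_table(f"{index_name}_{kb_id}")
            except Exception:
                continue  # this pair has no table; keep scanning
    return None

tables = {"idx1_kb2": "handle"}
print(find_first_table(["idx1"], ["kb1", "kb2"], lambda name: tables[name]))  # 'handle'
```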


          for matchExpr in matchExprs:
              if isinstance(matchExpr, MatchTextExpr):
                  if isinstance(minimum_should_match, float):
                      str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                      matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
+
+                 # Add rank_feature support
+                 if rank_feature and "rank_features" not in matchExpr.extra_options:
+                     # Convert rank_feature dict to Infinity's rank_features string format
+                     # Format: "field^feature_name^weight,field^feature_name^weight"
+                     rank_features_list = []
+                     for feature_name, weight in rank_feature.items():
+                         # Use TAG_FLD as the field containing rank features
+                         rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
+                     if rank_features_list:
+                         matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)
+
                  for k, v in matchExpr.extra_options.items():
                      if not isinstance(v, str):
                          matchExpr.extra_options[k] = str(v)
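This is the heart of the PR: a `rank_feature` dict is flattened into Infinity's `field^feature_name^weight` option string. Assuming `TAG_FLD` resolves to the `tag_feas` column (the field given the `rankfeatures` analyzer in `conf/infinity_mapping.json` above), the conversion behaves like this:

```python
TAG_FLD = "tag_feas"  # assumed value; imported from rag/settings.py in the diff

def to_rank_features_option(rank_feature: dict) -> str:
    # Same loop as the diff: one "field^feature^weight" token per entry,
    # joined with commas into a single extra_options["rank_features"] string.
    return ",".join(f"{TAG_FLD}^{name}^{weight}" for name, weight in rank_feature.items())

print(to_rank_features_option({"sports": 2, "finance": 1}))
# tag_feas^sports^2,tag_feas^finance^1
```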
                      matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                  )
          else:
-             if len(filter_cond) > 0:
+             if filter_cond and len(filter_cond) > 0:
                  builder.filter(filter_cond)
          if orderBy.fields:
              builder.sort(order_by_expr_list)
                  k = column.lower()
                  if field_keyword(k):
                      res2[column] = res2[column].apply(lambda v: [kwd for kwd in v.split("###") if kwd])
+                 elif re.search(r"_feas$", k):
+                     res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
                  elif k == "position_int":
                      def to_position_int(v):
                          if v:
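Columns ending in `_feas` come back from Infinity as text; the new branch decodes them into dicts, which implies these columns hold JSON strings (consistent with `tag_feas` defaulting to `""` in the mapping). For example:

```python
import json

def decode_feas(v):
    # The same expression the diff applies per _feas column.
    return json.loads(v) if v else {}

print(decode_feas('{"sports": 2, "finance": 1}'))  # {'sports': 2, 'finance': 1}
print(decode_feas(""))                             # {}
```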


      def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
          """
-         TODO: Infinity doesn't provide aggregation
+         Manual aggregation for tag fields since Infinity doesn't provide native aggregation
          """
-         return list()
+         from collections import Counter
+
+         # Extract DataFrame from result
+         if isinstance(res, tuple):
+             df, _ = res
+         else:
+             df = res
+
+         if df.empty or fieldnm not in df.columns:
+             return []
+
+         # Aggregate tag counts
+         tag_counter = Counter()
+
+         for value in df[fieldnm]:
+             if pd.isna(value) or not value:
+                 continue
+
+             # Handle different tag formats
+             if isinstance(value, str):
+                 # Split by ### for tag_kwd field or comma for other formats
+                 if fieldnm == "tag_kwd" and "###" in value:
+                     tags = [tag.strip() for tag in value.split("###") if tag.strip()]
+                 else:
+                     # Try comma separation as fallback
+                     tags = [tag.strip() for tag in value.split(",") if tag.strip()]
+
+                 for tag in tags:
+                     if tag:  # Only count non-empty tags
+                         tag_counter[tag] += 1
+             elif isinstance(value, list):
+                 # Handle list format
+                 for tag in value:
+                     if tag and isinstance(tag, str):
+                         tag_counter[tag.strip()] += 1
+
+         # Return as list of [tag, count] pairs, sorted by count descending
+         return [[tag, count] for tag, count in tag_counter.most_common()]

      """
      SQL
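`getAggregation` now performs client-side the tag counting that Elasticsearch would do with a terms aggregation. A condensed, runnable re-implementation of the counting behavior on a toy DataFrame (the real method lives on the Infinity connection class):

```python
from collections import Counter
import pandas as pd

def aggregate_tags(df: pd.DataFrame, fieldnm: str) -> list[list]:
    # Count "###"-separated tags per row; return [tag, count] pairs,
    # most frequent first, matching the new method's output shape.
    counter = Counter()
    for value in df[fieldnm].dropna():
        if isinstance(value, str) and value:
            counter.update(t.strip() for t in value.split("###") if t.strip())
    return [[tag, n] for tag, n in counter.most_common()]

df = pd.DataFrame({"tag_kwd": ["ai###nlp", "ai", None]})
print(aggregate_tags(df, "tag_kwd"))  # [['ai', 2], ['nlp', 1]]
```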

sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py (+2 -2)

"params, expected_code, expected_page_size, expected_message", "params, expected_code, expected_page_size, expected_message",
[ [
({"page_size": None}, 0, 5, ""), ({"page_size": None}, 0, 5, ""),
pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 0, 5, ""),
pytest.param({"page_size": 0}, 100, 0, ""),
({"page_size": 1}, 0, 1, ""), ({"page_size": 1}, 0, 1, ""),
({"page_size": 6}, 0, 5, ""), ({"page_size": 6}, 0, 5, ""),
({"page_size": "1"}, 0, 1, ""), ({"page_size": "1"}, 0, 1, ""),

test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py (+1 -2)

"params, expected_code, expected_page_size, expected_message", "params, expected_code, expected_page_size, expected_message",
[ [
({"page_size": None}, 0, 5, ""), ({"page_size": None}, 0, 5, ""),
pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 0, 5, ""),
({"page_size": 1}, 0, 1, ""), ({"page_size": 1}, 0, 1, ""),
({"page_size": 6}, 0, 5, ""), ({"page_size": 6}, 0, 5, ""),
({"page_size": "1"}, 0, 1, ""), ({"page_size": "1"}, 0, 1, ""),

test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_list_chunks.py (+1 -2)

"params, expected_page_size, expected_message", "params, expected_page_size, expected_message",
[ [
({"page_size": None}, 5, ""), ({"page_size": None}, 5, ""),
pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
pytest.param({"page_size": 0}, 5, ""),
({"page_size": 1}, 1, ""), ({"page_size": 1}, 1, ""),
({"page_size": 6}, 5, ""), ({"page_size": 6}, 5, ""),
({"page_size": "1"}, 1, ""), ({"page_size": "1"}, 1, ""),

test/testcases/test_web_api/test_chunk_app/test_list_chunks.py (+1 -2)

"params, expected_code, expected_page_size, expected_message", "params, expected_code, expected_page_size, expected_message",
[ [
({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""), ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""),
pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
pytest.param({"size": 0}, 0, 5, ""),
({"size": 1}, 0, 1, ""), ({"size": 1}, 0, 1, ""),
({"size": 6}, 0, 5, ""), ({"size": 6}, 0, 5, ""),
({"size": "1"}, 0, 1, ""), ({"size": "1"}, 0, 1, ""),
