### What problem does this PR solve? This PR solves the problems mentioned in the PR (https://github.com/infiniflow/ragflow/pull/7140), which was also submitted by me ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### Introduction I fixed the problems that occur when using OpenSearch as the DOC_ENGINE: the pytest failures and the incorrect API return values. The fixes mainly cover delete chunk, list chunks, update chunk, and retrieval chunk. The pytest command "cd sdk/python && uv sync --python 3.10 --group test --frozen && source .venv/bin/activate && cd test/test_http_api && DOC_ENGINE=opensearch pytest test_chunk_management_within_dataset -s --tb=short" now succeeds. ### Others As some behaviors of Elasticsearch and OpenSearch differ, some pytest results for OpenSearch are correct and reasonable, but some pytest parameters (skipif params) were incompatible. So I changed some of the pytest skipif parameters. As a search engine programmer, I will continue to focus on the usage of vector databases (especially OpenSearch) for RAG. Thanks for your review. tags/v0.19.1
| @@ -217,7 +217,7 @@ class OSConnection(DocStoreConnection): | |||
| if bqry: | |||
| s = s.query(bqry) | |||
| for field in highlightFields: | |||
| s = s.highlight(field) | |||
| s = s.highlight(field,force_source=True,no_match_size=30,require_field_match=False) | |||
| if orderBy: | |||
| orders = list() | |||
| @@ -269,7 +269,7 @@ class OSConnection(DocStoreConnection): | |||
| for i in range(ATTEMPT_TIME): | |||
| try: | |||
| res = self.os.get(index=(indexName), | |||
| id=chunkId, source=True, ) | |||
| id=chunkId, _source=True, ) | |||
| if str(res.get("timed_out", "")).lower() == "true": | |||
| raise Exception("Es Timeout.") | |||
| chunk = res["_source"] | |||
| @@ -329,7 +329,7 @@ class OSConnection(DocStoreConnection): | |||
| chunkId = condition["id"] | |||
| for i in range(ATTEMPT_TIME): | |||
| try: | |||
| self.os.update(index=indexName, id=chunkId, doc=doc) | |||
| self.os.update(index=indexName, id=chunkId, body=doc) | |||
| return True | |||
| except Exception as e: | |||
| logger.exception( | |||
| @@ -411,7 +411,10 @@ class OSConnection(DocStoreConnection): | |||
| chunk_ids = condition["id"] | |||
| if not isinstance(chunk_ids, list): | |||
| chunk_ids = [chunk_ids] | |||
| qry = Q("ids", values=chunk_ids) | |||
| if not chunk_ids: # when chunk_ids is empty, delete all | |||
| qry = Q("match_all") | |||
| else: | |||
| qry = Q("ids", values=chunk_ids) | |||
| else: | |||
| qry = Q("bool") | |||
| for k, v in condition.items(): | |||
| @@ -69,7 +69,7 @@ class TestChunksList: | |||
| [ | |||
| ({"page_size": None}, 0, 5, ""), | |||
| pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), | |||
| pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="Infinity does not support page_size=0")), | |||
| pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), | |||
| ({"page_size": 1}, 0, 1, ""), | |||
| ({"page_size": 6}, 0, 5, ""), | |||
| ({"page_size": "1"}, 0, 1, ""), | |||
| @@ -185,28 +185,28 @@ class TestChunksRetrieval: | |||
| 0, | |||
| 4, | |||
| "", | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity"), | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in ["infinity", "opensearch"], reason="Infinity"), | |||
| ), | |||
| pytest.param( | |||
| {"top_k": 1}, | |||
| 0, | |||
| 1, | |||
| "", | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="elasticsearch"), | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="elasticsearch"), | |||
| ), | |||
| pytest.param( | |||
| {"top_k": -1}, | |||
| 100, | |||
| 4, | |||
| "must be greater than 0", | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity"), | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in ["infinity", "opensearch"], reason="Infinity"), | |||
| ), | |||
| pytest.param( | |||
| {"top_k": -1}, | |||
| 100, | |||
| 4, | |||
| "3014", | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="elasticsearch"), | |||
| marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="elasticsearch"), | |||
| ), | |||
| pytest.param( | |||
| {"top_k": "a"}, | |||
| @@ -146,7 +146,7 @@ class TestUpdatedChunk: | |||
| [ | |||
| ("", 100, "<NotFound '404: Not Found'>"), | |||
| pytest.param("invalid_dataset_id", 102, "You don't own the dataset invalid_dataset_id.", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="infinity")), | |||
| pytest.param("invalid_dataset_id", 102, "Can't find this chunk", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="elasticsearch")), | |||
| pytest.param("invalid_dataset_id", 102, "Can't find this chunk", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch","elasticsearch"], reason="elasticsearch")), | |||
| ], | |||
| ) | |||
| def test_invalid_dataset_id(self, get_http_api_auth, add_chunks, dataset_id, expected_code, expected_message): | |||