### What problem does this PR solve?

Refactor Dataset API

### Type of change

- [x] Refactoring

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
@@ -83,7 +83,7 @@ def register_page(page_path):
     sys.modules[module_name] = page
     spec.loader.exec_module(page)
     page_name = getattr(page, 'page_name', page_name)
-    url_prefix = f'/api/{API_VERSION}/{page_name}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
+    url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'

     app.register_blueprint(page.manager, url_prefix=url_prefix)
     return url_prefix
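With this change, blueprints under `/sdk/` are mounted at the bare `/api/{API_VERSION}` prefix, so each SDK module now declares full resource routes such as `/dataset` itself. A minimal sketch of the resulting prefix logic, assuming `API_VERSION = "v1"` (the helper name and sample paths are illustrative):

```python
API_VERSION = "v1"

def resolve_url_prefix(path: str, page_name: str) -> str:
    # SDK pages now share the bare '/api/v1' prefix; web pages keep '/v1/<page_name>'.
    return f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'

assert resolve_url_prefix("api/apps/sdk/dataset.py", "dataset") == "/api/v1"
assert resolve_url_prefix("api/apps/kb_app.py", "kb_app") == "/v1/kb_app"
```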
@@ -25,143 +25,146 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+from api.utils.api_utils import get_result, token_required, get_error_data_result


-@manager.route('/save', methods=['POST'])
+@manager.route('/dataset', methods=['POST'])
 @token_required
-def save(tenant_id):
+def create(tenant_id):
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
-    if "id" not in req:
-        if "tenant_id" in req or "embedding_model" in req:
-            return get_data_error_result(
-                retmsg="Tenant_id or embedding_model must not be provided")
-        if "name" not in req:
-            return get_data_error_result(
-                retmsg="Name is not empty!")
-        req['id'] = get_uuid()
-        req["name"] = req["name"].strip()
-        if req["name"] == "":
-            return get_data_error_result(
-                retmsg="Name is not empty string!")
-        if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
-            return get_data_error_result(
-                retmsg="Duplicated knowledgebase name in creating dataset.")
-        req["tenant_id"] = req['created_by'] = tenant_id
-        req['embedding_model'] = t.embd_id
-        key_mapping = {
-            "chunk_num": "chunk_count",
-            "doc_num": "document_count",
-            "parser_id": "parse_method",
-            "embd_id": "embedding_model"
-        }
-        mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
-        req.update(mapped_keys)
-        if not KnowledgebaseService.save(**req):
-            return get_data_error_result(retmsg="Create dataset error.(Database error)")
-        renamed_data = {}
-        e, k = KnowledgebaseService.get_by_id(req["id"])
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
-        if any(key in req for key in invalid_keys):
-            return get_data_error_result(retmsg="The input parameters are invalid.")
-        if "tenant_id" in req:
-            if req["tenant_id"] != tenant_id:
-                return get_data_error_result(
-                    retmsg="Can't change tenant_id.")
-        if "embedding_model" in req:
-            if req["embedding_model"] != t.embd_id:
-                return get_data_error_result(
-                    retmsg="Can't change embedding_model.")
-            req.pop("embedding_model")
-        if not KnowledgebaseService.query(
-                created_by=tenant_id, id=req["id"]):
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if not req["id"]:
-            return get_data_error_result(
-                retmsg="id can not be empty.")
-        e, kb = KnowledgebaseService.get_by_id(req["id"])
-        if "chunk_count" in req:
-            if req["chunk_count"] != kb.chunk_num:
-                return get_data_error_result(
-                    retmsg="Can't change chunk_count.")
-            req.pop("chunk_count")
-        if "document_count" in req:
-            if req['document_count'] != kb.doc_num:
-                return get_data_error_result(
-                    retmsg="Can't change document_count.")
-            req.pop("document_count")
-        if "parse_method" in req:
-            if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
-                return get_data_error_result(
-                    retmsg="If chunk count is not 0, parse method is not changable.")
-            req['parser_id'] = req.pop('parse_method')
-        if "name" in req:
-            req["name"] = req["name"].strip()
-            if req["name"].lower() != kb.name.lower() \
-                    and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
-                                                       status=StatusEnum.VALID.value)) > 0:
-                return get_data_error_result(
-                    retmsg="Duplicated knowledgebase name in updating dataset.")
-        del req["id"]
-        if not KnowledgebaseService.update_by_id(kb.id, req):
-            return get_data_error_result(retmsg="Update dataset error.(Database error)")
-        return get_json_result(data=True)
+    if "tenant_id" in req or "embedding_model" in req:
+        return get_error_data_result(
+            retmsg="Tenant_id or embedding_model must not be provided")
+    chunk_count = req.get("chunk_count")
+    document_count = req.get("document_count")
+    if chunk_count or document_count:
+        return get_error_data_result(retmsg="chunk_count or document_count must be 0 or not be provided")
+    if "name" not in req:
+        return get_error_data_result(
+            retmsg="Name must not be empty!")
+    req['id'] = get_uuid()
+    req["name"] = req["name"].strip()
+    if req["name"] == "":
+        return get_error_data_result(
+            retmsg="Name must not be an empty string!")
+    if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
+        return get_error_data_result(
+            retmsg="Duplicated knowledgebase name in creating dataset.")
+    req["tenant_id"] = req['created_by'] = tenant_id
+    req['embedding_model'] = t.embd_id
+    key_mapping = {
+        "chunk_num": "chunk_count",
+        "doc_num": "document_count",
+        "parser_id": "parse_method",
+        "embd_id": "embedding_model"
+    }
+    mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
+    req.update(mapped_keys)
+    if not KnowledgebaseService.save(**req):
+        return get_error_data_result(retmsg="Create dataset error.(Database error)")
+    renamed_data = {}
+    e, k = KnowledgebaseService.get_by_id(req["id"])
+    for key, value in k.to_dict().items():
+        new_key = key_mapping.get(key, key)
+        renamed_data[new_key] = value
+    return get_result(data=renamed_data)
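The `key_mapping` dictionary above is what translates internal `Knowledgebase` column names into the public API field names. A standalone sketch of the renaming step (the mapping mirrors the handler; the sample row is made up):

```python
key_mapping = {
    "chunk_num": "chunk_count",
    "doc_num": "document_count",
    "parser_id": "parse_method",
    "embd_id": "embedding_model",
}

# Hypothetical row, as k.to_dict() might return it.
row = {"id": "8d730768", "name": "test", "chunk_num": 0, "parser_id": "naive"}

renamed = {key_mapping.get(key, key): value for key, value in row.items()}
print(renamed)  # {'id': '8d730768', 'name': 'test', 'chunk_count': 0, 'parse_method': 'naive'}
```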
-@manager.route('/delete', methods=['DELETE'])
+@manager.route('/dataset', methods=['DELETE'])
 @token_required
 def delete(tenant_id):
-    req = request.args
-    if "id" not in req:
-        return get_data_error_result(
-            retmsg="id is required")
-    kbs = KnowledgebaseService.query(
-        created_by=tenant_id, id=req["id"])
-    if not kbs:
-        return get_json_result(
-            data=False, retmsg='You do not own the dataset',
-            retcode=RetCode.OPERATING_ERROR)
-    for doc in DocumentService.query(kb_id=req["id"]):
-        if not DocumentService.remove_document(doc, kbs[0].tenant_id):
-            return get_data_error_result(
-                retmsg="Remove document error.(Database error)")
-        f2d = File2DocumentService.get_by_document_id(doc.id)
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
-        File2DocumentService.delete_by_document_id(doc.id)
-    if not KnowledgebaseService.delete_by_id(req["id"]):
-        return get_data_error_result(
-            retmsg="Delete dataset error.(Database serror)")
-    return get_json_result(data=True)
-
-
-@manager.route('/list', methods=['GET'])
+    req = request.json
+    names = req.get("names")
+    ids = req.get("ids")
+    if not ids and not names:
+        return get_error_data_result(
+            retmsg="ids or names is required")
+    id_list = []
+    if names:
+        for name in names:
+            kbs = KnowledgebaseService.query(name=name, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {name}")
+            id_list.append(kbs[0].id)
+    if ids:
+        for id in ids:
+            kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {id}")
+        id_list.extend(ids)
+    for id in id_list:
+        for doc in DocumentService.query(kb_id=id):
+            if not DocumentService.remove_document(doc, tenant_id):
+                return get_error_data_result(
+                    retmsg="Remove document error.(Database error)")
+            f2d = File2DocumentService.get_by_document_id(doc.id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc.id)
+        if not KnowledgebaseService.delete_by_id(id):
+            return get_error_data_result(
+                retmsg="Delete dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
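Because the new endpoint reads `request.json` rather than query parameters, a DELETE request must carry a JSON body — which is also why the SDK's `delete` helper below switches from `params=` to `json=`. A hedged sketch of a raw call (address and token are placeholders):

```python
import requests

BASE_URL = "http://127.0.0.1:9380/api/v1"  # placeholder address
HEADERS = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}  # placeholder token

res = requests.delete(f"{BASE_URL}/dataset",
                      json={"names": ["test_1", "test_2"]},  # or {"ids": [...]}
                      headers=HEADERS)
body = res.json()
assert body.get("code") == 0, body.get("message")
```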
+@manager.route('/dataset/<dataset_id>', methods=['PUT'])
+@token_required
+def update(tenant_id, dataset_id):
+    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
+        return get_error_data_result(retmsg="You don't own the dataset")
+    req = request.json
+    e, t = TenantService.get_by_id(tenant_id)
+    invalid_keys = {"id", "embd_id", "chunk_num", "doc_num", "parser_id"}
+    if any(key in req for key in invalid_keys):
+        return get_error_data_result(retmsg="The input parameters are invalid.")
+    if "tenant_id" in req:
+        if req["tenant_id"] != tenant_id:
+            return get_error_data_result(
+                retmsg="Can't change tenant_id.")
+    if "embedding_model" in req:
+        if req["embedding_model"] != t.embd_id:
+            return get_error_data_result(
+                retmsg="Can't change embedding_model.")
+        req.pop("embedding_model")
+    e, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if "chunk_count" in req:
+        if req["chunk_count"] != kb.chunk_num:
+            return get_error_data_result(
+                retmsg="Can't change chunk_count.")
+        req.pop("chunk_count")
+    if "document_count" in req:
+        if req['document_count'] != kb.doc_num:
+            return get_error_data_result(
+                retmsg="Can't change document_count.")
+        req.pop("document_count")
+    if "parse_method" in req:
+        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
+            return get_error_data_result(
+                retmsg="If chunk count is not 0, parse method is not changeable.")
+        req['parser_id'] = req.pop('parse_method')
+    if "name" in req:
+        req["name"] = req["name"].strip()
+        if req["name"].lower() != kb.name.lower() \
+                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
+                                                   status=StatusEnum.VALID.value)) > 0:
+            return get_error_data_result(
+                retmsg="Duplicated knowledgebase name in updating dataset.")
+    if not KnowledgebaseService.update_by_id(kb.id, req):
+        return get_error_data_result(retmsg="Update dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
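The handler treats `chunk_count`, `document_count`, and `embedding_model` as read-only: a request may echo the current value, which is silently dropped, but may never change it. A distilled sketch of that check-then-pop pattern (the helper name is illustrative, not part of the PR):

```python
def reject_if_changed(req: dict, field: str, current):
    """Drop `field` from req when it matches `current`; report an error otherwise."""
    if field in req:
        if req[field] != current:
            return f"Can't change {field}."
        req.pop(field)
    return None

req = {"name": "new_name", "chunk_count": 0}
assert reject_if_changed(req, "chunk_count", 0) is None and "chunk_count" not in req
assert reject_if_changed({"chunk_count": 5}, "chunk_count", 0) == "Can't change chunk_count."
```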
+@manager.route('/dataset', methods=['GET'])
 @token_required
-def list_datasets(tenant_id):
+def list(tenant_id):
+    id = request.args.get("id")
+    name = request.args.get("name")
+    kbs = KnowledgebaseService.query(id=id, name=name, status=1)
+    if not kbs:
+        return get_error_data_result(retmsg="The dataset doesn't exist")
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
     desc = bool(request.args.get("desc", True))
     tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
-    kbs = KnowledgebaseService.get_by_tenant_ids(
-        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
+    kbs = KnowledgebaseService.get_list(
+        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc, id, name)
     renamed_list = []
     for kb in kbs:
         key_mapping = {
@@ -175,50 +178,4 @@ def list_datasets(tenant_id):
             new_key = key_mapping.get(key, key)
             renamed_data[new_key] = value
         renamed_list.append(renamed_data)
-    return get_json_result(data=renamed_list)
-
-
-@manager.route('/detail', methods=['GET'])
-@token_required
-def detail(tenant_id):
-    req = request.args
-    key_mapping = {
-        "chunk_num": "chunk_count",
-        "doc_num": "document_count",
-        "parser_id": "parse_method",
-        "embd_id": "embedding_model"
-    }
-    renamed_data = {}
-    if "id" in req:
-        id = req["id"]
-        kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
-        if not kb:
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if "name" in req:
-            name = req["name"]
-            if kb[0].name != name:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-        e, k = KnowledgebaseService.get_by_id(id)
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        if "name" in req:
-            name = req["name"]
-            e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
-            if not e:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-            for key, value in k.to_dict().items():
-                new_key = key_mapping.get(key, key)
-                renamed_data[new_key] = value
-            return get_json_result(data=renamed_data)
-        else:
-            return get_data_error_result(
-                retmsg="At least one of `id` or `name` must be provided.")
+    return get_result(data=renamed_list)
@@ -142,3 +142,27 @@ class KnowledgebaseService(CommonService):
     @DB.connection_context()
     def get_all_ids(cls):
         return [m["id"] for m in cls.model.select(cls.model.id).dicts()]
+
+    @classmethod
+    @DB.connection_context()
+    def get_list(cls, joined_tenant_ids, user_id,
+                 page_number, items_per_page, orderby, desc, id, name):
+        kbs = cls.model.select()
+        if id:
+            kbs = kbs.where(cls.model.id == id)
+        if name:
+            kbs = kbs.where(cls.model.name == name)
+        kbs = kbs.where(
+            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                            TenantPermission.TEAM.value)) | (
+                    cls.model.tenant_id == user_id))
+            & (cls.model.status == StatusEnum.VALID.value)
+        )
+        if desc:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+        kbs = kbs.paginate(page_number, items_per_page)
+        return list(kbs.dicts())
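Peewee combines successive `.where()` calls with AND, so the optional `id` and `name` filters simply narrow the tenant-visibility query. A hedged, self-contained sketch of the same chaining pattern (the model is an illustrative stand-in, not the real `Knowledgebase`):

```python
from peewee import SqliteDatabase, Model, CharField

db = SqliteDatabase(":memory:")

class KB(Model):  # stand-in model for illustration only
    name = CharField()
    tenant_id = CharField()
    status = CharField(default="1")

    class Meta:
        database = db

db.create_tables([KB])
KB.create(name="ds1", tenant_id="t1")
KB.create(name="ds2", tenant_id="t2")

q = KB.select()
q = q.where(KB.name == "ds1")   # optional filter, applied only when provided
q = q.where(KB.status == "1")   # successive where() calls AND together
print(list(q.dicts()))          # e.g. [{'id': 1, 'name': 'ds1', 'tenant_id': 't1', 'status': '1'}]
```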
@@ -5,63 +5,134 @@
 **POST** `/api/v1/dataset`

-Creates a dataset with a name. If dataset of the same name already exists, the new dataset will be renamed by RAGFlow automatically.
+Creates a dataset.

 ### Request

 - Method: POST
-- URL: `/api/v1/dataset`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 - Body:
-  - `"dataset_name"`: `string`
+  - `"id"`: `string`
+  - `"name"`: `string`
+  - `"avatar"`: `string`
   - `"tenant_id"`: `string`
+  - `"description"`: `string`
+  - `"language"`: `string`
   - `"embedding_model"`: `string`
-  - `"chunk_count"`: `integer`
+  - `"permission"`: `string`
   - `"document_count"`: `integer`
+  - `"chunk_count"`: `integer`
   - `"parse_method"`: `string`
+  - `"parser_config"`: `Dataset.ParserConfig`

 #### Request example

-```shell
+```bash
+# "id": id must not be provided.
+# "name": name is required and can't be duplicated.
+# "tenant_id": tenant_id must not be provided.
+# "embedding_model": embedding_model must not be provided.
+# "naive" means general.
 curl --request POST \
-     --url http://{address}/api/v1/dataset \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
-     --data-binary '{
-     "dataset_name": "test",
-     "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-     "embedding_model": "BAAI/bge--zh-v1.5",
-     "chunk_count": 0,
-     "document_count": 0,
-     "parse_method": "general"
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+         "name": "test",
+         "chunk_count": 0,
+         "document_count": 0,
+         "parse_method": "naive"
 }'
 ```
 #### Request parameters

-- `"dataset_name"`: (*Body parameter*)
+- `"id"`: (*Body parameter*)
+  The ID of the created dataset used to uniquely identify different datasets.
+  - If creating a dataset, `id` must not be provided.
+- `"name"`: (*Body parameter*)
   The name of the dataset, which must adhere to the following requirements:
   - Maximum 65,535 characters.
+  - Required when creating a dataset and must be unique.
+  - If updating a dataset, `name` must still be unique.
+- `"avatar"`: (*Body parameter*)
+  Base64 encoding of the avatar.
 - `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
+  The ID of the tenant associated with the dataset, used to link it with specific users.
+  - If creating a dataset, `tenant_id` must not be provided.
+  - If updating a dataset, `tenant_id` cannot be changed.
+- `"description"`: (*Body parameter*)
+  The description of the dataset.
+- `"language"`: (*Body parameter*)
+  The language setting for the dataset.
 - `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
+  Embedding model used in the dataset to generate vector embeddings.
+  - If creating a dataset, `embedding_model` must not be provided.
+  - If updating a dataset, `embedding_model` cannot be changed.
+- `"permission"`: (*Body parameter*)
+  Specifies who can manipulate the dataset.
 - `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.
+  Document count of the dataset.
+  - If updating a dataset, `document_count` cannot be changed.
+- `"chunk_count"`: (*Body parameter*)
+  Chunk count of the dataset.
+  - If updating a dataset, `chunk_count` cannot be changed.
+- `"parse_method"`: (*Body parameter*)
+  Parsing method of the dataset.
+  - If updating `parse_method`, `chunk_count` must be 0.
+- `"parser_config"`: (*Body parameter*)
+  The configuration settings for the dataset parser.
 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 0
+    "code": 0,
+    "data": {
+        "avatar": null,
+        "chunk_count": 0,
+        "create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "create_time": 1728539857641,
+        "created_by": "69736c5e723611efb51b0242ac120007",
+        "description": null,
+        "document_count": 0,
+        "embedding_model": "BAAI/bge-large-zh-v1.5",
+        "id": "8d73076886cc11ef8c270242ac120006",
+        "language": "English",
+        "name": "test_1",
+        "parse_method": "naive",
+        "parser_config": {
+            "pages": [
+                [
+                    1,
+                    1000000
+                ]
+            ]
+        },
+        "permission": "me",
+        "similarity_threshold": 0.2,
+        "status": "1",
+        "tenant_id": "69736c5e723611efb51b0242ac120007",
+        "token_num": 0,
+        "update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "update_time": 1728539857641,
+        "vector_similarity_weight": 0.3
+    }
 }
 ```
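For comparison, the same call through the Python SDK refactored in this PR (a sketch; the key and base URL are placeholders):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="<YOUR_ACCESS_TOKEN>", base_url="http://127.0.0.1:9380")
ds = rag.create_dataset(name="test")  # builds a DataSet from the "data" object shown above
print(ds.id, ds.embedding_model)
```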
@@ -71,10 +142,10 @@ The successful response includes a JSON object like the following:

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "Duplicated knowledgebase name in creating dataset."
 }
 ```
@@ -82,27 +153,31 @@ The error response includes a JSON object like the following:

 **DELETE** `/api/v1/dataset`

-Deletes a dataset by its id or name.
+Deletes datasets by ids or names.

 ### Request

 - Method: DELETE
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body:
+  - `"names"`: `List[string]`
+  - `"ids"`: `List[string]`

 #### Request example

-```shell
+```bash
+# Either ids or names must be provided, but not both.
 curl --request DELETE \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data ' {
-     "names": ["ds1", "ds2"]
-     }'
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+         "names": ["test_1", "test_2"]
+     }'
 ```
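The SDK equivalent of the request above (a sketch with placeholder credentials; `delete_dataset` raises on a non-zero `code`):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="<YOUR_ACCESS_TOKEN>", base_url="http://127.0.0.1:9380")
rag.delete_dataset(names=["test_1", "test_2"])  # or: rag.delete_dataset(ids=[...])
```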
 #### Request parameters

@@ -118,7 +193,7 @@ curl --request DELETE \

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0
 }
 ```

@@ -130,10 +205,10 @@ The successful response includes a JSON object like the following:

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Try to delete non-existent dataset."
+    "code": 102,
+    "message": "You don't own the dataset."
 }
 ```
@@ -146,50 +221,47 @@ Updates a dataset by its id.

 ### Request

 - Method: PUT
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset/{dataset_id}`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body: (Refer to the "Create Dataset" for the complete structure of the request body.)

 #### Request example

-```shell
+```bash
+# "id": id is required.
+# "name": If you update name, it can't be duplicated.
+# "tenant_id": If you update tenant_id, it can't be changed.
+# "embedding_model": If you update embedding_model, it can't be changed.
+# "chunk_count": If you update chunk_count, it can't be changed.
+# "document_count": If you update document_count, it can't be changed.
+# "parse_method": If you update parse_method, chunk_count must be 0.
+# "naive" means general.
 curl --request PUT \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data-binary '{
-     "dataset_name": "test",
-     "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-     "embedding_model": "BAAI/bge--zh-v1.5",
-     "chunk_count": 0,
-     "document_count": 0,
-     "parse_method": "general"
+     --url http://{address}/api/v1/dataset/{dataset_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+         "name": "test",
+         "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
+         "embedding_model": "BAAI/bge-zh-v1.5",
+         "chunk_count": 0,
+         "document_count": 0,
+         "parse_method": "naive"
 }'
 ```
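The SDK path for the same update (a sketch; it fetches the dataset through `list_datasets` first, since `get_dataset` is removed in this PR):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="<YOUR_ACCESS_TOKEN>", base_url="http://127.0.0.1:9380")
ds = rag.list_datasets(name="test")[0]
ds.update({"name": "test_renamed"})  # raises Exception when the server returns a non-zero code
```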
 #### Request parameters

+(Refer to the "Create Dataset" for the complete structure of the request parameters.)
-- `"dataset_name"`: (*Body parameter*)
-  The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
-- `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
-- `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
-- `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0
 }
 ```

@@ -201,35 +273,37 @@ The successful response includes a JSON object like the following:

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't change embedding model since some files already use it."
+    "code": 102,
+    "message": "Can't change tenant_id."
 }
 ```
 ## List datasets

-**GET** `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+**GET** `/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`

 Lists all datasets.

 ### Request

 - Method: GET
-- URL: `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+- URL: `http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

 #### Request example

-```shell
+```bash
+# If no page parameter is passed, the default is 1.
+# If no page_size parameter is passed, the default is 1024.
+# If no orderby parameter is passed, the default is "create_time".
+# If no desc parameter is passed, the default is True.
 curl --request GET \
-     --url http://{address}/api/v1/dataset?page=0&page_size=50&orderby=create_time&desc=false \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+     --url http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id} \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 ```
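And the SDK equivalent (a sketch; every filter argument is optional):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="<YOUR_ACCESS_TOKEN>", base_url="http://127.0.0.1:9380")
for ds in rag.list_datasets(page=1, page_size=50, orderby="create_time", desc=True):
    print(ds.name, ds.chunk_count)
```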
 #### Request parameters

@@ -244,54 +318,63 @@ curl --request GET \

   A boolean flag indicating whether the sorting should be in descending order.
-- `name`: (*Path parameter*)
-  Dataset name
+- `"id"`: (*Path parameter*)
+  The ID of the dataset to be retrieved.
+- `"name"`: (*Path parameter*)
+  The name of the dataset to be retrieved.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0,
     "data": [
         {
-            "avatar": "",
-            "chunk_count": 0,
-            "create_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "create_time": 1724901187843,
-            "created_by": "4fb0cd625f9311efba4a0242ac120006",
-            "description": "",
-            "document_count": 0,
-            "embedding_model": "BAAI/bge-large-zh-v1.5",
-            "id": "9d3d906665b411ef87d10242ac120006",
-            "language": "English",
-            "name": "Test",
-            "parser_config": {
-                "chunk_token_count": 128,
-                "delimiter": "\n!?。;!?",
-                "layout_recognize": true,
-                "task_page_size": 12
-            },
-            "parse_method": "naive",
-            "permission": "me",
-            "similarity_threshold": 0.2,
-            "status": "1",
-            "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-            "token_count": 0,
-            "update_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "update_time": 1724901187843,
-            "vector_similarity_weight": 0.3
+            "avatar": "",
+            "chunk_count": 59,
+            "create_date": "Sat, 14 Sep 2024 01:12:37 GMT",
+            "create_time": 1726276357324,
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "description": null,
+            "document_count": 1,
+            "embedding_model": "BAAI/bge-large-zh-v1.5",
+            "id": "6e211ee0723611efa10a0242ac120007",
+            "language": "English",
+            "name": "mysql",
+            "parse_method": "knowledge_graph",
+            "parser_config": {
+                "chunk_token_num": 8192,
+                "delimiter": "\\n!?;。;!?",
+                "entity_types": [
+                    "organization",
+                    "person",
+                    "location",
+                    "event",
+                    "time"
+                ]
+            },
+            "permission": "me",
+            "similarity_threshold": 0.2,
+            "status": "1",
+            "tenant_id": "69736c5e723611efb51b0242ac120007",
+            "token_num": 12744,
+            "update_date": "Thu, 10 Oct 2024 04:07:23 GMT",
+            "update_time": 1728533243536,
+            "vector_similarity_weight": 0.3
         }
-    ],
+    ]
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't access database to get the dataset list."
+    "code": 102,
+    "message": "The dataset doesn't exist"
 }
 ```
@@ -38,9 +38,9 @@ The unique name of the dataset to create. It must adhere to the following requirements:

 #### avatar: `str`

-The url or ???????????????????????? path to the avatar image associated with the created dataset. Defaults to `""`
+Base64 encoding of the avatar. Defaults to `""`.

-#### tenant_id: `str` ?????????????????
+#### tenant_id: `str`

 The id of the tenant associated with the created dataset, used to identify different users. Defaults to `None`.

@@ -55,9 +55,9 @@ The description of the created dataset. Defaults to `""`.

 The language setting of the created dataset. Defaults to `"English"`. ????????????

-#### embedding_model: `str` ????????????????
+#### embedding_model: `str`

-The specific model or algorithm used by the dataset to generate vector embeddings. Defaults to `""`.
+The specific model used by the dataset to generate vector embeddings. Defaults to `""`.
+
+- If creating a dataset, `embedding_model` must not be provided.
+- If updating a dataset, `embedding_model` can't be changed.

@@ -89,12 +89,10 @@ The method used by the dataset to parse and process data.

 The configuration settings for the parser used by the dataset.

 ### Returns

-- Success: An `infinity.local_infinity.table.LocalTable` object in Python module mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
-- Failure: `InfinityException`
-  - `error_code`: `int` A non-zero value indicating a specific error condition.
-  - `error_msg`: `str` A message providing additional details about the error.
+```python
+DataSet
+description: dataset object
+```
 ### Examples

 ```python
@@ -106,19 +104,28 @@ ds = rag.create_dataset(name="kb_1")

 ---

-## Delete knowledge base
+## Delete knowledge bases

 ```python
-DataSet.delete() -> bool
+RAGFlow.delete_dataset(ids: List[str] = None, names: List[str] = None)
 ```

+Deletes knowledge bases.
+### Parameters
-Deletes a knowledge base.
+#### ids: `List[str]`
-### Returns
+The ids of the datasets to be deleted.
+#### names: `List[str]`
-`bool`
+The names of the datasets to be deleted.
-description:the case of updating an dateset, `True` or `False`.
+Either `ids` or `names` must be provided, but not both.
+### Returns
+```python
+no return
+```

 ### Examples

@@ -126,8 +133,8 @@ description:the case of updating an dateset, `True` or `False`.
 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
 ds = rag.create_dataset(name="kb_1")
-ds.delete()
+rag.delete_dataset(names=["name_1","name_2"])
+rag.delete_dataset(ids=["id_1","id_2"])
 ```
 ---

@@ -139,7 +146,9 @@ RAGFlow.list_datasets(
     page: int = 1,
     page_size: int = 1024,
     orderby: str = "create_time",
-    desc: bool = True
+    desc: bool = True,
+    id: str = None,
+    name: str = None
 ) -> List[DataSet]
 ```

@@ -163,51 +172,19 @@ The field by which the records should be sorted. This specifies the attribute or

 Whether the sorting should be in descending order. Defaults to `True`.

 ### Returns

 ```python
 List[DataSet]
 description: the list of datasets.
 ```

 ### Examples

 ```python
 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
 for ds in rag.list_datasets():
     print(ds)
 ```
 ---

-## Retrieve knowledge base
-
-```python
-RAGFlow.get_dataset(
-    id: str = None,
-    name: str = None
-) -> DataSet
-```
+#### id: `str`
-Retrieves a knowledge base by name.
-### Parameters
+The id of the dataset to be got. Defaults to `None`.
 #### name: `str`
-The name of the dataset to be got. If `id` is not provided, `name` is required.
-#### id: `str`
-The id of the dataset to be got. If `name` is not provided, `id` is required.
+The name of the dataset to be got. Defaults to `None`.

 ### Returns

 ```python
-DataSet
-description: dataset object
+List[DataSet]
+description: the list of datasets.
 ```

 ### Examples

@@ -216,23 +193,23 @@ description: dataset object
 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.get_dataset(name="ragflow")
-print(ds)
+for ds in rag.list_datasets():
+    print(ds)
 ```
 ---

-## Save knowledge base configurations
+## Update knowledge base

 ```python
-DataSet.save() -> bool
+DataSet.update(update_message: dict)
 ```

 ### Returns

 ```python
-bool
-description:the case of updating an dateset, True or False.
+no return
 ```

 ### Examples

@@ -242,8 +219,7 @@ from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
 ds = rag.get_dataset(name="kb_1")
-ds.parse_method = "manual"
-ds.save()
+ds.update({"parse_method": "manual", ...})
 ```
 ---

@@ -268,3 +268,32 @@ def token_required(func):
         return func(*args, **kwargs)
     return decorated_function
+
+
+def get_result(retcode=RetCode.SUCCESS, retmsg='error', data=None):
+    if retcode == 0:
+        if data is not None:
+            response = {"code": retcode, "data": data}
+        else:
+            response = {"code": retcode}
+    else:
+        response = {"code": retcode, "message": retmsg}
+    return jsonify(response)
+
+
+def get_error_data_result(retcode=RetCode.DATA_ERROR,
+                          retmsg='Sorry! Data missing!'):
+    import re
+    result_dict = {
+        "code": retcode,
+        "message": re.sub(
+            r"rag",
+            "seceum",
+            retmsg,
+            flags=re.IGNORECASE)}
+    response = {}
+    for key, value in result_dict.items():
+        if value is None and key != "code":
+            continue
+        else:
+            response[key] = value
+    return jsonify(response)
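A sketch of the JSON shapes the two helpers produce; the `re.sub` call masks the substring "rag" in outgoing error messages. Values below assume `RetCode.SUCCESS == 0` and `RetCode.DATA_ERROR == 102`, matching the error examples in the docs above:

```python
get_result(data={"id": "abc"})   # -> {"code": 0, "data": {"id": "abc"}}
get_result()                     # -> {"code": 0}
get_error_data_result(retmsg="ids or names is required")
# -> {"code": 102, "message": "ids or names is required"}
get_error_data_result(retmsg="ragflow error")
# -> {"code": 102, "message": "seceumflow error"}  (the "rag" substring is masked)
```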
@@ -30,5 +30,9 @@ class Base(object):
         res = self.rag.delete(path, params)
         return res

+    def put(self, path, json):
+        res = self.rag.put(path, json)
+        return res
+
     def __str__(self):
         return str(self.to_json())
@@ -32,24 +32,13 @@ class DataSet(Base):
             res_dict.pop(k)
         super().__init__(rag, res_dict)

-    def save(self) -> bool:
-        res = self.post('/dataset/save',
-                        {"id": self.id, "name": self.name, "avatar": self.avatar, "tenant_id": self.tenant_id,
-                         "description": self.description, "language": self.language, "embedding_model": self.embedding_model,
-                         "permission": self.permission,
-                         "document_count": self.document_count, "chunk_count": self.chunk_count, "parse_method": self.parse_method,
-                         "parser_config": self.parser_config.to_json()
-                         })
+    def update(self, update_message: dict):
+        res = self.put(f'/dataset/{self.id}',
+                       update_message)
         res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])
+        if res.get("code") != 0:
+            raise Exception(res["message"])

-    def delete(self) -> bool:
-        res = self.rm('/dataset/delete',
-                      {"id": self.id})
-        res = res.json()
-        if res.get("retmsg") == "success": return True
-        raise Exception(res["retmsg"])

     def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
         """
@@ -18,9 +18,9 @@ from typing import List

 import requests

 from .modules.assistant import Assistant
+from .modules.chunk import Chunk
 from .modules.dataset import DataSet
 from .modules.document import Document
-from .modules.chunk import Chunk


 class RAGFlow:
@@ -41,7 +41,11 @@ class RAGFlow:
         return res

     def delete(self, path, params):
-        res = requests.delete(url=self.api_url + path, params=params, headers=self.authorization_header)
+        res = requests.delete(url=self.api_url + path, json=params, headers=self.authorization_header)
         return res

+    def put(self, path, json):
+        res = requests.put(url=self.api_url + path, json=json, headers=self.authorization_header)
+        return res
+
     def create_dataset(self, name: str, avatar: str = "", description: str = "", language: str = "English",
@@ -52,7 +56,7 @@ class RAGFlow:
             parser_config = DataSet.ParserConfig(self, {"chunk_token_count": 128, "layout_recognize": True,
                                                         "delimiter": "\n!?。;!?", "task_page_size": 12})
         parser_config = parser_config.to_json()
-        res = self.post("/dataset/save",
+        res = self.post("/dataset",
                         {"name": name, "avatar": avatar, "description": description, "language": language,
                          "permission": permission,
                          "document_count": document_count, "chunk_count": chunk_count, "parse_method": parse_method,
@@ -60,27 +64,28 @@ class RAGFlow:
                          }
                         )
         res = res.json()
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             return DataSet(self, res["data"])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])
+    def delete_dataset(self, ids: List[str] = None, names: List[str] = None):
+        res = self.delete("/dataset", {"ids": ids, "names": names})
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res["message"])

-    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True) -> \
+    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True,
+                      id: str = None, name: str = None) -> \
             List[DataSet]:
-        res = self.get("/dataset/list", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc})
+        res = self.get("/dataset",
+                       {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
         res = res.json()
         result_list = []
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             for data in res['data']:
                 result_list.append(DataSet(self, data))
             return result_list
-        raise Exception(res["retmsg"])
-
-    def get_dataset(self, id: str = None, name: str = None) -> DataSet:
-        res = self.get("/dataset/detail", {"id": id, "name": name})
-        res = res.json()
-        if res.get("retmsg") == "success":
-            return DataSet(self, res['data'])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])

     def create_assistant(self, name: str = "assistant", avatar: str = "path", knowledgebases: List[DataSet] = [],
                          llm: Assistant.LLM = None, prompt: Assistant.Prompt = None) -> Assistant:
@@ -272,4 +277,3 @@ class RAGFlow:
         except Exception as e:
             print(f"An error occurred during retrieval: {e}")
             raise
@@ -1,4 +1,4 @@
-API_KEY = 'ragflow-k0YzUxMGY4NjY5YTExZWY5MjI5MDI0Mm'
+API_KEY = 'ragflow-NiYmZjNTVjODYwNzExZWZiODEwMDI0Mm'
 HOST_ADDRESS = 'http://127.0.0.1:9380'
@@ -24,9 +24,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("ABC")
         if isinstance(ds, DataSet):
             assert ds.name == "ABC", "Name does not match."
-            ds.name = 'DEF'
-            res = ds.save()
-            assert res is True, f"Failed to update dataset, error: {res}"
+            res = ds.update({"name": "DEF"})
+            assert res is None, f"Failed to update dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"

@@ -38,8 +37,8 @@ class TestDataset(TestSdk):
         ds = rag.create_dataset("MA")
         if isinstance(ds, DataSet):
             assert ds.name == "MA", "Name does not match."
-            res = ds.delete()
-            assert res is True, f"Failed to delete dataset, error: {res}"
+            res = rag.delete_dataset(names=["MA"])
+            assert res is None, f"Failed to delete dataset, error: {res}"
         else:
             assert False, f"Failed to create dataset, error: {ds}"

@@ -52,12 +51,3 @@ class TestDataset(TestSdk):
         assert len(list_datasets) > 0, "Do not exist any dataset"
         for ds in list_datasets:
             assert isinstance(ds, DataSet), "Existence type is not dataset."
-
-    def test_get_detail_dataset_with_success(self):
-        """
-        Test getting a dataset's detail with success
-        """
-        rag = RAGFlow(API_KEY, HOST_ADDRESS)
-        ds = rag.get_dataset(name="God")
-        assert isinstance(ds, DataSet), f"Failed to get dataset, error: {ds}."
-        assert ds.name == "God", "Name does not match"