
Refactor Dataset API (#2783)

### What problem does this PR solve?

Refactor Dataset API

### Type of change

- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Tag: tags/v0.13.0
liuhua committed 1 year ago
Parent commit: cbd7cd7c4d

api/apps/__init__.py (+1, -1)

 sys.modules[module_name] = page
 spec.loader.exec_module(page)
 page_name = getattr(page, 'page_name', page_name)
-url_prefix = f'/api/{API_VERSION}/{page_name}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'
+url_prefix = f'/api/{API_VERSION}' if "/sdk/" in path else f'/{API_VERSION}/{page_name}'

 app.register_blueprint(page.manager, url_prefix=url_prefix)
 return url_prefix
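
The effect of this one-line change: SDK blueprints are now all mounted at the bare `/api/{API_VERSION}` prefix, with each handler declaring its own resource path (e.g. `/dataset` below), instead of every SDK page getting its own `/{page_name}` segment. A standalone sketch of the conditional, with an assumed `API_VERSION` and sample paths, just to illustrate the resulting prefixes:

```python
# Illustration only: mirrors the url_prefix conditional in the hunk above.
# API_VERSION and the sample paths are assumptions for this sketch.
API_VERSION = "v1"

def url_prefix_for(path: str, page_name: str) -> str:
    # SDK modules share one prefix; regular web app pages keep their page name.
    return f"/api/{API_VERSION}" if "/sdk/" in path else f"/{API_VERSION}/{page_name}"

print(url_prefix_for("api/apps/sdk/dataset.py", "dataset"))  # -> /api/v1
print(url_prefix_for("api/apps/kb_app.py", "kb"))            # -> /v1/kb
```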

api/apps/sdk/dataset.py (+128, -171)

 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import get_json_result, token_required, get_data_error_result
+from api.utils.api_utils import get_result, token_required, get_error_data_result

-@manager.route('/save', methods=['POST'])
+@manager.route('/dataset', methods=['POST'])
 @token_required
-def save(tenant_id):
+def create(tenant_id):
     req = request.json
     e, t = TenantService.get_by_id(tenant_id)
-    if "id" not in req:
-        if "tenant_id" in req or "embedding_model" in req:
-            return get_data_error_result(
-                retmsg="Tenant_id or embedding_model must not be provided")
-        if "name" not in req:
-            return get_data_error_result(
-                retmsg="Name is not empty!")
-        req['id'] = get_uuid()
-        req["name"] = req["name"].strip()
-        if req["name"] == "":
-            return get_data_error_result(
-                retmsg="Name is not empty string!")
-        if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
-            return get_data_error_result(
-                retmsg="Duplicated knowledgebase name in creating dataset.")
-        req["tenant_id"] = req['created_by'] = tenant_id
-        req['embedding_model'] = t.embd_id
-        key_mapping = {
-            "chunk_num": "chunk_count",
-            "doc_num": "document_count",
-            "parser_id": "parse_method",
-            "embd_id": "embedding_model"
-        }
-        mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
-        req.update(mapped_keys)
-        if not KnowledgebaseService.save(**req):
-            return get_data_error_result(retmsg="Create dataset error.(Database error)")
-        renamed_data = {}
-        e, k = KnowledgebaseService.get_by_id(req["id"])
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
-        if any(key in req for key in invalid_keys):
-            return get_data_error_result(retmsg="The input parameters are invalid.")
-        if "tenant_id" in req:
-            if req["tenant_id"] != tenant_id:
-                return get_data_error_result(
-                    retmsg="Can't change tenant_id.")
-        if "embedding_model" in req:
-            if req["embedding_model"] != t.embd_id:
-                return get_data_error_result(
-                    retmsg="Can't change embedding_model.")
-            req.pop("embedding_model")
-        if not KnowledgebaseService.query(
-                created_by=tenant_id, id=req["id"]):
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if not req["id"]:
-            return get_data_error_result(
-                retmsg="id can not be empty.")
-        e, kb = KnowledgebaseService.get_by_id(req["id"])
-        if "chunk_count" in req:
-            if req["chunk_count"] != kb.chunk_num:
-                return get_data_error_result(
-                    retmsg="Can't change chunk_count.")
-            req.pop("chunk_count")
-        if "document_count" in req:
-            if req['document_count'] != kb.doc_num:
-                return get_data_error_result(
-                    retmsg="Can't change document_count.")
-            req.pop("document_count")
-        if "parse_method" in req:
-            if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
-                return get_data_error_result(
-                    retmsg="If chunk count is not 0, parse method is not changable.")
-            req['parser_id'] = req.pop('parse_method')
-        if "name" in req:
-            req["name"] = req["name"].strip()
-            if req["name"].lower() != kb.name.lower() \
-                    and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
-                                                       status=StatusEnum.VALID.value)) > 0:
-                return get_data_error_result(
-                    retmsg="Duplicated knowledgebase name in updating dataset.")
-        del req["id"]
-        if not KnowledgebaseService.update_by_id(kb.id, req):
-            return get_data_error_result(retmsg="Update dataset error.(Database error)")
-        return get_json_result(data=True)
+    if "tenant_id" in req or "embedding_model" in req:
+        return get_error_data_result(
+            retmsg="Tenant_id or embedding_model must not be provided")
+    chunk_count = req.get("chunk_count")
+    document_count = req.get("document_count")
+    if chunk_count or document_count:
+        return get_error_data_result(retmsg="chunk_count or document_count must be 0 or not be provided")
+    if "name" not in req:
+        return get_error_data_result(
+            retmsg="Name is not empty!")
+    req['id'] = get_uuid()
+    req["name"] = req["name"].strip()
+    if req["name"] == "":
+        return get_error_data_result(
+            retmsg="Name is not empty string!")
+    if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
+        return get_error_data_result(
+            retmsg="Duplicated knowledgebase name in creating dataset.")
+    req["tenant_id"] = req['created_by'] = tenant_id
+    req['embedding_model'] = t.embd_id
+    key_mapping = {
+        "chunk_num": "chunk_count",
+        "doc_num": "document_count",
+        "parser_id": "parse_method",
+        "embd_id": "embedding_model"
+    }
+    mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
+    req.update(mapped_keys)
+    if not KnowledgebaseService.save(**req):
+        return get_error_data_result(retmsg="Create dataset error.(Database error)")
+    renamed_data = {}
+    e, k = KnowledgebaseService.get_by_id(req["id"])
+    for key, value in k.to_dict().items():
+        new_key = key_mapping.get(key, key)
+        renamed_data[new_key] = value
+    return get_result(data=renamed_data)
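
The `key_mapping` table is what translates internal ORM column names (`chunk_num`, `doc_num`, `parser_id`, `embd_id`) into the field names the public API documents. A standalone sketch of that renaming; the sample record is fabricated:

```python
# Reproduces the renaming loop at the end of create(); db_record is made up.
key_mapping = {
    "chunk_num": "chunk_count",
    "doc_num": "document_count",
    "parser_id": "parse_method",
    "embd_id": "embedding_model",
}

db_record = {"id": "abc123", "name": "test", "chunk_num": 0,
             "doc_num": 0, "parser_id": "naive", "embd_id": "BAAI/bge-large-zh-v1.5"}

renamed = {key_mapping.get(key, key): value for key, value in db_record.items()}
print(renamed)
# {'id': 'abc123', 'name': 'test', 'chunk_count': 0, 'document_count': 0,
#  'parse_method': 'naive', 'embedding_model': 'BAAI/bge-large-zh-v1.5'}
```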
-@manager.route('/delete', methods=['DELETE'])
+@manager.route('/dataset', methods=['DELETE'])
 @token_required
 def delete(tenant_id):
-    req = request.args
-    if "id" not in req:
-        return get_data_error_result(
-            retmsg="id is required")
-    kbs = KnowledgebaseService.query(
-        created_by=tenant_id, id=req["id"])
-    if not kbs:
-        return get_json_result(
-            data=False, retmsg='You do not own the dataset',
-            retcode=RetCode.OPERATING_ERROR)
-    for doc in DocumentService.query(kb_id=req["id"]):
-        if not DocumentService.remove_document(doc, kbs[0].tenant_id):
-            return get_data_error_result(
-                retmsg="Remove document error.(Database error)")
-        f2d = File2DocumentService.get_by_document_id(doc.id)
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
-        File2DocumentService.delete_by_document_id(doc.id)
-    if not KnowledgebaseService.delete_by_id(req["id"]):
-        return get_data_error_result(
-            retmsg="Delete dataset error.(Database error)")
-    return get_json_result(data=True)
+    req = request.json
+    names = req.get("names")
+    ids = req.get("ids")
+    if not ids and not names:
+        return get_error_data_result(
+            retmsg="ids or names is required")
+    id_list = []
+    if names:
+        for name in names:
+            kbs = KnowledgebaseService.query(name=name, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {name}")
+            id_list.append(kbs[0].id)
+    if ids:
+        for id in ids:
+            kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
+            if not kbs:
+                return get_error_data_result(retmsg=f"You don't own the dataset {id}")
+        id_list.extend(ids)
+    for id in id_list:
+        for doc in DocumentService.query(kb_id=id):
+            if not DocumentService.remove_document(doc, tenant_id):
+                return get_error_data_result(
+                    retmsg="Remove document error.(Database error)")
+            f2d = File2DocumentService.get_by_document_id(doc.id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc.id)
+        if not KnowledgebaseService.delete_by_id(id):
+            return get_error_data_result(
+                retmsg="Delete dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
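
Note that the refactored handler reads `request.json`, so the DELETE request must carry a JSON body; the matching SDK change further down switches `requests.delete` from `params=` to `json=` for the same reason. A hedged sketch of calling the endpoint directly (server address and token are placeholders):

```python
import requests

BASE_URL = "http://127.0.0.1:9380"  # placeholder address
HEADERS = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}  # placeholder token

# The body may carry "ids" and/or "names"; at least one is required.
resp = requests.delete(f"{BASE_URL}/api/v1/dataset",
                       json={"names": ["test_1", "test_2"]},
                       headers=HEADERS)
print(resp.json())  # {"code": 0} on success
```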
+@manager.route('/dataset/<dataset_id>', methods=['PUT'])
+@token_required
+def update(tenant_id, dataset_id):
+    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
+        return get_error_data_result(retmsg="You don't own the dataset")
+    req = request.json
+    e, t = TenantService.get_by_id(tenant_id)
+    invalid_keys = {"id", "embd_id", "chunk_num", "doc_num", "parser_id"}
+    if any(key in req for key in invalid_keys):
+        return get_error_data_result(retmsg="The input parameters are invalid.")
+    if "tenant_id" in req:
+        if req["tenant_id"] != tenant_id:
+            return get_error_data_result(
+                retmsg="Can't change tenant_id.")
+    if "embedding_model" in req:
+        if req["embedding_model"] != t.embd_id:
+            return get_error_data_result(
+                retmsg="Can't change embedding_model.")
+        req.pop("embedding_model")
+    e, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if "chunk_count" in req:
+        if req["chunk_count"] != kb.chunk_num:
+            return get_error_data_result(
+                retmsg="Can't change chunk_count.")
+        req.pop("chunk_count")
+    if "document_count" in req:
+        if req['document_count'] != kb.doc_num:
+            return get_error_data_result(
+                retmsg="Can't change document_count.")
+        req.pop("document_count")
+    if "parse_method" in req:
+        if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
+            return get_error_data_result(
+                retmsg="If chunk count is not 0, parse method is not changeable.")
+        req['parser_id'] = req.pop('parse_method')
+    if "name" in req:
+        req["name"] = req["name"].strip()
+        if req["name"].lower() != kb.name.lower() \
+                and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
+                                                   status=StatusEnum.VALID.value)) > 0:
+            return get_error_data_result(
+                retmsg="Duplicated knowledgebase name in updating dataset.")
+    if not KnowledgebaseService.update_by_id(kb.id, req):
+        return get_error_data_result(retmsg="Update dataset error.(Database error)")
+    return get_result(retcode=RetCode.SUCCESS)
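
To make the validation rules concrete, a sketch of two update payloads against this endpoint; the address, dataset id, and token are placeholders:

```python
import requests

BASE_URL = "http://127.0.0.1:9380"  # placeholder address
DATASET_ID = "6e211ee0723611efa10a0242ac120007"  # placeholder id
HEADERS = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}  # placeholder token

# Renaming is allowed, provided the new name is not already taken.
ok = requests.put(f"{BASE_URL}/api/v1/dataset/{DATASET_ID}",
                  json={"name": "renamed"}, headers=HEADERS)
print(ok.json())   # {"code": 0}

# Read-only fields such as chunk_count are rejected unless they match the
# stored value, so this should return an error payload instead.
bad = requests.put(f"{BASE_URL}/api/v1/dataset/{DATASET_ID}",
                   json={"chunk_count": 999}, headers=HEADERS)
print(bad.json())  # e.g. {"code": 102, "message": "Can't change chunk_count."}
```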
-@manager.route('/list', methods=['GET'])
+@manager.route('/dataset', methods=['GET'])
 @token_required
-def list_datasets(tenant_id):
+def list(tenant_id):
+    id = request.args.get("id")
+    name = request.args.get("name")
+    kbs = KnowledgebaseService.query(id=id, name=name, status=1)
+    if not kbs:
+        return get_error_data_result(retmsg="The dataset doesn't exist")
     page_number = int(request.args.get("page", 1))
     items_per_page = int(request.args.get("page_size", 1024))
     orderby = request.args.get("orderby", "create_time")
     desc = bool(request.args.get("desc", True))
     tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
-    kbs = KnowledgebaseService.get_by_tenant_ids(
-        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
+    kbs = KnowledgebaseService.get_list(
+        [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc, id, name)
     renamed_list = []
     for kb in kbs:
         key_mapping = {
             "chunk_num": "chunk_count",
             "doc_num": "document_count",
             "parser_id": "parse_method",
             "embd_id": "embedding_model"
         }
         renamed_data = {}
         for key, value in kb.items():
             new_key = key_mapping.get(key, key)
             renamed_data[new_key] = value
         renamed_list.append(renamed_data)
-    return get_json_result(data=renamed_list)
+    return get_result(data=renamed_list)
-@manager.route('/detail', methods=['GET'])
-@token_required
-def detail(tenant_id):
-    req = request.args
-    key_mapping = {
-        "chunk_num": "chunk_count",
-        "doc_num": "document_count",
-        "parser_id": "parse_method",
-        "embd_id": "embedding_model"
-    }
-    renamed_data = {}
-    if "id" in req:
-        id = req["id"]
-        kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
-        if not kb:
-            return get_json_result(
-                data=False, retmsg='You do not own the dataset.',
-                retcode=RetCode.OPERATING_ERROR)
-        if "name" in req:
-            name = req["name"]
-            if kb[0].name != name:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-        e, k = KnowledgebaseService.get_by_id(id)
-        for key, value in k.to_dict().items():
-            new_key = key_mapping.get(key, key)
-            renamed_data[new_key] = value
-        return get_json_result(data=renamed_data)
-    else:
-        if "name" in req:
-            name = req["name"]
-            e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
-            if not e:
-                return get_json_result(
-                    data=False, retmsg='You do not own the dataset.',
-                    retcode=RetCode.OPERATING_ERROR)
-            for key, value in k.to_dict().items():
-                new_key = key_mapping.get(key, key)
-                renamed_data[new_key] = value
-            return get_json_result(data=renamed_data)
-        else:
-            return get_data_error_result(
-                retmsg="At least one of `id` or `name` must be provided.")

api/db/services/knowledgebase_service.py (+24, -0)

 @DB.connection_context()
 def get_all_ids(cls):
     return [m["id"] for m in cls.model.select(cls.model.id).dicts()]

+@classmethod
+@DB.connection_context()
+def get_list(cls, joined_tenant_ids, user_id,
+             page_number, items_per_page, orderby, desc, id, name):
+    kbs = cls.model.select()
+    if id:
+        kbs = kbs.where(cls.model.id == id)
+    if name:
+        kbs = kbs.where(cls.model.name == name)
+    kbs = kbs.where(
+        ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                        TenantPermission.TEAM.value)) | (
+                cls.model.tenant_id == user_id))
+        & (cls.model.status == StatusEnum.VALID.value)
+    )
+    if desc:
+        kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
+    else:
+        kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
+
+    kbs = kbs.paginate(page_number, items_per_page)
+
+    return list(kbs.dicts())
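
`get_list` first narrows on the optional `id`/`name` filters, then applies the same tenant/permission/status constraints as the existing helpers, then ordering and pagination. A sketch of how the `list` handler above invokes it; the tenant ids and filter values are placeholders, and a configured database connection is assumed:

```python
from api.db.services.knowledgebase_service import KnowledgebaseService

# Placeholder arguments; a working DB connection context is assumed.
rows = KnowledgebaseService.get_list(
    joined_tenant_ids=["tenant_a", "tenant_b"],
    user_id="tenant_a",
    page_number=1,
    items_per_page=1024,
    orderby="create_time",
    desc=True,
    id=None,        # or a dataset id to match exactly one dataset
    name="mysql",   # or None to skip the name filter
)
# Returns a list of dicts, one per knowledge base row visible to the user.
```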

api/http_api.md (+192, -109)



 **POST** `/api/v1/dataset`

-Creates a dataset with a name. If dataset of the same name already exists, the new dataset will be renamed by RAGFlow automatically.
+Creates a dataset.


 ### Request

 - Method: POST
-- URL: `/api/v1/dataset`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 - Body:
-  - `"dataset_name"`: `string`
+  - `"id"`: `string`
+  - `"name"`: `string`
+  - `"avatar"`: `string`
   - `"tenant_id"`: `string`
+  - `"description"`: `string`
+  - `"language"`: `string`
   - `"embedding_model"`: `string`
-  - `"chunk_count"`: `integer`
+  - `"permission"`: `string`
   - `"document_count"`: `integer`
+  - `"chunk_count"`: `integer`
   - `"parse_method"`: `string`
+  - `"parser_config"`: `Dataset.ParserConfig`


 #### Request example

-```shell
+```bash
+# "id": id must not be provided.
+# "name": name is required and can't be duplicated.
+# "tenant_id": tenant_id must not be provided.
+# "embedding_model": embedding_model must not be provided.
+# "naive" means general.
 curl --request POST \
-     --url http://{address}/api/v1/dataset \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
-     --data-binary '{
-        "dataset_name": "test",
-        "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-        "embedding_model": "BAAI/bge--zh-v1.5",
-        "chunk_count": 0,
-        "document_count": 0,
-        "parse_method": "general"
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+        "name": "test",
+        "chunk_count": 0,
+        "document_count": 0,
+        "parse_method": "naive"
      }'
 ```


 #### Request parameters

-- `"dataset_name"`: (*Body parameter*)
-  The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
+- `"id"`: (*Body parameter*)
+  The ID of the created dataset used to uniquely identify different datasets.
+  - If creating a dataset, `id` must not be provided.
+
+- `"name"`: (*Body parameter*)
+  The name of the dataset, which must adhere to the following requirements:
+  - Maximum 65,535 characters.
+  - Required when creating a dataset and must be unique.
+  - If updating a dataset, `name` must still be unique.
+
+- `"avatar"`: (*Body parameter*)
+  Base64 encoding of the avatar.
+
 - `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
+  The ID of the tenant associated with the dataset, used to link it with specific users.
+  - If creating a dataset, `tenant_id` must not be provided.
+  - If updating a dataset, `tenant_id` cannot be changed.
+
+- `"description"`: (*Body parameter*)
+  The description of the dataset.
+
+- `"language"`: (*Body parameter*)
+  The language setting for the dataset.
+
 - `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
+  Embedding model used in the dataset to generate vector embeddings.
+  - If creating a dataset, `embedding_model` must not be provided.
+  - If updating a dataset, `embedding_model` cannot be changed.
+
+- `"permission"`: (*Body parameter*)
+  Specifies who can manipulate the dataset.
+
 - `"document_count"`: (*Body parameter*)
   Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.
+  - If updating a dataset, `document_count` cannot be changed.
+
+- `"chunk_count"`: (*Body parameter*)
+  Chunk count of the dataset.
+  - If updating a dataset, `chunk_count` cannot be changed.
+
+- `"parse_method"`: (*Body parameter*)
+  Parsing method of the dataset.
+  - If updating `parse_method`, `chunk_count` must be greater than 0.
+
+- `"parser_config"`: (*Body parameter*)
+  The configuration settings for the dataset parser.
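
For readers who prefer Python over curl, a minimal sketch of the same create call with `requests`; only `name` is required under the rules above, and the address and token are placeholders:

```python
import requests

BASE_URL = "http://127.0.0.1:9380"  # placeholder address
HEADERS = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}  # placeholder token

# id, tenant_id, and embedding_model must be omitted when creating.
resp = requests.post(f"{BASE_URL}/api/v1/dataset",
                     json={"name": "test"}, headers=HEADERS)
body = resp.json()
print(body["code"], body["data"]["id"])  # 0 and the new dataset's id
```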


 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 0
+    "code": 0,
+    "data": {
+        "avatar": null,
+        "chunk_count": 0,
+        "create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "create_time": 1728539857641,
+        "created_by": "69736c5e723611efb51b0242ac120007",
+        "description": null,
+        "document_count": 0,
+        "embedding_model": "BAAI/bge-large-zh-v1.5",
+        "id": "8d73076886cc11ef8c270242ac120006",
+        "language": "English",
+        "name": "test_1",
+        "parse_method": "naive",
+        "parser_config": {
+            "pages": [
+                [
+                    1,
+                    1000000
+                ]
+            ]
+        },
+        "permission": "me",
+        "similarity_threshold": 0.2,
+        "status": "1",
+        "tenant_id": "69736c5e723611efb51b0242ac120007",
+        "token_num": 0,
+        "update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
+        "update_time": 1728539857641,
+        "vector_similarity_weight": 0.3
+    }
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't connect database"
+    "code": 102,
+    "message": "Duplicated knowledgebase name in creating dataset."
 }
 ```




 **DELETE** `/api/v1/dataset`

-Deletes a dataset by its id or name.
+Deletes datasets by ids or names.

 ### Request

 - Method: DELETE
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body:
+  - `"names"`: `List[string]`
+  - `"ids"`: `List[string]`

 #### Request example

-```shell
+```bash
+# Either ids or names must be provided, but not both.
 curl --request DELETE \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data ' {
-     "names": ["ds1", "ds2"]
-     }'
+     --url http://{address}/api/v1/dataset \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+     "names": ["test_1", "test_2"]
+     }'
 ```

 #### Request parameters

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Try to delete non-existent dataset."
+    "code": 102,
+    "message": "You don't own the dataset."
 }
 ```


 ### Request

 - Method: PUT
-- URL: `/api/v1/dataset/{dataset_id}`
+- URL: `http://{address}/api/v1/dataset/{dataset_id}`
 - Headers:
   - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+- Body: (Refer to the "Create Dataset" for the complete structure of the request body.)

 #### Request example

-```shell
+```bash
+# "id": id is required.
+# "name": If you update name, it can't be duplicated.
+# "tenant_id": If you update tenant_id, it can't be changed.
+# "embedding_model": If you update embedding_model, it can't be changed.
+# "chunk_count": If you update chunk_count, it can't be changed.
+# "document_count": If you update document_count, it can't be changed.
+# "parse_method": If you update parse_method, chunk_count must be 0.
+# "naive" means general.
 curl --request PUT \
-     --url http://{address}/api/v1/dataset/0 \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
-     --data-binary '{
-        "dataset_name": "test",
-        "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-        "embedding_model": "BAAI/bge--zh-v1.5",
-        "chunk_count": 0,
-        "document_count": 0,
-        "parse_method": "general"
+     --url http://{address}/api/v1/dataset/{dataset_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
+     --data '{
+        "name": "test",
+        "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
+        "embedding_model": "BAAI/bge-zh-v1.5",
+        "chunk_count": 0,
+        "document_count": 0,
+        "parse_method": "naive"
      }'
 ```

 #### Request parameters

-- `"dataset_name"`: (*Body parameter*)
-  The name of the dataset, which must adhere to the following requirements:
-  - Maximum 65,535 characters.
-- `"tenant_id"`: (*Body parameter*)
-  The ID of the tenant.
-- `"embedding_model"`: (*Body parameter*)
-  Embedding model used in the dataset.
-- `"chunk_count"`: (*Body parameter*)
-  Chunk count of the dataset.
-- `"document_count"`: (*Body parameter*)
-  Document count of the dataset.
-- `"parse_mehtod"`: (*Body parameter*)
-  Parsing method of the dataset.
+(Refer to the "Create Dataset" for the complete structure of the request parameters.)

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't change embedding model since some files already use it."
+    "code": 102,
+    "message": "Can't change tenant_id."
 }
 ```


 ## List datasets

-**GET** `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+**GET** `/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`

 Lists all datasets.

 ### Request

 - Method: GET
-- URL: `/api/v1/dataset?name={name}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}`
+- URL: `http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}`
 - Headers:
-  - `content-Type: application/json`
   - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

 #### Request example

-```shell
+```bash
+# If no page parameter is passed, the default is 1.
+# If no page_size parameter is passed, the default is 1024.
+# If no orderby parameter is passed, the default is "create_time".
+# If no desc parameter is passed, the default is true.
 curl --request GET \
-     --url http://{address}/api/v1/dataset?page=0&page_size=50&orderby=create_time&desc=false \
-     --header 'Content-Type: application/json' \
-     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
+     --url http://{address}/api/v1/dataset?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id} \
+     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
 ```

 #### Request parameters

   A boolean flag indicating whether the sorting should be in descending order.
 - `name`: (*Path parameter*)
   Dataset name
+- `"id"`: (*Path parameter*)
+  The ID of the dataset to be retrieved.
+- `"name"`: (*Path parameter*)
+  The name of the dataset to be retrieved.

 ### Response

 The successful response includes a JSON object like the following:

-```shell
+```json
 {
     "code": 0,
     "data": [
         {
-            "avatar": "",
-            "chunk_count": 0,
-            "create_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "create_time": 1724901187843,
-            "created_by": "4fb0cd625f9311efba4a0242ac120006",
-            "description": "",
-            "document_count": 0,
-            "embedding_model": "BAAI/bge-large-zh-v1.5",
-            "id": "9d3d906665b411ef87d10242ac120006",
-            "language": "English",
-            "name": "Test",
-            "parser_config": {
-                "chunk_token_count": 128,
-                "delimiter": "\n!?。;!?",
-                "layout_recognize": true,
-                "task_page_size": 12
-            },
-            "parse_method": "naive",
-            "permission": "me",
-            "similarity_threshold": 0.2,
-            "status": "1",
-            "tenant_id": "4fb0cd625f9311efba4a0242ac120006",
-            "token_count": 0,
-            "update_date": "Thu, 29 Aug 2024 03:13:07 GMT",
-            "update_time": 1724901187843,
-            "vector_similarity_weight": 0.3
+            "avatar": "",
+            "chunk_count": 59,
+            "create_date": "Sat, 14 Sep 2024 01:12:37 GMT",
+            "create_time": 1726276357324,
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "description": null,
+            "document_count": 1,
+            "embedding_model": "BAAI/bge-large-zh-v1.5",
+            "id": "6e211ee0723611efa10a0242ac120007",
+            "language": "English",
+            "name": "mysql",
+            "parse_method": "knowledge_graph",
+            "parser_config": {
+                "chunk_token_num": 8192,
+                "delimiter": "\\n!?;。;!?",
+                "entity_types": [
+                    "organization",
+                    "person",
+                    "location",
+                    "event",
+                    "time"
+                ]
+            },
+            "permission": "me",
+            "similarity_threshold": 0.2,
+            "status": "1",
+            "tenant_id": "69736c5e723611efb51b0242ac120007",
+            "token_num": 12744,
+            "update_date": "Thu, 10 Oct 2024 04:07:23 GMT",
+            "update_time": 1728533243536,
+            "vector_similarity_weight": 0.3
         }
-    ],
+    ]
 }
 ```

 The error response includes a JSON object like the following:

-```shell
+```json
 {
-    "code": 3016,
-    "message": "Can't access database to get the dataset list."
+    "code": 102,
+    "message": "The dataset doesn't exist"
 }
 ```
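
The same listing call from Python, passing the filters as query parameters (placeholder address and token as above):

```python
import requests

BASE_URL = "http://127.0.0.1:9380"  # placeholder address
HEADERS = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}  # placeholder token

params = {"page": 1, "page_size": 50, "orderby": "create_time",
          "desc": "true", "name": "mysql"}  # id and name are optional filters
resp = requests.get(f"{BASE_URL}/api/v1/dataset", params=params, headers=HEADERS)
for ds in resp.json().get("data", []):
    print(ds["id"], ds["name"], ds["chunk_count"])
```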



api/python_api_reference.md (+40, -64)



 #### avatar: `str`

-The url or ???????????????????????? path to the avatar image associated with the created dataset. Defaults to `""`
+Base64 encoding of the avatar. Defaults to `""`

-#### tenant_id: `str` ?????????????????
+#### tenant_id: `str`

 The id of the tenant associated with the created dataset is used to identify different users. Defaults to `None`.

 The language setting of the created dataset. Defaults to `"English"`. ????????????

-#### embedding_model: `str` ????????????????
+#### embedding_model: `str`

-The specific model or algorithm used by the dataset to generate vector embeddings. Defaults to `""`.
+The specific model used by the dataset to generate vector embeddings. Defaults to `""`.

 - If creating a dataset, embedding_model must not be provided.
 - If updating a dataset, embedding_model can't be changed.

 The configuration settings for the parser used by the dataset.

 ### Returns

-- Success: An `infinity.local_infinity.table.LocalTable` object in Python module mode or an `infinity.remote_thrift.table.RemoteTable` object in client-server mode.
-- Failure: `InfinityException`
-  - `error_code`: `int` A non-zero value indicating a specific error condition.
-  - `error_msg`: `str` A message providing additional details about the error.
+```python
+DataSet
+description: dataset object
+```

 ### Examples

 ```python

 ---


-## Delete knowledge base
+## Delete knowledge bases

 ```python
-DataSet.delete() -> bool
+RAGFlow.delete_dataset(ids: List[str] = None, names: List[str] = None)
 ```

-Deletes a knowledge base.
+Deletes knowledge bases.
+
+### Parameters
+
+#### ids: `List[str]`
+
+The ids of the datasets to be deleted.
+
+#### names: `List[str]`
+
+The names of the datasets to be deleted.
+
+Either `ids` or `names` must be provided, but not both.

 ### Returns

-`bool`
-
-description:the case of updating an dateset, `True` or `False`.
+```python
+no return
+```

 ### Examples

 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.create_dataset(name="kb_1")
-ds.delete()
+rag.delete_dataset(names=["name_1","name_2"])
+rag.delete_dataset(ids=["id_1","id_2"])
 ```


 ---

     page: int = 1,
     page_size: int = 1024,
     orderby: str = "create_time",
-    desc: bool = True
+    desc: bool = True,
+    id: str = None,
+    name: str = None
 ) -> List[DataSet]
 ```

 Whether the sorting should be in descending order. Defaults to `True`.

+#### id: `str`
+
+The id of the dataset to be got. Defaults to `None`.
+
+#### name: `str`
+
+The name of the dataset to be got. Defaults to `None`.

-### Returns
-
-```python
-List[DataSet]
-description:the list of datasets.
-```
-
-### Examples
-
-```python
-from ragflow import RAGFlow
-
-rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-for ds in rag.list_datasets():
-    print(ds)
-```
-
----
-
-## Retrieve knowledge base
-
-```python
-RAGFlow.get_dataset(
-    id: str = None,
-    name: str = None
-) -> DataSet
-```
-
-Retrieves a knowledge base by name.
-
-### Parameters
-
-#### name: `str`
-
-The name of the dataset to be got. If `id` is not provided, `name` is required.
-
-#### id: `str`
-
-The id of the dataset to be got. If `name` is not provided, `id` is required.

 ### Returns

 ```python
-DataSet
-description: dataset object
+List[DataSet]
+description: the list of datasets.
 ```

 ### Examples

 from ragflow import RAGFlow

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
-ds = rag.get_dataset(name="ragflow")
-print(ds)
+for ds in rag.list_datasets():
+    print(ds)
 ```


 ---

-## Save knowledge base configurations
+## Update knowledge base

 ```python
-DataSet.save() -> bool
+DataSet.update(update_message: dict)
 ```

 ### Returns

 ```python
-bool
-description: the case of updating a dataset, True or False.
+no return
 ```

 ### Examples

 rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
 ds = rag.get_dataset(name="kb_1")
-ds.parse_method = "manual"
-ds.save()
+ds.update({"parse_method": "manual", ...})
 ```

 ---

api/utils/api_utils.py (+29, -0)

             return func(*args, **kwargs)

         return decorated_function

+
+def get_result(retcode=RetCode.SUCCESS, retmsg='error', data=None):
+    if retcode == 0:
+        if data is not None:
+            response = {"code": retcode, "data": data}
+        else:
+            response = {"code": retcode}
+    else:
+        response = {"code": retcode, "message": retmsg}
+    return jsonify(response)
+
+
+def get_error_data_result(retcode=RetCode.DATA_ERROR,
+                          retmsg='Sorry! Data missing!'):
+    import re
+    result_dict = {
+        "code": retcode,
+        "message": re.sub(
+            r"rag",
+            "seceum",
+            retmsg,
+            flags=re.IGNORECASE)}
+    response = {}
+    for key, value in result_dict.items():
+        if value is None and key != "code":
+            continue
+        else:
+            response[key] = value
+    return jsonify(response)
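
Together these two helpers define the response envelope used by the refactored endpoints: `code` plus optional `data` on success, `code` plus `message` on failure. One quirk worth noting is the case-insensitive `rag` -> `seceum` substitution applied to every error message; a standalone reproduction (the message text is made up):

```python
import re

# Mirrors the substitution inside get_error_data_result; retmsg is fabricated.
retmsg = "RAGFlow can't find the dataset."
print(re.sub(r"rag", "seceum", retmsg, flags=re.IGNORECASE))
# -> "seceumFlow can't find the dataset."
```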


sdk/python/ragflow/modules/base.py (+4, -0)

     res = self.rag.delete(path, params)
     return res

+def put(self, path, json):
+    res = self.rag.put(path, json)
+    return res
+
 def __str__(self):
     return str(self.to_json())

sdk/python/ragflow/modules/dataset.py (+5, -16)

         res_dict.pop(k)
     super().__init__(rag, res_dict)

-def save(self) -> bool:
-    res = self.post('/dataset/save',
-                    {"id": self.id, "name": self.name, "avatar": self.avatar, "tenant_id": self.tenant_id,
-                     "description": self.description, "language": self.language, "embedding_model": self.embedding_model,
-                     "permission": self.permission,
-                     "document_count": self.document_count, "chunk_count": self.chunk_count, "parse_method": self.parse_method,
-                     "parser_config": self.parser_config.to_json()
-                     })
+def update(self, update_message: dict):
+    res = self.put(f'/dataset/{self.id}', update_message)
     res = res.json()
-    if res.get("retmsg") == "success": return True
-    raise Exception(res["retmsg"])
-
-def delete(self) -> bool:
-    res = self.rm('/dataset/delete',
-                  {"id": self.id})
-    res = res.json()
-    if res.get("retmsg") == "success": return True
-    raise Exception(res["retmsg"])
+    if res.get("code") != 0:
+        raise Exception(res["message"])

 def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
     """

sdk/python/ragflow/ragflow.py (+21, -17)

 import requests

 from .modules.assistant import Assistant
+from .modules.chunk import Chunk
 from .modules.dataset import DataSet
 from .modules.document import Document
-from .modules.chunk import Chunk


 class RAGFlow:

         return res

     def delete(self, path, params):
-        res = requests.delete(url=self.api_url + path, params=params, headers=self.authorization_header)
+        res = requests.delete(url=self.api_url + path, json=params, headers=self.authorization_header)
+        return res
+
+    def put(self, path, json):
+        res = requests.put(url=self.api_url + path, json=json, headers=self.authorization_header)
         return res

     def create_dataset(self, name: str, avatar: str = "", description: str = "", language: str = "English",

         parser_config = DataSet.ParserConfig(self, {"chunk_token_count": 128, "layout_recognize": True,
                                                     "delimiter": "\n!?。;!?", "task_page_size": 12})
         parser_config = parser_config.to_json()
-        res = self.post("/dataset/save",
+        res = self.post("/dataset",
                         {"name": name, "avatar": avatar, "description": description, "language": language,
                          "permission": permission,
                          "document_count": document_count, "chunk_count": chunk_count, "parse_method": parse_method,
                          }
                         )
         res = res.json()
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             return DataSet(self, res["data"])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])
+
+    def delete_dataset(self, ids: List[str] = None, names: List[str] = None):
+        res = self.delete("/dataset", {"ids": ids, "names": names})
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res["message"])

-    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True) -> \
+    def list_datasets(self, page: int = 1, page_size: int = 1024, orderby: str = "create_time", desc: bool = True,
+                      id: str = None, name: str = None) -> \
             List[DataSet]:
-        res = self.get("/dataset/list", {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc})
+        res = self.get("/dataset",
+                       {"page": page, "page_size": page_size, "orderby": orderby, "desc": desc, "id": id, "name": name})
         res = res.json()
         result_list = []
-        if res.get("retmsg") == "success":
+        if res.get("code") == 0:
             for data in res['data']:
                 result_list.append(DataSet(self, data))
             return result_list
-        raise Exception(res["retmsg"])
-
-    def get_dataset(self, id: str = None, name: str = None) -> DataSet:
-        res = self.get("/dataset/detail", {"id": id, "name": name})
-        res = res.json()
-        if res.get("retmsg") == "success":
-            return DataSet(self, res['data'])
-        raise Exception(res["retmsg"])
+        raise Exception(res["message"])

     def create_assistant(self, name: str = "assistant", avatar: str = "path", knowledgebases: List[DataSet] = [],
                          llm: Assistant.LLM = None, prompt: Assistant.Prompt = None) -> Assistant:

         except Exception as e:
             print(f"An error occurred during retrieval: {e}")
             raise


sdk/python/test/common.py (+1, -1)





-API_KEY = 'ragflow-k0YzUxMGY4NjY5YTExZWY5MjI5MDI0Mm'
+API_KEY = 'ragflow-NiYmZjNTVjODYwNzExZWZiODEwMDI0Mm'
 HOST_ADDRESS = 'http://127.0.0.1:9380'

sdk/python/test/t_dataset.py (+4, -14)

ds = rag.create_dataset("ABC") ds = rag.create_dataset("ABC")
if isinstance(ds, DataSet): if isinstance(ds, DataSet):
assert ds.name == "ABC", "Name does not match." assert ds.name == "ABC", "Name does not match."
ds.name = 'DEF'
res = ds.save()
assert res is True, f"Failed to update dataset, error: {res}"
res = ds.update({"name":"DEF"})
assert res is None, f"Failed to update dataset, error: {res}"
else: else:
assert False, f"Failed to create dataset, error: {ds}" assert False, f"Failed to create dataset, error: {ds}"
ds = rag.create_dataset("MA") ds = rag.create_dataset("MA")
if isinstance(ds, DataSet): if isinstance(ds, DataSet):
assert ds.name == "MA", "Name does not match." assert ds.name == "MA", "Name does not match."
res = ds.delete()
assert res is True, f"Failed to delete dataset, error: {res}"
res = rag.delete_dataset(names=["MA"])
assert res is None, f"Failed to delete dataset, error: {res}"
else: else:
assert False, f"Failed to create dataset, error: {ds}" assert False, f"Failed to create dataset, error: {ds}"
assert len(list_datasets) > 0, "Do not exist any dataset" assert len(list_datasets) > 0, "Do not exist any dataset"
for ds in list_datasets: for ds in list_datasets:
assert isinstance(ds, DataSet), "Existence type is not dataset." assert isinstance(ds, DataSet), "Existence type is not dataset."
def test_get_detail_dataset_with_success(self):
"""
Test getting a dataset's detail with success
"""
rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.get_dataset(name="God")
assert isinstance(ds, DataSet), f"Failed to get dataset, error: {ds}."
assert ds.name == "God", "Name does not match"
