### What problem does this PR solve?

Adds the API for listing the documents in a dataset.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
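For a quick sense of the new endpoint, here is a minimal sketch of calling it directly over HTTP with `requests`; the host address, API key, and dataset id are placeholders, and the response shape follows `construct_json_result` (a `code` plus a `data` object carrying `total` and `docs`).

```
import requests

# Placeholder values for illustration; substitute your own deployment details.
BASE_URL = "http://127.0.0.1:9380/api/v1"
API_KEY = "<your_api_key>"
DATASET_ID = "<dataset_id>"

# List the documents of a dataset: newest first, ten per page, optionally filtered by keyword.
response = requests.get(
    f"{BASE_URL}/dataset/{DATASET_ID}/documents/",
    params={"offset": 0, "count": 10, "order_by": "create_time", "descend": True, "keywords": ""},
    headers={"Authorization": API_KEY},
)
result = response.json()
print(result["data"]["total"], [doc["name"] for doc in result["data"]["docs"]])
```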
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import warnings

from flask import request
from flask_login import login_required, current_user
from httpx import HTTPError

from api.contants import NAME_LENGTH_LIMIT
from api.db import FileType, ParserType, FileSource, StatusEnum
from api.db.db_models import File
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.user_service import TenantService
from api.settings import RetCode
from api.utils import get_uuid
from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO

MAXIMUM_OF_UPLOADING_FILES = 256
# ------------------------------ create a dataset ---------------------------------------
        return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
    except Exception as e:
        return construct_error_response(e)
# -------------------------------- content management ----------------------------------------------------

# ----------------------------upload files-----------------------------------------------------
@manager.route('/<dataset_id>/documents/', methods=['POST'])
@login_required
def upload_documents(dataset_id):
    # no files
    if not request.files:
        return construct_json_result(
            message='There is no file!', code=RetCode.ARGUMENT_ERROR)

    # the number of uploading files exceeds the limit
    file_objs = request.files.getlist('file')
    num_file_objs = len(file_objs)

    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")

    for file_obj in file_objs:
        # the content of the file
        file_content = file_obj.read()
        file_name = file_obj.filename
        # no name
        if not file_name:
            return construct_json_result(
                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)

        # TODO: support remote files
        if 'http' in file_name:
            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")

        # the content is empty, raising a warning
        if file_content == b'':
            warnings.warn(f"[WARNING]: The file {file_name} is empty.")

        # rewind the stream so the file can be read again when it is actually uploaded below
        file_obj.seek(0)

    # no dataset
    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
    if not exist:
        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)

    # get the root_folder
    root_folder = FileService.get_root_folder(current_user.id)
    # get the id of the root_folder
    parent_file_id = root_folder["id"]  # document id
    # this is for the new user, create '.knowledgebase' file
    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
    # go inside this folder, get the kb_root_folder
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    # link the file management to the kb_folder
    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])

    # grab all the errs
    err = []
    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
    uploaded_docs_json = []
    for file in file_objs:
        try:
            # TODO: get this value from the database as some tenants have this limit while others don't
            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="Exceed the maximum file number of a free user!")

            # deal with the duplicate name
            filename = duplicate_name(
                DocumentService.query,
                name=file.filename,
                kb_id=dataset.id)

            # deal with the unsupported type
            filetype = filename_type(filename)
            if filetype == FileType.OTHER.value:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="This type of file has not been supported yet!")

            # upload to minio
            location = filename
            while MINIO.obj_exist(dataset_id, location):
                location += "_"
            blob = file.read()
            MINIO.put(dataset_id, location, blob)

            doc = {
                "id": get_uuid(),
                "kb_id": dataset.id,
                "parser_id": dataset.parser_id,
                "parser_config": dataset.parser_config,
                "created_by": current_user.id,
                "type": filetype,
                "name": filename,
                "location": location,
                "size": len(blob),
                "thumbnail": thumbnail(filename, blob)
            }
            if doc["type"] == FileType.VISUAL:
                doc["parser_id"] = ParserType.PICTURE.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value

            DocumentService.insert(doc)
            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
            uploaded_docs_json.append(doc)
        except Exception as e:
            err.append(file.filename + ": " + str(e))

    if err:
        # return all the errors
        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
    # success
    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
# ----------------------------delete a file-----------------------------------------------------
@manager.route('/<dataset_id>/documents/<document_id>', methods=['DELETE'])
@login_required
def delete_document(document_id, dataset_id):
    # get the root folder
    root_folder = FileService.get_root_folder(current_user.id)
    # parent file's id
    parent_file_id = root_folder["id"]
    # consider the new user
    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
    # store all the errors that may occur
    errors = ""
    try:
        # whether this document exists
        exist, doc = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
        # whether this tenant is authorized to operate on this document
        tenant_id = DocumentService.get_tenant_id(document_id)
        if not tenant_id:
            return construct_json_result(
                message=f"You cannot delete this document {document_id} due to the authorization"
                        f" reason!", code=RetCode.AUTHENTICATION_ERROR)

        # get the doc's location
        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)

        if real_dataset_id != dataset_id:
            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)

        # there is an issue when removing
        if not DocumentService.remove_document(doc, tenant_id):
            return construct_json_result(
                message="There was an error during the document removal process. Please check the status of the "
                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)

        # fetch the File2Document record associated with the provided document ID
        file_to_doc = File2DocumentService.get_by_document_id(document_id)
        # delete the associated File record
        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
        # delete the File2Document record itself using the document ID; this removes the
        # association between the document and the file after the File record has been deleted
        File2DocumentService.delete_by_document_id(document_id)

        # delete it from minio
        MINIO.rm(dataset_id, location)
    except Exception as e:
        errors += str(e)

    if errors:
        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)

    return construct_json_result(data=True, code=RetCode.SUCCESS)
# ----------------------------list files-----------------------------------------------------
@manager.route('/<dataset_id>/documents/', methods=['GET'])
@login_required
def list_documents(dataset_id):
    if not dataset_id:
        return construct_json_result(
            data=False, message='Lack of "dataset_id"', code=RetCode.ARGUMENT_ERROR)

    # searching keywords
    keywords = request.args.get("keywords", "")

    offset = request.args.get("offset", 0)
    count = request.args.get("count", -1)
    order_by = request.args.get("order_by", "create_time")
    # query-string values arrive as strings, so keep the default consistent with the comparison in the service layer
    descend = request.args.get("descend", "True")

    try:
        docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by,
                                                                descend, keywords)
        return construct_json_result(data={"total": total, "docs": docs}, code=RetCode.SUCCESS)
    except Exception as e:
        return construct_error_response(e)
# ----------------------------download a file-----------------------------------------------------
# ----------------------------enable rename-----------------------------------------------------
# ----------------------------start parsing-----------------------------------------------------
# ----------------------------stop parsing-----------------------------------------------------
# ----------------------------show the status of the file-----------------------------------------------------
# ----------------------------list the chunks of the file-----------------------------------------------------
# ----------------------------delete the chunk-----------------------------------------------------
# ----------------------------edit the status of the chunk-----------------------------------------------------
# ----------------------------insert a new chunk-----------------------------------------------------
# ----------------------------upload a file-----------------------------------------------------
# ----------------------------get a specific chunk-----------------------------------------------------
# ----------------------------retrieval test-----------------------------------------------------
        return list(docs.dicts()), count

    @classmethod
    @DB.connection_context()
    def list_documents_in_dataset(cls, dataset_id, offset, count, order_by, descend, keywords):
        if keywords:
            docs = cls.model.select().where(
                (cls.model.kb_id == dataset_id),
                (fn.LOWER(cls.model.name).contains(keywords.lower()))
            )
        else:
            docs = cls.model.select().where(cls.model.kb_id == dataset_id)

        total = docs.count()

        if descend == 'True':
            docs = docs.order_by(cls.model.getter_by(order_by).desc())
        if descend == 'False':
            docs = docs.order_by(cls.model.getter_by(order_by).asc())

        docs = list(docs.dicts())
        docs_length = len(docs)

        if offset < 0 or offset > docs_length:
            raise IndexError("Offset is out of the valid range.")

        if count == -1:
            return docs[offset:], total

        return docs[offset:offset + count], total

    @classmethod
    @DB.connection_context()
    def insert(cls, doc):
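To make the pagination contract of `list_documents_in_dataset` concrete, here is a small standalone sketch of the same offset/count slicing applied after the query; the document list is made up for illustration.

```
# Standalone sketch of the offset/count semantics used above; the docs list is illustrative only.
docs = [{"name": f"doc_{i}.txt"} for i in range(5)]

def slice_docs(docs, offset, count):
    if offset < 0 or offset > len(docs):
        raise IndexError("Offset is out of the valid range.")
    if count == -1:
        # -1 means "everything from offset onwards"
        return docs[offset:]
    return docs[offset:offset + count]

print(slice_docs(docs, 2, -1))   # the last three documents
print(slice_docs(docs, 1, 2))    # doc_1 and doc_2
```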
        if offset < 0 or offset > kbs_length:
            raise IndexError("Offset is out of the valid range.")

        if count == -1:
            return kbs[offset:]

        return kbs[offset:offset+count]

    @classmethod
| "code": 102, | "code": 102, | ||||
| "message": "Please input at least one parameter that you want to update!" | "message": "Please input at least one parameter that you want to update!" | ||||
| } | } | ||||
| ``` | |||||
| ``` | |||||
| ''' | ''' | ||||
| api_url: http://<host_address>/api/v1 | api_url: http://<host_address>/api/v1 | ||||
| dataset_url: http://<host_address>/api/v1/dataset | dataset_url: http://<host_address>/api/v1/dataset | ||||
| document_url: http://<host_address>/api/v1/documents | |||||
| document_url: http://<host_address>/api/v1/dataset/{dataset_id}/documents | |||||
| ''' | ''' | ||||
| self.user_key = user_key | self.user_key = user_key | ||||
| self.api_url = f"{base_url}/api/{version}" | self.api_url = f"{base_url}/api/{version}" | ||||
| self.dataset_url = f"{self.api_url}/dataset" | self.dataset_url = f"{self.api_url}/dataset" | ||||
| self.document_url = f"{self.api_url}/documents" | |||||
| self.authorization_header = {"Authorization": "{}".format(self.user_key)} | self.authorization_header = {"Authorization": "{}".format(self.user_key)} | ||||
    def create_dataset(self, dataset_name):
        response = requests.put(endpoint, json=params, headers=self.authorization_header)
        return response.json()

    # -------------------- content management -----------------------------------------------------

    # ----------------------------upload local files-----------------------------------------------------
    def upload_local_file(self, dataset_id, file_paths):
            else:
                return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
        res = requests.request('POST', url=f"{self.dataset_url}/{dataset_id}/documents", files=files,
                               headers=self.authorization_header)
        result_dict = json.loads(res.text)

    # ----------------------------delete a file-----------------------------------------------------
    def delete_files(self, document_id, dataset_id):
        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
        res = requests.delete(endpoint, headers=self.authorization_header)
        return res.json()
    # ----------------------------list files-----------------------------------------------------
    def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", descend=True, keywords=""):
        params = {
            "offset": offset,
            "count": count,
            "order_by": order_by,
            "descend": descend,
            "keywords": keywords
        }
        endpoint = f"{self.dataset_url}/{dataset_id}/documents/"
        res = requests.get(endpoint, params=params, headers=self.authorization_header)
        return res.json()
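For context, a minimal sketch of how `list_files` might be used end to end; the host address, API key, and file path are placeholders, and the import path is an assumption about the SDK layout rather than a confirmed package name.

```
from ragflow.ragflow import RAGFlow  # import path is an assumption; adjust to your SDK layout

ragflow = RAGFlow("<API_KEY>", "http://127.0.0.1:9380")
created = ragflow.create_dataset("demo_dataset")
dataset_id = created['data']['dataset_id']
ragflow.upload_local_file(dataset_id, ["test_data/test.txt"])

# Newest documents first, ten per page, filtered by a keyword.
listing = ragflow.list_files(dataset_id, offset=0, count=10, order_by="create_time",
                             descend=True, keywords="test")
print(listing['data']['total'], [doc['name'] for doc in listing['data']['docs']])
```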
    # ----------------------------download a file-----------------------------------------------------
    # ----------------------------enable rename-----------------------------------------------------
    # ----------------------------start parsing-----------------------------------------------------
    # ----------------------------stop parsing-----------------------------------------------------
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'

    def test_upload_one_file(self):
        """
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'

    def test_upload_nonexistent_files(self):
        """
        assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.')
    # ----------------------------list files-----------------------------------------------------
    def test_list_documents_with_success(self):
        """
        Test listing documents with a successful outcome.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # upload a document
        created_res = ragflow.create_dataset("test_list_documents_with_success")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1

    def test_list_documents_with_checking_size(self):
        """
        Test listing documents and verify the number of returned documents.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # upload 10 documents
        created_res = ragflow.create_dataset("test_list_documents_with_checking_size")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"] * 10
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10

    def test_list_documents_with_getting_empty_result(self):
        """
        Test listing documents in an empty dataset.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # upload 0 documents
        created_res = ragflow.create_dataset("test_list_documents_with_getting_empty_result")
        created_res_id = created_res['data']['dataset_id']
        # call the list_files method
        response = ragflow.list_files(created_res_id)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 0

    def test_list_documents_with_creating_100_documents(self):
        """
        Test listing 100 documents and verify the number of returned documents.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # upload 100 documents
        created_res = ragflow.create_dataset("test_list_documents_with_creating_100_documents")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"] * 100
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 100

    def test_list_document_with_failure(self):
        """
        Test listing documents with an out-of-range offset, which should surface an IndexError.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_list_document_with_failure")
        created_res_id = created_res['data']['dataset_id']
        response = ragflow.list_files(created_res_id, offset=-1, count=-1)
        assert "IndexError" in response['message'] and response['code'] == RetCode.EXCEPTION_ERROR

    def test_list_document_with_verifying_offset_and_count(self):
        """
        Test listing documents to verify the offset and count parameters.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_list_document_with_verifying_offset_and_count")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/empty.txt"] * 10
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id, offset=2, count=10)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10

    def test_list_document_with_verifying_keywords(self):
        """
        Test listing documents to verify keyword searching.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_list_document_with_verifying_keywords")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id, keywords="empty")
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1

    def test_list_document_with_verifying_order_by_and_descend(self):
        """
        Test listing documents to verify the order_by and descend parameters.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_descend")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 2
        docs = response['data']['docs']
        # documents are returned newest first, i.e. in reverse upload order
        i = 1
        for doc in docs:
            assert doc['name'] in file_paths[i]
            i -= 1

    def test_list_document_with_verifying_order_by_and_ascend(self):
        """
        Test listing documents to verify the order_by and ascend parameters.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_ascend")
        created_res_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
        ragflow.upload_local_file(created_res_id, file_paths)
        # call the list_files method
        response = ragflow.list_files(created_res_id, descend=False)
        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 3
        docs = response['data']['docs']
        # documents are returned oldest first, i.e. in upload order
        i = 0
        for doc in docs:
            assert doc['name'] in file_paths[i]
            i += 1

# TODO: set a limit on the number of documents
# ----------------------------download a file-----------------------------------------------------
# ----------------------------enable rename-----------------------------------------------------
# ----------------------------start parsing-----------------------------------------------------
# ----------------------------stop parsing-----------------------------------------------------
# ----------------------------insert a new chunk-----------------------------------------------------
# ----------------------------upload a file-----------------------------------------------------
# ----------------------------get a specific chunk-----------------------------------------------------
# ----------------------------retrieval test-----------------------------------------------------