### What problem does this PR solve? API: Adds the feature of uploading documents. ### Type of change - [x] New Feature (non-breaking change which adds functionality). Tag: v0.8.0
| @@ -14,36 +14,23 @@ | |||
| # limitations under the License. | |||
| import json | |||
| import os | |||
| import re | |||
| from datetime import datetime, timedelta | |||
| from flask import request, Response | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from httpx import HTTPError | |||
| from api.db import FileType, ParserType, FileSource, StatusEnum | |||
| from api.db.db_models import APIToken, API4Conversation, Task, File | |||
| from api.contants import NAME_LENGTH_LIMIT | |||
| from api.db import FileSource, StatusEnum | |||
| from api.db.db_models import File | |||
| from api.db.services import duplicate_name | |||
| from api.db.services.api_service import APITokenService, API4ConversationService | |||
| from api.db.services.dialog_service import DialogService, chat | |||
| from api.db.services.document_service import DocumentService | |||
| from api.db.services.file2document_service import File2DocumentService | |||
| from api.db.services.file_service import FileService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.db.services.task_service import queue_tasks, TaskService | |||
| from api.db.services.user_service import UserTenantService, TenantService | |||
| from api.settings import RetCode, retrievaler | |||
| from api.utils import get_uuid, current_timestamp, datetime_format | |||
| # from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request | |||
| from itsdangerous import URLSafeTimedSerializer | |||
| from api.utils.file_utils import filename_type, thumbnail | |||
| from rag.utils.minio_conn import MINIO | |||
| # import library | |||
| from api.db.services.user_service import TenantService | |||
| from api.settings import RetCode | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request | |||
| from api.contants import NAME_LENGTH_LIMIT | |||
| # ------------------------------ create a dataset --------------------------------------- | |||
| @@ -0,0 +1,172 @@ | |||
| # | |||
| # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License | |||
| # | |||
| import os | |||
| import re | |||
| import warnings | |||
| from flask import request | |||
| from flask_login import login_required, current_user | |||
| from api.db import FileType, ParserType | |||
| from api.db.services import duplicate_name | |||
| from api.db.services.document_service import DocumentService | |||
| from api.db.services.file_service import FileService | |||
| from api.db.services.knowledgebase_service import KnowledgebaseService | |||
| from api.settings import RetCode | |||
| from api.utils import get_uuid | |||
| from api.utils.api_utils import construct_json_result | |||
| from api.utils.file_utils import filename_type, thumbnail | |||
| from rag.utils.minio_conn import MINIO | |||
| MAXIMUM_OF_UPLOADING_FILES = 256 | |||
| # ----------------------------upload local files----------------------------------------------------- | |||
@manager.route('/<dataset_id>', methods=['POST'])
@login_required
def upload(dataset_id):
    """Upload one or more local files into the dataset identified by *dataset_id*.

    Expects multipart form data under the 'file' key. Performs a validation
    pass over every file first (name present, not a remote URL), then stores
    each file in MinIO and registers a document record for it.

    Returns:
        A JSON result: SUCCESS with data=True when every file was stored,
        ARGUMENT_ERROR / DATA_ERROR for validation failures, or SERVER_ERROR
        with the accumulated per-file error messages.
    """
    # no files at all in the request
    if not request.files:
        return construct_json_result(
            message='There is no file!', code=RetCode.ARGUMENT_ERROR)

    # reject requests that exceed the per-request file-count limit
    file_objs = request.files.getlist('file')
    num_file_objs = len(file_objs)
    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")

    # validation pass: every file needs a name and must be a local upload.
    # NOTE: this pass consumes each file's stream via read(); the storage
    # loop below must rewind before reading again.
    for file_obj in file_objs:
        file_content = file_obj.read()
        file_name = file_obj.filename
        if not file_name:
            return construct_json_result(
                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
        # TODO: support the remote files
        if 'http' in file_name:
            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
        # empty content is allowed but worth flagging
        if file_content == b'':
            warnings.warn(f"[WARNING]: The file {file_name} is empty.")

    # the dataset must already exist
    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
    if not exist:
        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)

    # file-management bookkeeping: mirror the upload inside the user's
    # '.knowledgebase' folder so the documents appear in file management
    root_folder = FileService.get_root_folder(current_user.id)
    parent_file_id = root_folder["id"]  # id of the user's root folder
    # for a new user, create the '.knowledgebase' folder first
    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    # link the file management entry to the kb folder
    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])

    err = []  # collect per-file failures so one bad file does not abort the rest
    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
    for file in file_objs:
        try:
            # TODO: get this value from the database as some tenants have this limit while others don't
            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="Exceed the maximum file number of a free user!")
            # de-duplicate the document name within the dataset
            filename = duplicate_name(
                DocumentService.query,
                name=file.filename,
                kb_id=dataset.id)
            # reject unsupported file types
            filetype = filename_type(filename)
            if filetype == FileType.OTHER.value:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="This type of file has not been supported yet!")
            # pick a MinIO object key that does not collide within the bucket
            location = filename
            while MINIO.obj_exist(dataset_id, location):
                location += "_"
            # FIX: the validation pass above already consumed the stream;
            # without rewinding, read() here would return b'' and every
            # stored blob would be empty
            file.seek(0)
            blob = file.read()
            MINIO.put(dataset_id, location, blob)
            doc = {
                "id": get_uuid(),
                "kb_id": dataset.id,
                "parser_id": dataset.parser_id,
                "parser_config": dataset.parser_config,
                "created_by": current_user.id,
                "type": filetype,
                "name": filename,
                "location": location,
                "size": len(blob),
                "thumbnail": thumbnail(filename, blob)
            }
            # FIX: filename_type() yields the enum *value* (see the
            # FileType.OTHER.value check above), so compare against .value —
            # comparing to the enum member never matched
            if doc["type"] == FileType.VISUAL.value:
                doc["parser_id"] = ParserType.PICTURE.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value
            DocumentService.insert(doc)
            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
        except Exception as e:
            err.append(file.filename + ": " + str(e))

    if err:
        # return all the accumulated errors in one response
        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
    # success
    return construct_json_result(data=True, code=RetCode.SUCCESS)
| # ----------------------------upload online files------------------------------------------------ | |||
| # ----------------------------download a file----------------------------------------------------- | |||
| # ----------------------------delete a file----------------------------------------------------- | |||
| # ----------------------------enable rename----------------------------------------------------- | |||
| # ----------------------------list files----------------------------------------------------- | |||
| # ----------------------------start parsing----------------------------------------------------- | |||
| # ----------------------------stop parsing----------------------------------------------------- | |||
| # ----------------------------show the status of the file----------------------------------------------------- | |||
| # ----------------------------list the chunks of the file----------------------------------------------------- | |||
| # ----------------------------delete the chunk----------------------------------------------------- | |||
| # ----------------------------edit the status of the chunk----------------------------------------------------- | |||
| # ----------------------------insert a new chunk----------------------------------------------------- | |||
| # ----------------------------upload a file----------------------------------------------------- | |||
| # ----------------------------get a specific chunk----------------------------------------------------- | |||
| # ----------------------------retrieval test----------------------------------------------------- | |||
| @@ -13,9 +13,12 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import json | |||
| import os | |||
| import requests | |||
| import json | |||
| from api.settings import RetCode | |||
| class RAGFlow: | |||
| @@ -23,10 +26,12 @@ class RAGFlow: | |||
| ''' | |||
| api_url: http://<host_address>/api/v1 | |||
| dataset_url: http://<host_address>/api/v1/dataset | |||
| document_url: http://<host_address>/api/v1/documents | |||
| ''' | |||
| self.user_key = user_key | |||
| self.api_url = f"{base_url}/api/{version}" | |||
| self.dataset_url = f"{self.api_url}/dataset" | |||
| self.document_url = f"{self.api_url}/documents" | |||
| self.authorization_header = {"Authorization": "{}".format(self.user_key)} | |||
| def create_dataset(self, dataset_name): | |||
| @@ -73,3 +78,54 @@ class RAGFlow: | |||
| endpoint = f"{self.dataset_url}/{dataset_id}" | |||
| response = requests.put(endpoint, json=params, headers=self.authorization_header) | |||
| return response.json() | |||
| # -------------------- content management ----------------------------------------------------- | |||
| # ----------------------------upload local files----------------------------------------------------- | |||
| def upload_local_file(self, dataset_id, file_paths): | |||
| files = [] | |||
| for file_path in file_paths: | |||
| if not isinstance(file_path, str): | |||
| return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not string."} | |||
| if 'http' in file_path: | |||
| return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files have not unsupported."} | |||
| if os.path.isfile(file_path): | |||
| files.append(('file', open(file_path, 'rb'))) | |||
| else: | |||
| return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"} | |||
| res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files, | |||
| headers=self.authorization_header) | |||
| result_dict = json.loads(res.text) | |||
| return result_dict | |||
| # ----------------------------upload remote files----------------------------------------------------- | |||
| # ----------------------------download a file----------------------------------------------------- | |||
| # ----------------------------delete a file----------------------------------------------------- | |||
| # ----------------------------enable rename----------------------------------------------------- | |||
| # ----------------------------list files----------------------------------------------------- | |||
| # ----------------------------start parsing----------------------------------------------------- | |||
| # ----------------------------stop parsing----------------------------------------------------- | |||
| # ----------------------------show the status of the file----------------------------------------------------- | |||
| # ----------------------------list the chunks of the file----------------------------------------------------- | |||
| # ----------------------------delete the chunk----------------------------------------------------- | |||
| # ----------------------------edit the status of the chunk----------------------------------------------------- | |||
| # ----------------------------insert a new chunk----------------------------------------------------- | |||
| # ----------------------------upload a file----------------------------------------------------- | |||
| # ----------------------------get a specific chunk----------------------------------------------------- | |||
| # ----------------------------retrieval test----------------------------------------------------- | |||
| @@ -0,0 +1,2 @@ | |||
| hhh | |||
| hhh | |||
| @@ -0,0 +1,3 @@ | |||
| test | |||
| test | |||
| test | |||
| @@ -0,0 +1,2 @@ | |||
| test1 | |||
| test1 | |||
| @@ -0,0 +1,180 @@ | |||
| from api.settings import RetCode | |||
| from test_sdkbase import TestSdk | |||
| from ragflow import RAGFlow | |||
| import pytest | |||
| from common import API_KEY, HOST_ADDRESS | |||
| from api.contants import NAME_LENGTH_LIMIT | |||
class TestFile(TestSdk):
    """
    This class contains a suite of tests for the content management functionality within the dataset.
    It ensures that the following functionalities work as expected:
    1. upload local files
    2. upload remote files
    3. download a file
    4. delete a file
    5. enable rename
    6. list files
    7. start parsing
    8. end parsing
    9. check the status of the file
    10. list the chunks
    11. delete a chunk
    12. insert a new chunk
    13. edit the status of chunk
    14. get the specific chunk
    15. retrieval test
    """

    # ----------------------------upload local files-----------------------------------------------------
    def test_upload_two_files(self):
        """
        Test uploading two files with success.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_two_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'

    def test_upload_one_file(self):
        """
        Test uploading one file with success.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_one_file")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'

    def test_upload_nonexistent_files(self):
        """
        Test uploading a file which does not exist.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_nonexistent_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/imagination.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.DATA_ERROR and "does not exist" in res['message']

    def test_upload_file_if_dataset_does_not_exist(self):
        """
        Test uploading files if the dataset id does not exist.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file("111", file_paths)
        assert res['code'] == RetCode.DATA_ERROR and res['message'] == "Can't find this dataset"

    def test_upload_file_without_name(self):
        """
        Test uploading a file whose basename is empty (dotfile); the server accepts it.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_file_without_name")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS

    def test_upload_file_without_name1(self):
        """
        Test uploading a nameless file together with an empty file; the server accepts both.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        # FIX: use this test's own dataset name — the original reused
        # "test_upload_file_without_name", colliding with the previous test
        created_res = ragflow.create_dataset("test_upload_file_without_name1")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/.txt", "test_data/empty.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS

    def test_upload_files_exceeding_the_number_limit(self):
        """
        Test uploading files whose number exceeds the limit.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_exceeding_the_number_limit")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt"] * 256
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] ==
                'You try to upload 512 files, which exceeds the maximum number of uploading files: 256'
                and res['code'] == RetCode.DATA_ERROR)

    def test_upload_files_without_files(self):
        """
        Test uploading with a non-string (None) entry in the path list.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_without_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = [None]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'None is not string.' and res['code'] == RetCode.ARGUMENT_ERROR)

    def test_upload_files_with_two_files_with_same_name(self):
        """
        Test uploading two files with the same name; the server de-duplicates.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_two_files_with_same_name")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['test_data/test.txt'] * 2
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'success' and res['code'] == RetCode.SUCCESS)

    def test_upload_files_with_file_paths(self):
        """
        Test uploading with a directory path instead of a file path.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_file_paths")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['test_data/']
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'The file test_data/ does not exist' and res['code'] == RetCode.DATA_ERROR)

    def test_upload_files_with_remote_file_path(self):
        """
        Test uploading with a remote (http) file path, which is rejected.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_remote_file_path")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['https://github.com/genostack/ragflow']
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'
| # ----------------------------upload remote files----------------------------------------------------- | |||
| # ----------------------------download a file----------------------------------------------------- | |||
| # ----------------------------delete a file----------------------------------------------------- | |||
| # ----------------------------enable rename----------------------------------------------------- | |||
| # ----------------------------list files----------------------------------------------------- | |||
| # ----------------------------start parsing----------------------------------------------------- | |||
| # ----------------------------stop parsing----------------------------------------------------- | |||
| # ----------------------------show the status of the file----------------------------------------------------- | |||
| # ----------------------------list the chunks of the file----------------------------------------------------- | |||
| # ----------------------------delete the chunk----------------------------------------------------- | |||
| # ----------------------------edit the status of the chunk----------------------------------------------------- | |||
| # ----------------------------insert a new chunk----------------------------------------------------- | |||
| # ----------------------------upload a file----------------------------------------------------- | |||
| # ----------------------------get a specific chunk----------------------------------------------------- | |||
| # ----------------------------retrieval test----------------------------------------------------- | |||