### What problem does this PR solve?

API: Adds the document upload feature.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
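For context, here is a minimal end-to-end sketch of the feature using the SDK changes below. `API_KEY` and `HOST_ADDRESS` are placeholders for a valid API token and a running server, matching the names the test suite imports from its `common` module:

```python
from ragflow import RAGFlow

API_KEY = "<your_api_token>"          # placeholder
HOST_ADDRESS = "http://127.0.0.1:80"  # placeholder server address

ragflow = RAGFlow(API_KEY, HOST_ADDRESS)

# Create a dataset, then upload local files into it.
created_res = ragflow.create_dataset("demo_dataset")
dataset_id = created_res['data']['dataset_id']
res = ragflow.upload_local_file(dataset_id, ["test_data/test.txt"])

# On success the server returns code RetCode.SUCCESS, data True, message 'success'.
print(res['code'], res['data'], res['message'])
```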
In the existing dataset API module, the import block is slimmed down to what the file actually uses. The resulting imports:

```python
# limitations under the License.
from flask import request
from flask_login import login_required, current_user
from httpx import HTTPError

from api.db import FileSource, StatusEnum
from api.db.db_models import File
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.user_service import TenantService
from api.settings import RetCode
from api.utils import get_uuid
from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
from api.contants import NAME_LENGTH_LIMIT

# ------------------------------ create a dataset ---------------------------------------
```
A new document API module is added. The `manager` route object is injected by the app loader, and the endpoint is mounted under `/api/v1/documents` (see the SDK's `document_url` further down):

```python
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import os
import re
import warnings

from flask import request
from flask_login import login_required, current_user

from api.db import FileType, ParserType
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import RetCode
from api.utils import get_uuid
from api.utils.api_utils import construct_json_result
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO

MAXIMUM_OF_UPLOADING_FILES = 256


# ----------------------------upload local files-----------------------------------------------------
# 'manager' is the module-level route object injected by the web-server loader.
@manager.route('/<dataset_id>', methods=['POST'])
@login_required
def upload(dataset_id):
    # no files
    if not request.files:
        return construct_json_result(
            message='There is no file!', code=RetCode.ARGUMENT_ERROR)

    # the number of uploading files exceeds the limit
    file_objs = request.files.getlist('file')
    num_file_objs = len(file_objs)
    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")

    for file_obj in file_objs:
        # the content of the file
        file_content = file_obj.read()
        file_name = file_obj.filename
        # no name
        if not file_name:
            return construct_json_result(
                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
        # TODO: support remote files
        if 'http' in file_name:
            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")
        # the content is empty, raising a warning
        if file_content == b'':
            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
        # rewind so the upload loop below can re-read the content
        file_obj.seek(0)

    # no dataset
    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
    if not exist:
        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)

    # get the root_folder
    root_folder = FileService.get_root_folder(current_user.id)
    # get the id of the root_folder
    parent_file_id = root_folder["id"]  # document id
    # this is for the new user, create the '.knowledgebase' file
    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
    # go inside this folder, get the kb_root_folder
    kb_root_folder = FileService.get_kb_folder(current_user.id)
    # link the file management to the kb_folder
    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])

    # grab all the errs
    err = []
    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
    for file in file_objs:
        try:
            # TODO: get this value from the database as some tenants have this limit while others don't
            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="Exceed the maximum file number of a free user!")
            # deal with the duplicate name
            filename = duplicate_name(
                DocumentService.query,
                name=file.filename,
                kb_id=dataset.id)
            # deal with the unsupported type
            filetype = filename_type(filename)
            if filetype == FileType.OTHER.value:
                return construct_json_result(code=RetCode.DATA_ERROR,
                                             message="This type of file has not been supported yet!")
            # upload to MinIO
            location = filename
            while MINIO.obj_exist(dataset_id, location):
                location += "_"
            blob = file.read()
            MINIO.put(dataset_id, location, blob)
            doc = {
                "id": get_uuid(),
                "kb_id": dataset.id,
                "parser_id": dataset.parser_id,
                "parser_config": dataset.parser_config,
                "created_by": current_user.id,
                "type": filetype,
                "name": filename,
                "location": location,
                "size": len(blob),
                "thumbnail": thumbnail(filename, blob)
            }
            if doc["type"] == FileType.VISUAL:
                doc["parser_id"] = ParserType.PICTURE.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value
            DocumentService.insert(doc)
            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
        except Exception as e:
            err.append(file.filename + ": " + str(e))

    if err:
        # return all the errors
        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
    # success
    return construct_json_result(data=True, code=RetCode.SUCCESS)

# ----------------------------upload online files------------------------------------------------
# ----------------------------download a file-----------------------------------------------------
# ----------------------------delete a file-----------------------------------------------------
# ----------------------------enable rename-----------------------------------------------------
# ----------------------------list files-----------------------------------------------------
# ----------------------------start parsing-----------------------------------------------------
# ----------------------------stop parsing-----------------------------------------------------
# ----------------------------show the status of the file-----------------------------------------------------
# ----------------------------list the chunks of the file-----------------------------------------------------
# ----------------------------delete the chunk-----------------------------------------------------
# ----------------------------edit the status of the chunk-----------------------------------------------------
# ----------------------------insert a new chunk-----------------------------------------------------
# ----------------------------upload a file-----------------------------------------------------
# ----------------------------get a specific chunk-----------------------------------------------------
# ----------------------------retrieval test-----------------------------------------------------
```
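The endpoint can also be exercised without the SDK. A sketch using `requests` directly (the `/api/v1/documents` prefix is inferred from the SDK's `document_url` below; host, token, and dataset id are placeholders):

```python
import requests

HOST_ADDRESS = "http://127.0.0.1:80"  # placeholder server address
API_KEY = "<your_api_token>"          # placeholder token
dataset_id = "<dataset_id>"           # placeholder dataset id

# The route reads request.files.getlist('file'), so the multipart field name must be 'file'.
with open("test_data/test.txt", "rb") as fh:
    res = requests.post(
        f"{HOST_ADDRESS}/api/v1/documents/{dataset_id}",
        files=[("file", fh)],
        headers={"Authorization": API_KEY},  # raw token, as the SDK sends it
    )

print(res.json())  # {'code': RetCode.SUCCESS, 'data': True, 'message': 'success'} on success
```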
On the SDK side, the client gains a `document_url` and an `upload_local_file` method (the `__init__` signature is reconstructed from the fields it sets; unchanged dataset methods are elided):

```python
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os

import requests

from api.settings import RetCode


class RAGFlow:
    def __init__(self, user_key, base_url, version='v1'):
        '''
        api_url: http://<host_address>/api/v1
        dataset_url: http://<host_address>/api/v1/dataset
        document_url: http://<host_address>/api/v1/documents
        '''
        self.user_key = user_key
        self.api_url = f"{base_url}/api/{version}"
        self.dataset_url = f"{self.api_url}/dataset"
        self.document_url = f"{self.api_url}/documents"
        self.authorization_header = {"Authorization": "{}".format(self.user_key)}

    def create_dataset(self, dataset_name):
        ...

    # (other unchanged dataset methods, e.g. the PUT-based update, are elided)

    # -------------------- content management -----------------------------------------------------

    # ----------------------------upload local files-----------------------------------------------------
    def upload_local_file(self, dataset_id, file_paths):
        files = []
        for file_path in file_paths:
            if not isinstance(file_path, str):
                return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not string."}
            if 'http' in file_path:
                return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files are not supported yet."}
            if os.path.isfile(file_path):
                files.append(('file', open(file_path, 'rb')))
            else:
                return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
        res = requests.post(url=f"{self.document_url}/{dataset_id}", files=files,
                            headers=self.authorization_header)
        return json.loads(res.text)

    # ----------------------------upload remote files-----------------------------------------------------
    # ----------------------------download a file-----------------------------------------------------
    # ----------------------------delete a file-----------------------------------------------------
    # ----------------------------enable rename-----------------------------------------------------
    # ----------------------------list files-----------------------------------------------------
    # ----------------------------start parsing-----------------------------------------------------
    # ----------------------------stop parsing-----------------------------------------------------
    # ----------------------------show the status of the file-----------------------------------------------------
    # ----------------------------list the chunks of the file-----------------------------------------------------
    # ----------------------------delete the chunk-----------------------------------------------------
    # ----------------------------edit the status of the chunk-----------------------------------------------------
    # ----------------------------insert a new chunk-----------------------------------------------------
    # ----------------------------upload a file-----------------------------------------------------
    # ----------------------------get a specific chunk-----------------------------------------------------
    # ----------------------------retrieval test-----------------------------------------------------
```
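One sharp edge in `upload_local_file`: the handles opened for the multipart body are never closed, and an early return can leak handles opened for earlier paths. A possible hardening (a sketch, not part of this PR; `upload_local_file_safe` is a hypothetical name) validates every path first and closes all handles with `contextlib.ExitStack`:

```python
import json
import os
from contextlib import ExitStack

import requests

from api.settings import RetCode


def upload_local_file_safe(self, dataset_id, file_paths):
    # Validate every path before opening anything, so nothing can leak.
    for file_path in file_paths:
        if not isinstance(file_path, str):
            return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not string."}
        if 'http' in file_path:
            return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files are not supported yet."}
        if not os.path.isfile(file_path):
            return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
    # ExitStack closes every opened handle, even if the request raises.
    with ExitStack() as stack:
        files = [('file', stack.enter_context(open(path, 'rb'))) for path in file_paths]
        res = requests.post(f"{self.document_url}/{dataset_id}", files=files,
                            headers=self.authorization_header)
    return json.loads(res.text)
```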
New test fixtures (each file's content shown once):

- a fixture containing `hhh` (plausibly `test_data/.txt`, the extension-only name the tests use)
- `test_data/test.txt` containing `test`
- `test_data/test1.txt` containing `test1`
And the SDK test suite for the new upload path:

```python
from api.settings import RetCode
from test_sdkbase import TestSdk
from ragflow import RAGFlow
import pytest
from common import API_KEY, HOST_ADDRESS
from api.contants import NAME_LENGTH_LIMIT


class TestFile(TestSdk):
    """
    This class contains a suite of tests for the content-management functionality within a dataset.
    It ensures that the following functionalities work as expected:
    1. upload local files
    2. upload remote files
    3. download a file
    4. delete a file
    5. enable rename
    6. list files
    7. start parsing
    8. end parsing
    9. check the status of the file
    10. list the chunks
    11. delete a chunk
    12. insert a new chunk
    13. edit the status of a chunk
    14. get a specific chunk
    15. retrieval test
    """

    # ----------------------------upload local files-----------------------------------------------------
    def test_upload_two_files(self):
        """
        Test uploading two files successfully.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_two_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'

    def test_upload_one_file(self):
        """
        Test uploading one file successfully.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_one_file")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'

    def test_upload_nonexistent_files(self):
        """
        Test uploading a file that does not exist.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_nonexistent_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/imagination.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.DATA_ERROR and "does not exist" in res['message']

    def test_upload_file_if_dataset_does_not_exist(self):
        """
        Test uploading files when the dataset id does not exist.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        file_paths = ["test_data/test.txt"]
        res = ragflow.upload_local_file("111", file_paths)
        assert res['code'] == RetCode.DATA_ERROR and res['message'] == "Can't find this dataset"

    def test_upload_file_without_name(self):
        """
        Test uploading a file that has no name (extension only).
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_file_without_name")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS

    def test_upload_file_without_name1(self):
        """
        Test uploading a file that has no name together with an empty file.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_file_without_name")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/.txt", "test_data/empty.txt"]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.SUCCESS

    def test_upload_files_exceeding_the_number_limit(self):
        """
        Test uploading more files than the limit allows.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_exceeding_the_number_limit")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ["test_data/test.txt", "test_data/test1.txt"] * 256
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] ==
                'You try to upload 512 files, which exceeds the maximum number of uploading files: 256'
                and res['code'] == RetCode.DATA_ERROR)

    def test_upload_files_without_files(self):
        """
        Test uploading with a None entry instead of a file path.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_without_files")
        dataset_id = created_res['data']['dataset_id']
        file_paths = [None]
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'None is not string.' and res['code'] == RetCode.ARGUMENT_ERROR)

    def test_upload_files_with_two_files_with_same_name(self):
        """
        Test uploading two files with the same name.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_two_files_with_same_name")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['test_data/test.txt'] * 2
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'success' and res['code'] == RetCode.SUCCESS)

    def test_upload_files_with_file_paths(self):
        """
        Test uploading with a directory path instead of a file path.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_file_paths")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['test_data/']
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert (res['message'] == 'The file test_data/ does not exist' and res['code'] == RetCode.DATA_ERROR)

    def test_upload_files_with_remote_file_path(self):
        """
        Test uploading with a remote URL, which is not supported yet.
        """
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_upload_files_with_remote_file_path")
        dataset_id = created_res['data']['dataset_id']
        file_paths = ['https://github.com/genostack/ragflow']
        res = ragflow.upload_local_file(dataset_id, file_paths)
        assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files are not supported yet.'

    # ----------------------------upload remote files-----------------------------------------------------
    # ----------------------------download a file-----------------------------------------------------
    # ----------------------------delete a file-----------------------------------------------------
    # ----------------------------enable rename-----------------------------------------------------
    # ----------------------------list files-----------------------------------------------------
    # ----------------------------start parsing-----------------------------------------------------
    # ----------------------------stop parsing-----------------------------------------------------
    # ----------------------------show the status of the file-----------------------------------------------------
    # ----------------------------list the chunks of the file-----------------------------------------------------
    # ----------------------------delete the chunk-----------------------------------------------------
    # ----------------------------edit the status of the chunk-----------------------------------------------------
    # ----------------------------insert a new chunk-----------------------------------------------------
    # ----------------------------upload a file-----------------------------------------------------
    # ----------------------------get a specific chunk-----------------------------------------------------
    # ----------------------------retrieval test-----------------------------------------------------
```
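The suite assumes the `test_data/` fixtures exist on disk. A conftest-style sketch (hypothetical, not part of this PR) that creates them once per session; the content of `test_data/.txt` is an assumption:

```python
# conftest.py (hypothetical): create the fixture files the suite expects.
import os

import pytest

FIXTURES = {
    "test_data/test.txt": "test",
    "test_data/test1.txt": "test1",
    "test_data/.txt": "hhh",    # extension-only name; content assumed
    "test_data/empty.txt": "",  # intentionally empty
}


@pytest.fixture(scope="session", autouse=True)
def create_test_data():
    # Create test_data/ and any missing fixture files before tests run.
    os.makedirs("test_data", exist_ok=True)
    for path, content in FIXTURES.items():
        if not os.path.exists(path):
            with open(path, "w") as fh:
                fh.write(content)
```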