| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474 | 
							- #
 - #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 - #
 - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - import functools
 - import json
 - import logging
 - import random
 - import time
 - from base64 import b64encode
 - from functools import wraps
 - from hmac import HMAC
 - from io import BytesIO
 - from urllib.parse import quote, urlencode
 - from uuid import uuid1
 - 
 - import requests
 - from flask import (
 -     Response,
 -     jsonify,
 -     make_response,
 -     send_file,
 - )
 - from flask import (
 -     request as flask_request,
 - )
 - from itsdangerous import URLSafeTimedSerializer
 - from werkzeug.http import HTTP_STATUS_CODES
 - 
 - from api import settings
 - from api.constants import REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC
 - from api.db.db_models import APIToken
 - from api.utils import CustomJSONEncoder, get_uuid, json_dumps
 - 
 - # Replace the JSON encoder hook used by `requests` so that JSON dumps performed
 - # through `requests.models.complexjson` go through the project's CustomJSONEncoder.
 - requests.models.complexjson.dumps = functools.partial(json.dumps, cls=CustomJSONEncoder)
 - 
 - 
 - def request(**kwargs):
 -     sess = requests.Session()
 -     stream = kwargs.pop("stream", sess.stream)
 -     timeout = kwargs.pop("timeout", None)
 -     kwargs["headers"] = {k.replace("_", "-").upper(): v for k, v in kwargs.get("headers", {}).items()}
 -     prepped = requests.Request(**kwargs).prepare()
 - 
 -     if settings.CLIENT_AUTHENTICATION and settings.HTTP_APP_KEY and settings.SECRET_KEY:
 -         timestamp = str(round(time() * 1000))
 -         nonce = str(uuid1())
 -         signature = b64encode(
 -             HMAC(
 -                 settings.SECRET_KEY.encode("ascii"),
 -                 b"\n".join(
 -                     [
 -                         timestamp.encode("ascii"),
 -                         nonce.encode("ascii"),
 -                         settings.HTTP_APP_KEY.encode("ascii"),
 -                         prepped.path_url.encode("ascii"),
 -                         prepped.body if kwargs.get("json") else b"",
 -                         urlencode(sorted(kwargs["data"].items()), quote_via=quote, safe="-._~").encode("ascii") if kwargs.get("data") and isinstance(kwargs["data"], dict) else b"",
 -                     ]
 -                 ),
 -                 "sha1",
 -             ).digest()
 -         ).decode("ascii")
 - 
 -         prepped.headers.update(
 -             {
 -                 "TIMESTAMP": timestamp,
 -                 "NONCE": nonce,
 -                 "APP-KEY": settings.HTTP_APP_KEY,
 -                 "SIGNATURE": signature,
 -             }
 -         )
 - 
 -     return sess.send(prepped, stream=stream, timeout=timeout)
 - 
 - 
 - def get_exponential_backoff_interval(retries, full_jitter=False):
 -     """Calculate the exponential backoff wait time."""
 -     # Will be zero if factor equals 0
 -     countdown = min(REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC * (2**retries))
 -     # Full jitter according to
 -     # https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
 -     if full_jitter:
 -         countdown = random.randrange(countdown + 1)
 -     # Adjust according to maximum wait time and account for negative values.
 -     return max(0, countdown)
 - 
 - 
 - def get_data_error_result(code=settings.RetCode.DATA_ERROR, message="Sorry! Data missing!"):
 -     logging.exception(Exception(message))
 -     result_dict = {"code": code, "message": message}
 -     response = {}
 -     for key, value in result_dict.items():
 -         if value is None and key != "code":
 -             continue
 -         else:
 -             response[key] = value
 -     return jsonify(response)
 - 
 - 
 - def server_error_response(e):
 -     logging.exception(e)
 -     try:
 -         if e.code == 401:
 -             return get_json_result(code=401, message=repr(e))
 -     except BaseException:
 -         pass
 -     if len(e.args) > 1:
 -         return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
 -     if repr(e).find("index_not_found_exception") >= 0:
 -         return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message="No chunk found, please upload file and parse it.")
 - 
 -     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
 - 
 - 
 - def error_response(response_code, message=None):
 -     if message is None:
 -         message = HTTP_STATUS_CODES.get(response_code, "Unknown Error")
 - 
 -     return Response(
 -         json.dumps(
 -             {
 -                 "message": message,
 -                 "code": response_code,
 -             }
 -         ),
 -         status=response_code,
 -         mimetype="application/json",
 -     )
 - 
 - 
 - def validate_request(*args, **kwargs):
 -     def wrapper(func):
 -         @wraps(func)
 -         def decorated_function(*_args, **_kwargs):
 -             input_arguments = flask_request.json or flask_request.form.to_dict()
 -             no_arguments = []
 -             error_arguments = []
 -             for arg in args:
 -                 if arg not in input_arguments:
 -                     no_arguments.append(arg)
 -             for k, v in kwargs.items():
 -                 config_value = input_arguments.get(k, None)
 -                 if config_value is None:
 -                     no_arguments.append(k)
 -                 elif isinstance(v, (tuple, list)):
 -                     if config_value not in v:
 -                         error_arguments.append((k, set(v)))
 -                 elif config_value != v:
 -                     error_arguments.append((k, v))
 -             if no_arguments or error_arguments:
 -                 error_string = ""
 -                 if no_arguments:
 -                     error_string += "required argument are missing: {}; ".format(",".join(no_arguments))
 -                 if error_arguments:
 -                     error_string += "required argument values: {}".format(",".join(["{}={}".format(a[0], a[1]) for a in error_arguments]))
 -                 return get_json_result(code=settings.RetCode.ARGUMENT_ERROR, message=error_string)
 -             return func(*_args, **_kwargs)
 - 
 -         return decorated_function
 - 
 -     return wrapper
 - 
 - 
 - def not_allowed_parameters(*params):
 -     def decorator(f):
 -         def wrapper(*args, **kwargs):
 -             input_arguments = flask_request.json or flask_request.form.to_dict()
 -             for param in params:
 -                 if param in input_arguments:
 -                     return get_json_result(code=settings.RetCode.ARGUMENT_ERROR, message=f"Parameter {param} isn't allowed")
 -             return f(*args, **kwargs)
 - 
 -         return wrapper
 - 
 -     return decorator
 - 
 - 
 - def is_localhost(ip):
 -     return ip in {"127.0.0.1", "::1", "[::1]", "localhost"}
 - 
 - 
 - def send_file_in_mem(data, filename):
 -     if not isinstance(data, (str, bytes)):
 -         data = json_dumps(data)
 -     if isinstance(data, str):
 -         data = data.encode("utf-8")
 - 
 -     f = BytesIO()
 -     f.write(data)
 -     f.seek(0)
 - 
 -     return send_file(f, as_attachment=True, attachment_filename=filename)
 - 
 - 
 - def get_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
 -     response = {"code": code, "message": message, "data": data}
 -     return jsonify(response)
 - 
 - 
 - def apikey_required(func):
 -     @wraps(func)
 -     def decorated_function(*args, **kwargs):
 -         token = flask_request.headers.get("Authorization").split()[1]
 -         objs = APIToken.query(token=token)
 -         if not objs:
 -             return build_error_result(message="API-KEY is invalid!", code=settings.RetCode.FORBIDDEN)
 -         kwargs["tenant_id"] = objs[0].tenant_id
 -         return func(*args, **kwargs)
 - 
 -     return decorated_function
 - 
 - 
 - def build_error_result(code=settings.RetCode.FORBIDDEN, message="success"):
 -     response = {"code": code, "message": message}
 -     response = jsonify(response)
 -     response.status_code = code
 -     return response
 - 
 - 
 - def construct_response(code=settings.RetCode.SUCCESS, message="success", data=None, auth=None):
 -     result_dict = {"code": code, "message": message, "data": data}
 -     response_dict = {}
 -     for key, value in result_dict.items():
 -         if value is None and key != "code":
 -             continue
 -         else:
 -             response_dict[key] = value
 -     response = make_response(jsonify(response_dict))
 -     if auth:
 -         response.headers["Authorization"] = auth
 -     response.headers["Access-Control-Allow-Origin"] = "*"
 -     response.headers["Access-Control-Allow-Method"] = "*"
 -     response.headers["Access-Control-Allow-Headers"] = "*"
 -     response.headers["Access-Control-Allow-Headers"] = "*"
 -     response.headers["Access-Control-Expose-Headers"] = "Authorization"
 -     return response
 - 
 - 
 - def construct_result(code=settings.RetCode.DATA_ERROR, message="data is missing"):
 -     result_dict = {"code": code, "message": message}
 -     response = {}
 -     for key, value in result_dict.items():
 -         if value is None and key != "code":
 -             continue
 -         else:
 -             response[key] = value
 -     return jsonify(response)
 - 
 - 
 - def construct_json_result(code=settings.RetCode.SUCCESS, message="success", data=None):
 -     if data is None:
 -         return jsonify({"code": code, "message": message})
 -     else:
 -         return jsonify({"code": code, "message": message, "data": data})
 - 
 - 
 - def construct_error_response(e):
 -     logging.exception(e)
 -     try:
 -         if e.code == 401:
 -             return construct_json_result(code=settings.RetCode.UNAUTHORIZED, message=repr(e))
 -     except BaseException:
 -         pass
 -     if len(e.args) > 1:
 -         return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
 -     return construct_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
 - 
 - 
 - def token_required(func):
 -     @wraps(func)
 -     def decorated_function(*args, **kwargs):
 -         authorization_str = flask_request.headers.get("Authorization")
 -         if not authorization_str:
 -             return get_json_result(data=False, message="`Authorization` can't be empty")
 -         authorization_list = authorization_str.split()
 -         if len(authorization_list) < 2:
 -             return get_json_result(data=False, message="Please check your authorization format.")
 -         token = authorization_list[1]
 -         objs = APIToken.query(token=token)
 -         if not objs:
 -             return get_json_result(data=False, message="Authentication error: API key is invalid!", code=settings.RetCode.AUTHENTICATION_ERROR)
 -         kwargs["tenant_id"] = objs[0].tenant_id
 -         return func(*args, **kwargs)
 - 
 -     return decorated_function
 - 
 - 
 - def get_result(code=settings.RetCode.SUCCESS, message="", data=None):
 -     if code == 0:
 -         if data is not None:
 -             response = {"code": code, "data": data}
 -         else:
 -             response = {"code": code}
 -     else:
 -         response = {"code": code, "message": message}
 -     return jsonify(response)
 - 
 - 
 - def get_error_data_result(
 -     message="Sorry! Data missing!",
 -     code=settings.RetCode.DATA_ERROR,
 - ):
 -     result_dict = {"code": code, "message": message}
 -     response = {}
 -     for key, value in result_dict.items():
 -         if value is None and key != "code":
 -             continue
 -         else:
 -             response[key] = value
 -     return jsonify(response)
 - 
 - 
 - def generate_confirmation_token(tenant_id):
 -     serializer = URLSafeTimedSerializer(tenant_id)
 -     return "ragflow-" + serializer.dumps(get_uuid(), salt=tenant_id)[2:34]
 - 
 - 
 - def valid(permission, valid_permission, chunk_method, valid_chunk_method):
 -     if valid_parameter(permission, valid_permission):
 -         return valid_parameter(permission, valid_permission)
 -     if valid_parameter(chunk_method, valid_chunk_method):
 -         return valid_parameter(chunk_method, valid_chunk_method)
 - 
 - 
 - def valid_parameter(parameter, valid_values):
 -     if parameter and parameter not in valid_values:
 -         return get_error_data_result(f"'{parameter}' is not in {valid_values}")
 - 
 - 
 - def dataset_readonly_fields(field_name):
 -     return field_name in ["chunk_count", "create_date", "create_time", "update_date", "update_time", "created_by", "document_count", "token_num", "status", "tenant_id", "id"]
 - 
 - 
 - def get_parser_config(chunk_method, parser_config):
 -     if parser_config:
 -         return parser_config
 -     if not chunk_method:
 -         chunk_method = "naive"
 -     key_mapping = {
 -         "naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
 -         "qa": {"raptor": {"use_raptor": False}},
 -         "tag": None,
 -         "resume": None,
 -         "manual": {"raptor": {"use_raptor": False}},
 -         "table": None,
 -         "paper": {"raptor": {"use_raptor": False}},
 -         "book": {"raptor": {"use_raptor": False}},
 -         "laws": {"raptor": {"use_raptor": False}},
 -         "presentation": {"raptor": {"use_raptor": False}},
 -         "one": None,
 -         "knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]},
 -         "email": None,
 -         "picture": None,
 -     }
 -     parser_config = key_mapping[chunk_method]
 -     return parser_config
 - 
 - 
 - def get_data_openai(id=None, 
 -                     created=None, 
 -                     model=None, 
 -                     prompt_tokens= 0, 
 -                     completion_tokens=0, 
 -                     content = None, 
 -                     finish_reason= None,
 -                     object="chat.completion",
 -                     param=None,
 - ):
 -    
 -     total_tokens= prompt_tokens + completion_tokens
 -     return {
 -         "id":f"{id}",
 -         "object": object,
 -         "created": int(time.time()) if created else None,
 -         "model": model,
 -         "param":param,
 -         "usage": {
 -             "prompt_tokens": prompt_tokens,
 -             "completion_tokens": completion_tokens,
 -             "total_tokens": total_tokens,
 -             "completion_tokens_details": {
 -                 "reasoning_tokens": 0,
 -                 "accepted_prediction_tokens": 0,
 -                 "rejected_prediction_tokens": 0
 -             }
 -         },
 -         "choices": [
 -             {
 -                 "message": {
 -                     "role": "assistant",
 -                     "content": content
 -                 },
 -                 "logprobs": None,
 -                 "finish_reason": finish_reason,
 -                 "index": 0
 -             }
 -         ]
 -     } 
 - def valid_parser_config(parser_config):
 -     if not parser_config:
 -         return
 -     scopes = set(
 -         [
 -             "chunk_token_num",
 -             "delimiter",
 -             "raptor",
 -             "graphrag",
 -             "layout_recognize",
 -             "task_page_size",
 -             "pages",
 -             "html4excel",
 -             "auto_keywords",
 -             "auto_questions",
 -             "tag_kb_ids",
 -             "topn_tags",
 -             "filename_embd_weight",
 -         ]
 -     )
 -     for k in parser_config.keys():
 -         assert k in scopes, f"Abnormal 'parser_config'. Invalid key: {k}"
 - 
 -     assert isinstance(parser_config.get("chunk_token_num", 1), int), "chunk_token_num should be int"
 -     assert 1 <= parser_config.get("chunk_token_num", 1) < 100000000, "chunk_token_num should be in range from 1 to 100000000"
 -     assert isinstance(parser_config.get("task_page_size", 1), int), "task_page_size should be int"
 -     assert 1 <= parser_config.get("task_page_size", 1) < 100000000, "task_page_size should be in range from 1 to 100000000"
 -     assert isinstance(parser_config.get("auto_keywords", 1), int), "auto_keywords should be int"
 -     assert 0 <= parser_config.get("auto_keywords", 0) < 32, "auto_keywords should be in range from 0 to 32"
 -     assert isinstance(parser_config.get("auto_questions", 1), int), "auto_questions should be int"
 -     assert 0 <= parser_config.get("auto_questions", 0) < 10, "auto_questions should be in range from 0 to 10"
 -     assert isinstance(parser_config.get("topn_tags", 1), int), "topn_tags should be int"
 -     assert 0 <= parser_config.get("topn_tags", 0) < 10, "topn_tags should be in range from 0 to 10"
 -     assert isinstance(parser_config.get("html4excel", False), bool), "html4excel should be True or False"
 -     assert isinstance(parser_config.get("delimiter", ""), str), "delimiter should be str"
 - 
 - 
 - def check_duplicate_ids(ids, id_type="item"):
 -     """
 -     Check for duplicate IDs in a list and return unique IDs and error messages.
 - 
 -     Args:
 -         ids (list): List of IDs to check for duplicates
 -         id_type (str): Type of ID for error messages (e.g., 'document', 'dataset', 'chunk')
 - 
 -     Returns:
 -         tuple: (unique_ids, error_messages)
 -             - unique_ids (list): List of unique IDs
 -             - error_messages (list): List of error messages for duplicate IDs
 -     """
 -     id_count = {}
 -     duplicate_messages = []
 - 
 -     # Count occurrences of each ID
 -     for id_value in ids:
 -         id_count[id_value] = id_count.get(id_value, 0) + 1
 - 
 -     # Check for duplicates
 -     for id_value, count in id_count.items():
 -         if count > 1:
 -             duplicate_messages.append(f"Duplicate {id_type} ids: {id_value}")
 - 
 -     # Return unique IDs and error messages
 -     return list(set(ids)), duplicate_messages
 
 
  |