#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import uuid
from enum import auto
from typing import Annotated, Any

from flask import Request
from pydantic import UUID1, BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_serializer, field_validator
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType

from api.constants import DATASET_NAME_LIMIT


def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
    """
    Validates and parses JSON requests through a multi-stage validation pipeline.

    Implements a four-stage validation process:
    1. Content-Type verification (must be application/json)
    2. JSON syntax validation
    3. Payload structure type checking
    4. Pydantic model validation with error formatting

    Args:
        request (Request): Flask request object containing HTTP payload
        validator (type[BaseModel]): Pydantic model class for data validation
        extras (dict[str, Any] | None): Additional fields to merge into the payload
            before validation. These fields are removed from the final output
        exclude_unset (bool): Whether to exclude fields that have not been explicitly set

    Returns:
        tuple[dict[str, Any] | None, str | None]:
        - First element:
            - Validated dictionary on success
            - None on validation failure
        - Second element:
            - None on success
            - Diagnostic error message on failure

    Examples:
        >>> validate_and_parse_json_request(valid_request, DatasetSchema)
        ({"name": "Dataset1", "format": "csv"}, None)

        >>> validate_and_parse_json_request(xml_request, DatasetSchema)
        (None, "Unsupported content type: Expected application/json, got text/xml")

        >>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
        (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")

    Notes:
        1. Validation priority:
            - Content-Type verification precedes JSON parsing
            - Structural validation occurs before schema validation
        2. Extra fields added via the `extras` parameter are automatically removed
           from the final output after validation
        3. UnsupportedMediaType and BadRequest (raised by Flask while parsing the
           body) and ValidationError (raised by Pydantic) are caught internally and
           converted into diagnostic error messages; they do not propagate to the caller
    """
    try:
        payload = request.get_json() or {}
    except UnsupportedMediaType:
        return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
    except BadRequest:
        return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"

    if not isinstance(payload, dict):
        return None, f"Invalid request payload: expected object, got {type(payload).__name__}"

    try:
        if extras is not None:
            payload.update(extras)
        validated_request = validator(**payload)
    except ValidationError as e:
        return None, format_validation_error_message(e)

    parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)

    if extras is not None:
        for key in list(parsed_payload.keys()):
            if key in extras:
                del parsed_payload[key]

    return parsed_payload, None


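# A minimal usage sketch, not part of this module: wiring the helper into a
# Flask view. The blueprint name and route are assumptions for illustration;
# the pattern of injecting the URL's dataset_id through ``extras`` matches how
# UpdateDatasetReq (defined later in this module) validates and then strips it.
def _demo_flask_usage():
    from flask import Blueprint, jsonify, request

    bp = Blueprint("datasets_demo", __name__)

    @bp.put("/datasets/<dataset_id>")
    def update_dataset(dataset_id: str):
        # dataset_id is validated as a UUID1 by UpdateDatasetReq, then removed
        # from the parsed payload because it was supplied via ``extras``.
        payload, err = validate_and_parse_json_request(request, UpdateDatasetReq, extras={"dataset_id": dataset_id})
        if err is not None:
            return jsonify({"message": err}), 400
        return jsonify({"data": payload}), 200

    return bp

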
def format_validation_error_message(e: ValidationError) -> str:
    """
    Formats validation errors into a standardized string format.

    Processes pydantic ValidationError objects to create human-readable error messages
    containing field locations, error descriptions, and input values.

    Args:
        e (ValidationError): The validation error instance containing error details

    Returns:
        str: Formatted error messages joined by newlines. Each line contains:
            - Field path (dot-separated)
            - Error message
            - Truncated input value (max 128 chars)

    Example:
        >>> try:
        ...     UserModel(name=123, email="invalid")
        ... except ValidationError as e:
        ...     print(format_validation_error_message(e))
        Field: <name> - Message: <Input should be a valid string> - Value: <123>
        Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
    """
    error_messages = []

    for error in e.errors():
        field = ".".join(map(str, error["loc"]))
        msg = error["msg"]
        input_val = error["input"]
        input_str = str(input_val)

        if len(input_str) > 128:
            input_str = input_str[:125] + "..."

        error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
        error_messages.append(error_msg)

    return "\n".join(error_messages)


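# A minimal sketch, assuming direct use of this module's models: trigger a
# ValidationError with out-of-range inputs and render it. The exact message
# text comes from Pydantic and may vary between versions.
def _demo_format_validation_error():
    try:
        CreateDatasetReq(name="", pagerank=999)  # violates min_length=1 and le=100
    except ValidationError as exc:
        print(format_validation_error_message(exc))
        # e.g.:
        # Field: <name> - Message: <String should have at least 1 character> - Value: <>
        # Field: <pagerank> - Message: <Input should be less than or equal to 100> - Value: <999>

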
class PermissionEnum(StrEnum):
    me = auto()
    team = auto()


class ChunkMethodEnum(StrEnum):
    naive = auto()
    book = auto()
    email = auto()
    laws = auto()
    manual = auto()
    one = auto()
    paper = auto()
    picture = auto()
    presentation = auto()
    qa = auto()
    table = auto()
    tag = auto()


class GraphragMethodEnum(StrEnum):
    light = auto()
    general = auto()


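# Note: with strenum.StrEnum, auto() assigns each member's name as its string
# value, so ChunkMethodEnum.naive == "naive" and PermissionEnum.team == "team".

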
class Base(BaseModel):
    model_config = ConfigDict(extra="forbid")


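# A small sketch of what ``extra="forbid"`` buys every schema below: unknown
# keys are rejected at validation time rather than silently dropped.
# (RaptorConfig is defined just below; the key name here is deliberately bogus.)
def _demo_extra_forbid():
    try:
        RaptorConfig(use_raptor=True, unknown_key=1)
    except ValidationError as exc:
        print(format_validation_error_message(exc))
        # e.g. Field: <unknown_key> - Message: <Extra inputs are not permitted> - Value: <1>

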
class RaptorConfig(Base):
    use_raptor: bool = Field(default=False)
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    max_token: int = Field(default=256, ge=1, le=2048)
    threshold: float = Field(default=0.1, ge=0.0, le=1.0)
    max_cluster: int = Field(default=64, ge=1, le=1024)
    random_seed: int = Field(default=0, ge=0)


class GraphragConfig(Base):
    use_graphrag: bool = Field(default=False)
    entity_types: list[str] = Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])
    method: GraphragMethodEnum = Field(default=GraphragMethodEnum.light)
    community: bool = Field(default=False)
    resolution: bool = Field(default=False)


class ParserConfig(Base):
    auto_keywords: int = Field(default=0, ge=0, le=32)
    auto_questions: int = Field(default=0, ge=0, le=10)
    chunk_token_num: int = Field(default=128, ge=1, le=2048)
    delimiter: str = Field(default=r"\n", min_length=1)
    graphrag: GraphragConfig | None = None
    html4excel: bool = False
    layout_recognize: str = "DeepDOC"
    raptor: RaptorConfig | None = None
    tag_kb_ids: list[str] = Field(default_factory=list)
    topn_tags: int = Field(default=1, ge=1, le=10)
    filename_embd_weight: float | None = Field(default=None, ge=0.0, le=1.0)
    task_page_size: int | None = Field(default=None, ge=1)
    pages: list[list[int]] | None = None


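# A construction sketch for the nested configuration; the values below are
# illustrative, not recommended settings.
def _demo_parser_config():
    cfg = ParserConfig(
        chunk_token_num=256,
        graphrag=GraphragConfig(use_graphrag=True, method=GraphragMethodEnum.general),
        raptor=RaptorConfig(use_raptor=True, max_token=512),
    )
    # model_dump_json() is the same serialization that
    # validate_parser_config_json_length uses to enforce the 65,535-character cap.
    print(cfg.model_dump_json())

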
class CreateDatasetReq(Base):
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
    avatar: str | None = Field(default=None, max_length=65535)
    description: str | None = Field(default=None, max_length=65535)
    embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
    permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
    chunk_method: Annotated[ChunkMethodEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodEnum.naive, serialization_alias="parser_id")]
    pagerank: int = Field(default=0, ge=0, le=100)
    parser_config: ParserConfig | None = Field(default=None)

    @field_validator("avatar")
    @classmethod
    def validate_avatar_base64(cls, v: str | None) -> str | None:
        """
        Validates Base64-encoded avatar string format and MIME type compliance.

        Implements a three-stage validation workflow:
        1. MIME prefix existence check
        2. MIME type format validation
        3. Supported type verification

        Args:
            v (str | None): Raw avatar field value

        Returns:
            str | None: Validated Base64 string, or None if no avatar was provided

        Raises:
            ValueError: For structural errors in these cases:
                - Missing MIME prefix header
                - Invalid MIME prefix format
                - Unsupported image MIME type

        Example:
            ```python
            # Valid case
            CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")

            # Invalid cases
            CreateDatasetReq(avatar="image/jpeg;base64,...")  # Missing 'data:' prefix
            CreateDatasetReq(avatar="data:video/mp4;base64,...")  # Unsupported MIME type
            ```
        """
        if v is None:
            return v

        if "," in v:
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
                raise ValueError("Invalid MIME prefix format. Must start with 'data:'")

            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
                raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")

            return v
        else:
            raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="after")
    @classmethod
    def validate_embedding_model(cls, v: str) -> str:
        """
        Validates embedding model identifier format compliance.

        Validation pipeline:
        1. Structural format verification
        2. Component non-empty check
        3. Whitespace-only component rejection

        Args:
            v (str): Raw model identifier

        Returns:
            str: Validated <model_name>@<provider> format

        Raises:
            ValueError: For these violations:
                - Missing @ separator
                - Empty model_name/provider
                - Invalid component structure

        Examples:
            Valid: "text-embedding-3-large@openai"
            Invalid: "invalid_model" (no @)
            Invalid: "@openai" (empty model_name)
            Invalid: "text-embedding-3-large@" (empty provider)
        """
        if "@" not in v:
            raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")

        components = v.split("@", 1)
        if len(components) != 2 or not all(components):
            raise ValueError("Both model_name and provider must be non-empty strings")

        model_name, provider = components
        if not model_name.strip() or not provider.strip():
            raise ValueError("Model name and provider cannot be whitespace-only strings")
        return v

    @field_validator("permission", mode="before")
    @classmethod
    def permission_auto_lowercase(cls, v: Any) -> Any:
        """
        Normalize permission input to lowercase for consistent PermissionEnum matching.

        Args:
            v (Any): Raw input value for the permission field

        Returns:
            Lowercase string if input is string type, otherwise returns original value

        Behavior:
            - Converts string inputs to lowercase (e.g., "ME" → "me")
            - Non-string values pass through unchanged
            - Works in validation pre-processing stage (before enum conversion)
        """
        return v.lower() if isinstance(v, str) else v

    @field_validator("parser_config", mode="before")
    @classmethod
    def normalize_empty_parser_config(cls, v: Any) -> Any:
        """
        Normalizes empty parser configuration by converting empty dictionaries to None.

        This validator ensures consistent handling of empty parser configurations
        across the application by converting empty dicts to None values.

        Args:
            v (Any): Raw input value for the parser config field

        Returns:
            Any: None if input is an empty dict, otherwise the original value

        Example:
            >>> normalize_empty_parser_config({})
            None

            >>> normalize_empty_parser_config({"key": "value"})
            {"key": "value"}
        """
        if v == {}:
            return None
        return v

    @field_validator("parser_config", mode="after")
    @classmethod
    def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
        """
        Validates serialized JSON length constraints for parser configuration.

        Implements a three-stage validation workflow:
        1. Null check - bypass validation for empty configurations
        2. Model serialization - convert Pydantic model to JSON string
        3. Size verification - enforce maximum allowed payload size

        Args:
            v (ParserConfig | None): Raw parser configuration object

        Returns:
            ParserConfig | None: Validated configuration object

        Raises:
            ValueError: When serialized JSON exceeds 65,535 characters
        """
        if v is None:
            return None

        if len(json_str := v.model_dump_json()) > 65535:
            raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
        return v


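# A combined validation sketch for CreateDatasetReq; the data URI and model
# identifier are illustrative placeholders, not real assets or providers.
def _demo_create_dataset_validation():
    req = CreateDatasetReq(
        name="  demo  ",  # strip_whitespace normalizes this to "demo"
        avatar="data:image/png;base64,iVBORw0KGgo=",
        embedding_model="text-embedding-3-large@openai",
        permission="TEAM",  # lowercased to "team" by permission_auto_lowercase
    )
    print(req.name, req.permission)
    # by_alias=True applies the serialization aliases declared above:
    # embedding_model -> "embd_id" and chunk_method -> "parser_id".
    print(req.model_dump(by_alias=True))

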
class UpdateDatasetReq(CreateDatasetReq):
    dataset_id: UUID1 = Field(...)
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
        return v.hex
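

# A serialization sketch: dumping with by_alias=True plus the field_serializer
# renders dataset_id as a 32-character hex string (uuid.UUID.hex drops the dashes).
def _demo_update_dataset_dump():
    req = UpdateDatasetReq(
        dataset_id=uuid.uuid1(),
        name="demo-dataset",
        embedding_model="text-embedding-3-large@openai",
    )
    # exclude_unset=True keeps only the explicitly provided fields, the same
    # option validate_and_parse_json_request exposes for partial updates.
    print(req.model_dump(by_alias=True, exclude_unset=True))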
 
 