You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
import uuid
from collections import Counter
from enum import auto
from typing import Annotated, Any

from flask import Request
from pydantic import UUID1, BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_serializer, field_validator
from pydantic_core import PydanticCustomError
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType

from api.constants import DATASET_NAME_LIMIT
  26. def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
  27. """
  28. Validates and parses JSON requests through a multi-stage validation pipeline.
  29. Implements a four-stage validation process:
  30. 1. Content-Type verification (must be application/json)
  31. 2. JSON syntax validation
  32. 3. Payload structure type checking
  33. 4. Pydantic model validation with error formatting
  34. Args:
  35. request (Request): Flask request object containing HTTP payload
  36. validator (type[BaseModel]): Pydantic model class for data validation
  37. extras (dict[str, Any] | None): Additional fields to merge into payload
  38. before validation. These fields will be removed from the final output
  39. exclude_unset (bool): Whether to exclude fields that have not been explicitly set
  40. Returns:
  41. tuple[Dict[str, Any] | None, str | None]:
  42. - First element:
  43. - Validated dictionary on success
  44. - None on validation failure
  45. - Second element:
  46. - None on success
  47. - Diagnostic error message on failure
  48. Raises:
  49. UnsupportedMediaType: When Content-Type header is not application/json
  50. BadRequest: For structural JSON syntax errors
  51. ValidationError: When payload violates Pydantic schema rules
  52. Examples:
  53. >>> validate_and_parse_json_request(valid_request, DatasetSchema)
  54. ({"name": "Dataset1", "format": "csv"}, None)
  55. >>> validate_and_parse_json_request(xml_request, DatasetSchema)
  56. (None, "Unsupported content type: Expected application/json, got text/xml")
  57. >>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
  58. (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")
  59. Notes:
  60. 1. Validation Priority:
  61. - Content-Type verification precedes JSON parsing
  62. - Structural validation occurs before schema validation
  63. 2. Extra fields added via `extras` parameter are automatically removed
  64. from the final output after validation
  65. """
  66. try:
  67. payload = request.get_json() or {}
  68. except UnsupportedMediaType:
  69. return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
  70. except BadRequest:
  71. return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"
  72. if not isinstance(payload, dict):
  73. return None, f"Invalid request payload: expected object, got {type(payload).__name__}"
  74. try:
  75. if extras is not None:
  76. payload.update(extras)
  77. validated_request = validator(**payload)
  78. except ValidationError as e:
  79. return None, format_validation_error_message(e)
  80. parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)
  81. if extras is not None:
  82. for key in list(parsed_payload.keys()):
  83. if key in extras:
  84. del parsed_payload[key]
  85. return parsed_payload, None
  86. def format_validation_error_message(e: ValidationError) -> str:
  87. """
  88. Formats validation errors into a standardized string format.
  89. Processes pydantic ValidationError objects to create human-readable error messages
  90. containing field locations, error descriptions, and input values.
  91. Args:
  92. e (ValidationError): The validation error instance containing error details
  93. Returns:
  94. str: Formatted error messages joined by newlines. Each line contains:
  95. - Field path (dot-separated)
  96. - Error message
  97. - Truncated input value (max 128 chars)
  98. Example:
  99. >>> try:
  100. ... UserModel(name=123, email="invalid")
  101. ... except ValidationError as e:
  102. ... print(format_validation_error_message(e))
  103. Field: <name> - Message: <Input should be a valid string> - Value: <123>
  104. Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
  105. """
  106. error_messages = []
  107. for error in e.errors():
  108. field = ".".join(map(str, error["loc"]))
  109. msg = error["msg"]
  110. input_val = error["input"]
  111. input_str = str(input_val)
  112. if len(input_str) > 128:
  113. input_str = input_str[:125] + "..."
  114. error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
  115. error_messages.append(error_msg)
  116. return "\n".join(error_messages)
  117. class PermissionEnum(StrEnum):
  118. me = auto()
  119. team = auto()
  120. class ChunkMethodnEnum(StrEnum):
  121. naive = auto()
  122. book = auto()
  123. email = auto()
  124. laws = auto()
  125. manual = auto()
  126. one = auto()
  127. paper = auto()
  128. picture = auto()
  129. presentation = auto()
  130. qa = auto()
  131. table = auto()
  132. tag = auto()
  133. class GraphragMethodEnum(StrEnum):
  134. light = auto()
  135. general = auto()
  136. class Base(BaseModel):
  137. class Config:
  138. extra = "forbid"
class RaptorConfig(Base):
    """RAPTOR hierarchical-summarization settings for a dataset's parser."""

    # Master switch; all other fields are ignored by consumers when False —
    # TODO confirm against the parser implementation.
    use_raptor: bool = Field(default=False)
    # Summarization prompt; must contain the `{cluster_content}` placeholder
    # that the summarizer substitutes. Whitespace is stripped, empty rejected.
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    # Maximum tokens per generated summary (1..2048).
    max_token: int = Field(default=256, ge=1, le=2048)
    # Clustering threshold in [0.0, 1.0].
    threshold: float = Field(default=0.1, ge=0.0, le=1.0)
    # Upper bound on cluster count (1..1024).
    max_cluster: int = Field(default=64, ge=1, le=1024)
    # Seed for deterministic clustering; must be non-negative.
    random_seed: int = Field(default=0, ge=0)
class GraphragConfig(Base):
    """GraphRAG knowledge-graph extraction settings for a dataset's parser."""

    # Master switch for graph-based extraction.
    use_graphrag: bool = Field(default=False)
    # Entity categories to extract; default_factory avoids a shared mutable default.
    entity_types: list[str] = Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])
    # Extraction mode; see GraphragMethodEnum.
    method: GraphragMethodEnum = Field(default=GraphragMethodEnum.light)
    # Enable community detection — TODO confirm exact downstream semantics.
    community: bool = Field(default=False)
    # Enable entity resolution — TODO confirm exact downstream semantics.
    resolution: bool = Field(default=False)
class ParserConfig(Base):
    """Document parsing/chunking configuration attached to a dataset."""

    # Number of keywords to auto-extract per chunk (0 disables).
    auto_keywords: int = Field(default=0, ge=0, le=32)
    # Number of questions to auto-generate per chunk (0 disables).
    auto_questions: int = Field(default=0, ge=0, le=10)
    # Target chunk size in tokens (1..2048).
    chunk_token_num: int = Field(default=128, ge=1, le=2048)
    # Chunk delimiter; raw string, so the default is the two characters `\` `n`,
    # not a newline — presumably unescaped downstream; verify against the parser.
    delimiter: str = Field(default=r"\n", min_length=1)
    graphrag: GraphragConfig | None = None
    # Render Excel sheets as HTML tables instead of row text.
    html4excel: bool = False
    # Layout-recognition backend name.
    layout_recognize: str = "DeepDOC"
    raptor: RaptorConfig | None = None
    # Knowledge-base ids used as tag sources.
    tag_kb_ids: list[str] = Field(default_factory=list)
    # Number of top tags to keep (1..10).
    topn_tags: int = Field(default=1, ge=1, le=10)
    # Weight of the filename in embedding, in [0.0, 1.0]; None = unset.
    filename_embd_weight: float | None = Field(default=None, ge=0.0, le=1.0)
    # Pages per parsing task; None = unset.
    task_page_size: int | None = Field(default=None, ge=1)
    # Page ranges as [start, end] pairs — TODO confirm inclusivity with callers.
    pages: list[list[int]] | None = None
class CreateDatasetReq(Base):
    """Request schema for creating a dataset, with normalization validators."""

    # Required display name; stripped, non-empty, capped by DATASET_NAME_LIMIT.
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
    # Optional base64 data-URI avatar; see validate_avatar_base64.
    avatar: str | None = Field(default=None, max_length=65535)
    description: str | None = Field(default=None, max_length=65535)
    # Serialized as "embd_id". NOTE(review): the default "" bypasses
    # validate_embedding_model because pydantic does not validate defaults.
    embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
    permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
    # Serialized as "parser_id".
    chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
    pagerank: int = Field(default=0, ge=0, le=100)
    parser_config: ParserConfig | None = Field(default=None)

    @field_validator("avatar")
    @classmethod
    def validate_avatar_base64(cls, v: str | None) -> str | None:
        """
        Validate the avatar as a base64 data URI with a supported image MIME type.

        Three-stage check on the part before the first comma:
        1. MIME prefix existence ("," must be present)
        2. "data:" prefix format
        3. MIME type membership in the supported set

        Args:
            v: Raw avatar field value, or None when omitted.

        Returns:
            The unchanged input when valid (or None).

        Raises:
            PydanticCustomError: "format_invalid" for a missing MIME prefix,
                a prefix not starting with "data:", or an unsupported MIME type
                (only image/jpeg and image/png are accepted).

        Example:
            Valid:   "data:image/png;base64,iVBORw0KGg..."
            Invalid: "image/jpeg;base64,..."  (missing 'data:' prefix)
            Invalid: "data:video/mp4;base64,..."  (unsupported MIME type)
        """
        if v is None:
            return v
        if "," in v:
            # Everything before the first comma is the data-URI header.
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
                raise PydanticCustomError("format_invalid", "Invalid MIME prefix format. Must start with 'data:'")
            # Strip "data:" (5 chars) and any ";base64" suffix to isolate the MIME type.
            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
                raise PydanticCustomError("format_invalid", "Unsupported MIME type. Allowed: {supported_mime_types}", {"supported_mime_types": supported_mime_types})
            return v
        else:
            raise PydanticCustomError("format_invalid", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="after")
    @classmethod
    def validate_embedding_model(cls, v: str) -> str:
        """
        Validate the embedding model identifier's <model_name>@<provider> format.

        Args:
            v: Raw model identifier (already whitespace-stripped by the
                StringConstraints annotation).

        Returns:
            The unchanged input when valid.

        Raises:
            PydanticCustomError: "format_invalid" when the "@" separator is
                missing, either component is empty, or either component is
                whitespace-only.

        Examples:
            Valid:   "text-embedding-3-large@openai"
            Invalid: "invalid_model"            (no @)
            Invalid: "@openai"                  (empty model_name)
            Invalid: "text-embedding-3-large@"  (empty provider)
        """
        if "@" not in v:
            raise PydanticCustomError("format_invalid", "Embedding model identifier must follow <model_name>@<provider> format")
        components = v.split("@", 1)
        if len(components) != 2 or not all(components):
            raise PydanticCustomError("format_invalid", "Both model_name and provider must be non-empty strings")
        model_name, provider = components
        if not model_name.strip() or not provider.strip():
            raise PydanticCustomError("format_invalid", "Model name and provider cannot be whitespace-only strings")
        return v

    @field_validator("permission", mode="before")
    @classmethod
    def permission_auto_lowercase(cls, v: Any) -> Any:
        """
        Lowercase string permission input before enum conversion.

        Runs in the pre-validation stage so values like "ME" match
        PermissionEnum.me; non-string values pass through unchanged.
        """
        return v.lower() if isinstance(v, str) else v

    @field_validator("parser_config", mode="before")
    @classmethod
    def normalize_empty_parser_config(cls, v: Any) -> Any:
        """
        Convert an empty-dict parser config to None for consistent handling.

        Args:
            v: Raw input value for the parser_config field.

        Returns:
            None if the input equals {}, otherwise the original value.

        Example:
            >>> normalize_empty_parser_config({})
            None
            >>> normalize_empty_parser_config({"key": "value"})
            {"key": "value"}
        """
        if v == {}:
            return None
        return v

    @field_validator("parser_config", mode="after")
    @classmethod
    def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
        """
        Enforce a size limit on the serialized parser configuration.

        Serializes the validated model to JSON and rejects payloads whose
        JSON form exceeds 65,535 characters.

        Args:
            v: Validated parser configuration, or None when omitted.

        Returns:
            The unchanged configuration (or None).

        Raises:
            PydanticCustomError: "string_too_long" when the serialized JSON
                exceeds 65,535 characters.
        """
        if v is None:
            return None
        if (json_str := v.model_dump_json()) and len(json_str) > 65535:
            raise PydanticCustomError("string_too_long", "Parser config exceeds size limit (max 65,535 characters). Current size: {actual}", {"actual": len(json_str)})
        return v
  310. class UpdateDatasetReq(CreateDatasetReq):
  311. dataset_id: UUID1 = Field(...)
  312. name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]
  313. @field_serializer("dataset_id")
  314. def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
  315. """
  316. Serializes a UUID version 1 object to its hexadecimal string representation.
  317. This field serializer specifically handles UUID version 1 objects, converting them
  318. to their canonical 32-character hexadecimal format without hyphens. The conversion
  319. is designed for consistent serialization in API responses and database storage.
  320. Args:
  321. v (uuid.UUID1): The UUID version 1 object to serialize. Must be a valid
  322. UUID1 instance generated by Python's uuid module.
  323. Returns:
  324. str: 32-character lowercase hexadecimal string representation
  325. Example: "550e8400e29b41d4a716446655440000"
  326. Raises:
  327. AttributeError: If input is not a proper UUID object (missing hex attribute)
  328. TypeError: If input is not a UUID1 instance (when type checking is enabled)
  329. Notes:
  330. - Version 1 UUIDs contain timestamp and MAC address information
  331. - The .hex property automatically converts to lowercase hexadecimal
  332. - For cross-version compatibility, consider typing as uuid.UUID instead
  333. """
  334. return v.hex
  335. class DeleteReq(Base):
  336. ids: list[UUID1] | None = Field(...)
  337. @field_validator("ids", mode="after")
  338. def check_duplicate_ids(cls, v: list[UUID1] | None) -> list[str] | None:
  339. """
  340. Validates and converts a list of UUID1 objects to hexadecimal strings while checking for duplicates.
  341. This validator implements a three-stage processing pipeline:
  342. 1. Null Handling - returns None for empty/null input
  343. 2. UUID Conversion - transforms UUID objects to hex strings
  344. 3. Duplicate Validation - ensures all IDs are unique
  345. Behavior Specifications:
  346. - Input: None → Returns None (indicates no operation)
  347. - Input: [] → Returns [] (empty list for explicit no-op)
  348. - Input: [UUID1,...] → Returns validated hex strings
  349. - Duplicates: Raises formatted PydanticCustomError
  350. Args:
  351. v (list[UUID1] | None):
  352. - None: Indicates no datasets should be processed
  353. - Empty list: Explicit empty operation
  354. - Populated list: Dataset UUIDs to validate/convert
  355. Returns:
  356. list[str] | None:
  357. - None when input is None
  358. - List of 32-character hex strings (lowercase, no hyphens)
  359. Example: ["550e8400e29b41d4a716446655440000"]
  360. Raises:
  361. PydanticCustomError: When duplicates detected, containing:
  362. - Error type: "duplicate_uuids"
  363. - Template message: "Duplicate ids: '{duplicate_ids}'"
  364. - Context: {"duplicate_ids": "id1, id2, ..."}
  365. Example:
  366. >>> validate([UUID("..."), UUID("...")])
  367. ["2cdf0456e9a711ee8000000000000000", ...]
  368. >>> validate([UUID("..."), UUID("...")]) # Duplicates
  369. PydanticCustomError: Duplicate ids: '2cdf0456e9a711ee8000000000000000'
  370. """
  371. if not v:
  372. return v
  373. uuid_hex_list = [ids.hex for ids in v]
  374. duplicates = [item for item, count in Counter(uuid_hex_list).items() if count > 1]
  375. if duplicates:
  376. duplicates_str = ", ".join(duplicates)
  377. raise PydanticCustomError("duplicate_uuids", "Duplicate ids: '{duplicate_ids}'", {"duplicate_ids": duplicates_str})
  378. return uuid_hex_list
  379. class DeleteDatasetReq(DeleteReq): ...