You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

validation_utils.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
import uuid
from enum import auto
from typing import Annotated, Any

from flask import Request
from pydantic import UUID1, BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_serializer, field_validator
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType

from api.constants import DATASET_NAME_LIMIT
  24. def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
  25. """
  26. Validates and parses JSON requests through a multi-stage validation pipeline.
  27. Implements a four-stage validation process:
  28. 1. Content-Type verification (must be application/json)
  29. 2. JSON syntax validation
  30. 3. Payload structure type checking
  31. 4. Pydantic model validation with error formatting
  32. Args:
  33. request (Request): Flask request object containing HTTP payload
  34. validator (type[BaseModel]): Pydantic model class for data validation
  35. extras (dict[str, Any] | None): Additional fields to merge into payload
  36. before validation. These fields will be removed from the final output
  37. exclude_unset (bool): Whether to exclude fields that have not been explicitly set
  38. Returns:
  39. tuple[Dict[str, Any] | None, str | None]:
  40. - First element:
  41. - Validated dictionary on success
  42. - None on validation failure
  43. - Second element:
  44. - None on success
  45. - Diagnostic error message on failure
  46. Raises:
  47. UnsupportedMediaType: When Content-Type header is not application/json
  48. BadRequest: For structural JSON syntax errors
  49. ValidationError: When payload violates Pydantic schema rules
  50. Examples:
  51. >>> validate_and_parse_json_request(valid_request, DatasetSchema)
  52. ({"name": "Dataset1", "format": "csv"}, None)
  53. >>> validate_and_parse_json_request(xml_request, DatasetSchema)
  54. (None, "Unsupported content type: Expected application/json, got text/xml")
  55. >>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
  56. (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")
  57. Notes:
  58. 1. Validation Priority:
  59. - Content-Type verification precedes JSON parsing
  60. - Structural validation occurs before schema validation
  61. 2. Extra fields added via `extras` parameter are automatically removed
  62. from the final output after validation
  63. """
  64. try:
  65. payload = request.get_json() or {}
  66. except UnsupportedMediaType:
  67. return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
  68. except BadRequest:
  69. return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"
  70. if not isinstance(payload, dict):
  71. return None, f"Invalid request payload: expected object, got {type(payload).__name__}"
  72. try:
  73. if extras is not None:
  74. payload.update(extras)
  75. validated_request = validator(**payload)
  76. except ValidationError as e:
  77. return None, format_validation_error_message(e)
  78. parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)
  79. if extras is not None:
  80. for key in list(parsed_payload.keys()):
  81. if key in extras:
  82. del parsed_payload[key]
  83. return parsed_payload, None
  84. def format_validation_error_message(e: ValidationError) -> str:
  85. """
  86. Formats validation errors into a standardized string format.
  87. Processes pydantic ValidationError objects to create human-readable error messages
  88. containing field locations, error descriptions, and input values.
  89. Args:
  90. e (ValidationError): The validation error instance containing error details
  91. Returns:
  92. str: Formatted error messages joined by newlines. Each line contains:
  93. - Field path (dot-separated)
  94. - Error message
  95. - Truncated input value (max 128 chars)
  96. Example:
  97. >>> try:
  98. ... UserModel(name=123, email="invalid")
  99. ... except ValidationError as e:
  100. ... print(format_validation_error_message(e))
  101. Field: <name> - Message: <Input should be a valid string> - Value: <123>
  102. Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
  103. """
  104. error_messages = []
  105. for error in e.errors():
  106. field = ".".join(map(str, error["loc"]))
  107. msg = error["msg"]
  108. input_val = error["input"]
  109. input_str = str(input_val)
  110. if len(input_str) > 128:
  111. input_str = input_str[:125] + "..."
  112. error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
  113. error_messages.append(error_msg)
  114. return "\n".join(error_messages)
class PermissionEnum(StrEnum):
    """Allowed values for the `permission` field of the dataset request models."""

    # Values come from auto(); with strenum's StrEnum that is presumably the
    # member name itself ("me", "team") -- confirm against the strenum package.
    me = auto()
    team = auto()
class ChunkMethodnEnum(StrEnum):
    """Allowed values for the `chunk_method` field (serialized as `parser_id`)."""

    # NOTE(review): the class name looks like a typo of "ChunkMethodEnum"; it is
    # left as-is because other code refers to it by this name.
    # Values come from auto(); presumably each value equals its member name -- confirm.
    naive = auto()
    book = auto()
    email = auto()
    laws = auto()
    manual = auto()
    one = auto()
    paper = auto()
    picture = auto()
    presentation = auto()
    qa = auto()
    table = auto()
    tag = auto()
class GraphragMethodEnum(StrEnum):
    """Allowed values for the `method` field of GraphragConfig."""

    # Values come from auto(); presumably each value equals its member name -- confirm.
    light = auto()
    general = auto()
  134. class Base(BaseModel):
  135. class Config:
  136. extra = "forbid"
class RaptorConfig(Base):
    # Settings object accepted under the `raptor` key of ParserConfig.
    # Unknown keys are rejected (inherited from Base).

    # Feature switch; off unless explicitly enabled.
    use_raptor: bool = Field(default=False)
    # Prompt template: surrounding whitespace is stripped and the result must be
    # non-empty. The default text contains a `{cluster_content}` placeholder.
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    # Integer in [1, 2048]; default 256.
    max_token: int = Field(default=256, ge=1, le=2048)
    # Float in [0.0, 1.0]; default 0.1.
    threshold: float = Field(default=0.1, ge=0.0, le=1.0)
    # Integer in [1, 1024]; default 64.
    max_cluster: int = Field(default=64, ge=1, le=1024)
    # Non-negative integer; default 0.
    random_seed: int = Field(default=0, ge=0)
class GraphragConfig(Base):
    # Settings object accepted under the `graphrag` key of ParserConfig.
    # Unknown keys are rejected (inherited from Base).

    # Feature switch; off unless explicitly enabled.
    use_graphrag: bool = Field(default=False)
    # A fresh list is built per instance via default_factory, so the default
    # is never shared between models.
    entity_types: list[str] = Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])
    # One of the GraphragMethodEnum members; defaults to `light`.
    method: GraphragMethodEnum = Field(default=GraphragMethodEnum.light)
    # Boolean switches, both off by default.
    community: bool = Field(default=False)
    resolution: bool = Field(default=False)
class ParserConfig(Base):
    # Schema of the `parser_config` object on the dataset request models.
    # Unknown keys are rejected (inherited from Base).

    # Integer in [0, 32]; default 0.
    auto_keywords: int = Field(default=0, ge=0, le=32)
    # Integer in [0, 10]; default 0.
    auto_questions: int = Field(default=0, ge=0, le=10)
    # Integer in [1, 2048]; default 128.
    chunk_token_num: int = Field(default=128, ge=1, le=2048)
    # Non-empty string. The default is a raw string, i.e. the two characters
    # backslash + "n", not an actual newline character.
    delimiter: str = Field(default=r"\n", min_length=1)
    # Optional nested settings; absent (None) by default.
    graphrag: GraphragConfig | None = None
    html4excel: bool = False
    # Free-form string, not restricted to a fixed set of values here.
    layout_recognize: str = "DeepDOC"
    # Optional nested settings; absent (None) by default.
    raptor: RaptorConfig | None = None
    # Fresh empty list per instance.
    tag_kb_ids: list[str] = Field(default_factory=list)
    # Integer in [1, 10]; default 1.
    topn_tags: int = Field(default=1, ge=1, le=10)
    # Optional float in [0.0, 1.0].
    filename_embd_weight: float | None = Field(default=None, ge=0.0, le=1.0)
    # Optional integer, at least 1.
    task_page_size: int | None = Field(default=None, ge=1)
    # Optional list of integer lists; presumably page ranges -- confirm with the consumer.
    pages: list[list[int]] | None = None
  170. class CreateDatasetReq(Base):
  171. name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
  172. avatar: str | None = Field(default=None, max_length=65535)
  173. description: str | None = Field(default=None, max_length=65535)
  174. embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
  175. permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
  176. chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
  177. pagerank: int = Field(default=0, ge=0, le=100)
  178. parser_config: ParserConfig = Field(default_factory=dict)
  179. @field_validator("avatar")
  180. @classmethod
  181. def validate_avatar_base64(cls, v: str | None) -> str | None:
  182. """
  183. Validates Base64-encoded avatar string format and MIME type compliance.
  184. Implements a three-stage validation workflow:
  185. 1. MIME prefix existence check
  186. 2. MIME type format validation
  187. 3. Supported type verification
  188. Args:
  189. v (str): Raw avatar field value
  190. Returns:
  191. str: Validated Base64 string
  192. Raises:
  193. ValueError: For structural errors in these cases:
  194. - Missing MIME prefix header
  195. - Invalid MIME prefix format
  196. - Unsupported image MIME type
  197. Example:
  198. ```python
  199. # Valid case
  200. CreateDatasetReq(avatar="...")
  201. # Invalid cases
  202. CreateDatasetReq(avatar="image/jpeg;base64,...") # Missing 'data:' prefix
  203. CreateDatasetReq(avatar="data:video/mp4;base64,...") # Unsupported MIME type
  204. ```
  205. """
  206. if v is None:
  207. return v
  208. if "," in v:
  209. prefix, _ = v.split(",", 1)
  210. if not prefix.startswith("data:"):
  211. raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
  212. mime_type = prefix[5:].split(";")[0]
  213. supported_mime_types = ["image/jpeg", "image/png"]
  214. if mime_type not in supported_mime_types:
  215. raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
  216. return v
  217. else:
  218. raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
  219. @field_validator("embedding_model", mode="after")
  220. @classmethod
  221. def validate_embedding_model(cls, v: str) -> str:
  222. """
  223. Validates embedding model identifier format compliance.
  224. Validation pipeline:
  225. 1. Structural format verification
  226. 2. Component non-empty check
  227. 3. Value normalization
  228. Args:
  229. v (str): Raw model identifier
  230. Returns:
  231. str: Validated <model_name>@<provider> format
  232. Raises:
  233. ValueError: For these violations:
  234. - Missing @ separator
  235. - Empty model_name/provider
  236. - Invalid component structure
  237. Examples:
  238. Valid: "text-embedding-3-large@openai"
  239. Invalid: "invalid_model" (no @)
  240. Invalid: "@openai" (empty model_name)
  241. Invalid: "text-embedding-3-large@" (empty provider)
  242. """
  243. if "@" not in v:
  244. raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
  245. components = v.split("@", 1)
  246. if len(components) != 2 or not all(components):
  247. raise ValueError("Both model_name and provider must be non-empty strings")
  248. model_name, provider = components
  249. if not model_name.strip() or not provider.strip():
  250. raise ValueError("Model name and provider cannot be whitespace-only strings")
  251. return v
  252. @field_validator("permission", mode="before")
  253. @classmethod
  254. def permission_auto_lowercase(cls, v: Any) -> Any:
  255. """
  256. Normalize permission input to lowercase for consistent PermissionEnum matching.
  257. Args:
  258. v (Any): Raw input value for the permission field
  259. Returns:
  260. Lowercase string if input is string type, otherwise returns original value
  261. Behavior:
  262. - Converts string inputs to lowercase (e.g., "ME" → "me")
  263. - Non-string values pass through unchanged
  264. - Works in validation pre-processing stage (before enum conversion)
  265. """
  266. return v.lower() if isinstance(v, str) else v
  267. @field_validator("parser_config", mode="after")
  268. @classmethod
  269. def validate_parser_config_json_length(cls, v: ParserConfig) -> ParserConfig:
  270. """
  271. Validates serialized JSON length constraints for parser configuration.
  272. Implements a two-stage validation workflow:
  273. 1. Model serialization - convert Pydantic model to JSON string
  274. 2. Size verification - enforce maximum allowed payload size
  275. Args:
  276. v (ParserConfig | None): Raw parser configuration object
  277. Returns:
  278. ParserConfig | None: Validated configuration object
  279. Raises:
  280. ValueError: When serialized JSON exceeds 65,535 characters
  281. """
  282. if (json_str := v.model_dump_json()) and len(json_str) > 65535:
  283. raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
  284. return v
class UpdateDatasetReq(CreateDatasetReq):
    # Update variant of CreateDatasetReq: all of its fields and validators are
    # inherited; `dataset_id` is added as a required field.
    dataset_id: UUID1 = Field(...)
    # Overrides the inherited `name` so that it has a default ("") instead of
    # being required; the strip and length constraints are restated unchanged.
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
        """Dump the dataset id as its 32-character hexadecimal string (no dashes)."""
        return v.hex