Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

validation_utils.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import uuid
  17. from enum import auto
  18. from typing import Annotated, Any
  19. from flask import Request
  20. from pydantic import UUID1, BaseModel, Field, StringConstraints, ValidationError, field_serializer, field_validator
  21. from strenum import StrEnum
  22. from werkzeug.exceptions import BadRequest, UnsupportedMediaType
  23. from api.constants import DATASET_NAME_LIMIT
  24. def validate_and_parse_json_request(request: Request, validator: type[BaseModel], *, extras: dict[str, Any] | None = None, exclude_unset: bool = False) -> tuple[dict[str, Any] | None, str | None]:
  25. """
  26. Validates and parses JSON requests through a multi-stage validation pipeline.
  27. Implements a four-stage validation process:
  28. 1. Content-Type verification (must be application/json)
  29. 2. JSON syntax validation
  30. 3. Payload structure type checking
  31. 4. Pydantic model validation with error formatting
  32. Args:
  33. request (Request): Flask request object containing HTTP payload
  34. validator (type[BaseModel]): Pydantic model class for data validation
  35. extras (dict[str, Any] | None): Additional fields to merge into payload
  36. before validation. These fields will be removed from the final output
  37. exclude_unset (bool): Whether to exclude fields that have not been explicitly set
  38. Returns:
  39. tuple[Dict[str, Any] | None, str | None]:
  40. - First element:
  41. - Validated dictionary on success
  42. - None on validation failure
  43. - Second element:
  44. - None on success
  45. - Diagnostic error message on failure
  46. Raises:
  47. UnsupportedMediaType: When Content-Type header is not application/json
  48. BadRequest: For structural JSON syntax errors
  49. ValidationError: When payload violates Pydantic schema rules
  50. Examples:
  51. >>> validate_and_parse_json_request(valid_request, DatasetSchema)
  52. ({"name": "Dataset1", "format": "csv"}, None)
  53. >>> validate_and_parse_json_request(xml_request, DatasetSchema)
  54. (None, "Unsupported content type: Expected application/json, got text/xml")
  55. >>> validate_and_parse_json_request(bad_json_request, DatasetSchema)
  56. (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")
  57. Notes:
  58. 1. Validation Priority:
  59. - Content-Type verification precedes JSON parsing
  60. - Structural validation occurs before schema validation
  61. 2. Extra fields added via `extras` parameter are automatically removed
  62. from the final output after validation
  63. """
  64. try:
  65. payload = request.get_json() or {}
  66. except UnsupportedMediaType:
  67. return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
  68. except BadRequest:
  69. return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"
  70. if not isinstance(payload, dict):
  71. return None, f"Invalid request payload: expected object, got {type(payload).__name__}"
  72. try:
  73. if extras is not None:
  74. payload.update(extras)
  75. validated_request = validator(**payload)
  76. except ValidationError as e:
  77. return None, format_validation_error_message(e)
  78. parsed_payload = validated_request.model_dump(by_alias=True, exclude_unset=exclude_unset)
  79. if extras is not None:
  80. for key in list(parsed_payload.keys()):
  81. if key in extras:
  82. del parsed_payload[key]
  83. return parsed_payload, None
  84. def format_validation_error_message(e: ValidationError) -> str:
  85. """
  86. Formats validation errors into a standardized string format.
  87. Processes pydantic ValidationError objects to create human-readable error messages
  88. containing field locations, error descriptions, and input values.
  89. Args:
  90. e (ValidationError): The validation error instance containing error details
  91. Returns:
  92. str: Formatted error messages joined by newlines. Each line contains:
  93. - Field path (dot-separated)
  94. - Error message
  95. - Truncated input value (max 128 chars)
  96. Example:
  97. >>> try:
  98. ... UserModel(name=123, email="invalid")
  99. ... except ValidationError as e:
  100. ... print(format_validation_error_message(e))
  101. Field: <name> - Message: <Input should be a valid string> - Value: <123>
  102. Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
  103. """
  104. error_messages = []
  105. for error in e.errors():
  106. field = ".".join(map(str, error["loc"]))
  107. msg = error["msg"]
  108. input_val = error["input"]
  109. input_str = str(input_val)
  110. if len(input_str) > 128:
  111. input_str = input_str[:125] + "..."
  112. error_msg = f"Field: <{field}> - Message: <{msg}> - Value: <{input_str}>"
  113. error_messages.append(error_msg)
  114. return "\n".join(error_messages)
  115. class PermissionEnum(StrEnum):
  116. me = auto()
  117. team = auto()
  118. class ChunkMethodnEnum(StrEnum):
  119. naive = auto()
  120. book = auto()
  121. email = auto()
  122. laws = auto()
  123. manual = auto()
  124. one = auto()
  125. paper = auto()
  126. picture = auto()
  127. presentation = auto()
  128. qa = auto()
  129. table = auto()
  130. tag = auto()
  131. class GraphragMethodEnum(StrEnum):
  132. light = auto()
  133. general = auto()
  134. class Base(BaseModel):
  135. class Config:
  136. extra = "forbid"
class RaptorConfig(Base):
    """RAPTOR summarization settings; inactive unless ``use_raptor`` is True."""

    # Master switch for the RAPTOR pipeline.
    use_raptor: bool = Field(default=False)
    # Summarization prompt; the default text contains a "{cluster_content}"
    # placeholder for the paragraphs to summarize. Whitespace is stripped and
    # the value must be non-empty.
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    # Max tokens per summary (1..2048).
    max_token: int = Field(default=256, ge=1, le=2048)
    # Clustering threshold (0.0..1.0) — NOTE(review): exact semantics defined
    # by the raptor pipeline, not visible here.
    threshold: float = Field(default=0.1, ge=0.0, le=1.0)
    # Upper bound on cluster count (1..1024).
    max_cluster: int = Field(default=64, ge=1, le=1024)
    # Non-negative seed for reproducible clustering.
    random_seed: int = Field(default=0, ge=0)
class GraphragConfig(Base):
    """GraphRAG knowledge-graph settings; inactive unless ``use_graphrag`` is True."""

    # Master switch for graph extraction.
    use_graphrag: bool = Field(default=False)
    # Entity categories to extract.
    entity_types: list[str] = Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])
    # Extraction mode ("light" or "general").
    method: GraphragMethodEnum = Field(default=GraphragMethodEnum.light)
    # NOTE(review): presumably toggles for community detection and entity
    # resolution — confirm against the graphrag pipeline.
    community: bool = Field(default=False)
    resolution: bool = Field(default=False)
class ParserConfig(Base):
    """Document parsing/chunking options carried in a dataset's parser config."""

    # Auto-extracted keyword count (0 disables; max 32).
    auto_keywords: int = Field(default=0, ge=0, le=32)
    # Auto-generated question count (0 disables; max 10).
    auto_questions: int = Field(default=0, ge=0, le=10)
    # Target tokens per chunk (1..2048).
    chunk_token_num: int = Field(default=128, ge=1, le=2048)
    # Chunk delimiter; defaults to the two-character sequence backslash-n.
    delimiter: str = Field(default=r"\n", min_length=1)
    # Optional nested GraphRAG settings.
    graphrag: GraphragConfig | None = None
    # Whether Excel files are rendered as HTML.
    html4excel: bool = False
    # Layout-recognition backend name.
    layout_recognize: str = "DeepDOC"
    # Optional nested RAPTOR settings.
    raptor: RaptorConfig | None = None
    # Knowledge-base IDs used as tag sources.
    tag_kb_ids: list[str] = Field(default_factory=list)
    # Number of top tags to keep (1..10).
    topn_tags: int = Field(default=1, ge=1, le=10)
    # Weight of the filename in embeddings (0.0..1.0), if set.
    filename_embd_weight: float | None = Field(default=None, ge=0.0, le=1.0)
    # Pages processed per task, if set (>= 1).
    task_page_size: int | None = Field(default=None, ge=1)
    # Optional page ranges — NOTE(review): presumably [start, end] pairs;
    # confirm against the parsing code.
    pages: list[list[int]] | None = None
class CreateDatasetReq(Base):
    """Request schema for creating a dataset.

    Serialization aliases map API field names to storage names
    (``embedding_model`` -> ``embd_id``, ``chunk_method`` -> ``parser_id``).
    """

    # Required, non-empty after whitespace stripping, length-capped.
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
    # Optional Base64 data-URI avatar (see validate_avatar_base64).
    avatar: str | None = Field(default=None, max_length=65535)
    description: str | None = Field(default=None, max_length=65535)
    # "<model_name>@<provider>" identifier (see validate_embedding_model).
    embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
    # Lowercased before enum conversion (see permission_auto_lowercase).
    permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
    chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
    pagerank: int = Field(default=0, ge=0, le=100)
    parser_config: ParserConfig | None = Field(default=None)

    @field_validator("avatar")
    @classmethod
    def validate_avatar_base64(cls, v: str | None) -> str | None:
        """
        Validate a Base64 data-URI avatar's prefix and MIME type.

        Three checks, in order:
            1. A ``data:`` MIME prefix must precede the first comma.
            2. The prefix must start with ``data:``.
            3. The MIME type must be image/jpeg or image/png.

        Args:
            v (str | None): Raw avatar value; ``None`` passes through.

        Returns:
            str | None: The input unchanged when valid.

        Raises:
            ValueError: On a missing prefix, a prefix not starting with
                ``data:``, or an unsupported MIME type.

        Example:
            ```python
            CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")  # valid
            CreateDatasetReq(avatar="image/jpeg;base64,...")   # missing 'data:' prefix
            CreateDatasetReq(avatar="data:video/mp4;base64,...")  # unsupported MIME type
            ```
        """
        if v is None:
            return v
        if "," in v:
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
                raise ValueError("Invalid MIME prefix format. Must start with 'data:'")
            # Strip "data:" then drop parameters such as ";base64".
            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
                raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
            return v
        else:
            raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="after")
    @classmethod
    def validate_embedding_model(cls, v: str) -> str:
        """
        Validate the ``<model_name>@<provider>`` embedding-model identifier.

        Args:
            v (str): Raw model identifier.

        Returns:
            str: The input unchanged when valid.

        Raises:
            ValueError: When the ``@`` separator is missing, or either side
                of it is empty or whitespace-only.

        Examples:
            Valid:   "text-embedding-3-large@openai"
            Invalid: "invalid_model"            (no @)
            Invalid: "@openai"                  (empty model_name)
            Invalid: "text-embedding-3-large@"  (empty provider)
        """
        if "@" not in v:
            raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")
        components = v.split("@", 1)
        if len(components) != 2 or not all(components):
            raise ValueError("Both model_name and provider must be non-empty strings")
        model_name, provider = components
        if not model_name.strip() or not provider.strip():
            raise ValueError("Model name and provider cannot be whitespace-only strings")
        return v

    @field_validator("permission", mode="before")
    @classmethod
    def permission_auto_lowercase(cls, v: Any) -> Any:
        """
        Lowercase string permission input before enum conversion.

        Runs in pre-processing (mode="before") so values like "ME" match
        PermissionEnum.me. Non-string values pass through unchanged.
        """
        return v.lower() if isinstance(v, str) else v

    @field_validator("parser_config", mode="before")
    @classmethod
    def normalize_empty_parser_config(cls, v: Any) -> Any:
        """
        Convert an empty parser-config dict to None before model parsing.

        Keeps "no config" representations consistent: ``{}`` and ``None``
        both end up as ``None``. Any other value passes through unchanged.

        Example:
            >>> normalize_empty_parser_config({})
            None
            >>> normalize_empty_parser_config({"key": "value"})
            {"key": "value"}
        """
        if v == {}:
            return None
        return v

    @field_validator("parser_config", mode="after")
    @classmethod
    def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
        """
        Enforce a size cap on the serialized parser configuration.

        Three steps: skip when the config is None, serialize the model to a
        JSON string, then reject payloads longer than 65,535 characters.

        Args:
            v (ParserConfig | None): Parsed parser configuration.

        Returns:
            ParserConfig | None: The input unchanged when within the limit.

        Raises:
            ValueError: When the serialized JSON exceeds 65,535 characters.
        """
        if v is None:
            return None
        if (json_str := v.model_dump_json()) and len(json_str) > 65535:
            raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
        return v
class UpdateDatasetReq(CreateDatasetReq):
    """Request schema for updating a dataset.

    Inherits all creation fields, adds the target ``dataset_id``, and relaxes
    ``name`` to default to "" so an update may omit it.
    """

    # Target dataset identifier; must be a version-1 UUID.
    dataset_id: UUID1 = Field(...)
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
        # Emit the 32-character hex form (no hyphens) when dumping the model.
        return v.hex