You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. from enum import auto
  17. from typing import Annotated, Any
  18. from flask import Request
  19. from pydantic import BaseModel, Field, StringConstraints, ValidationError, field_validator
  20. from strenum import StrEnum
  21. from werkzeug.exceptions import BadRequest, UnsupportedMediaType
  22. from api.constants import DATASET_NAME_LIMIT
  def validate_and_parse_json_request(request: Request, validator: type[BaseModel]) -> tuple[dict[str, Any] | None, str | None]:
      """Validate and parse a JSON request body through a multi-stage pipeline.

      Stages:
          1. Content-Type verification (must be application/json)
          2. JSON syntax validation
          3. Payload structure type checking (the top-level value must be an object)
          4. Pydantic model validation with error formatting

      Failures are never raised to the caller: the exceptions produced by the
      stages above (UnsupportedMediaType, BadRequest, ValidationError) are caught
      here and reported through the second element of the returned tuple.

      Args:
          request (Request): Flask request object containing the HTTP payload.
          validator (type[BaseModel]): Pydantic model class the payload is validated against.

      Returns:
          tuple[dict[str, Any] | None, str | None]:
              - First element:
                  - Validated dictionary (dumped with ``by_alias=True``) on success
                  - None on validation failure
              - Second element:
                  - None on success
                  - Diagnostic error message on failure

      Examples:
          Successful validation:
          ```python
          # Input: {"name": "Dataset1", "format": "csv"}
          # Returns: ({"name": "Dataset1", "format": "csv"}, None)
          ```
          Invalid Content-Type:
          ```python
          # Returns: (None, "Unsupported content type: Expected application/json, got text/xml")
          ```
          Malformed JSON:
          ```python
          # Returns: (None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding")
          ```
          Non-object payload:
          ```python
          # Input: []
          # Returns: (None, "Invalid request payload: expected object, got list")
          ```
      """
      try:
          payload = request.get_json()
      except UnsupportedMediaType:
          return None, f"Unsupported content type: Expected application/json, got {request.content_type}"
      except BadRequest:
          return None, "Malformed JSON syntax: Missing commas/brackets or invalid encoding"

      # Only a missing/null payload falls back to an empty object. Using
      # `payload or {}` here would also swallow other falsy JSON values
      # ([], 0, "", false) and let them bypass the object-type check below.
      if payload is None:
          payload = {}

      if not isinstance(payload, dict):
          return None, f"Invalid request payload: expected object, got {type(payload).__name__}"

      try:
          validated_request = validator(**payload)
      except ValidationError as e:
          return None, format_validation_error_message(e)

      # by_alias=True so serialization aliases declared on the model are used as keys.
      return validated_request.model_dump(by_alias=True), None
  def format_validation_error_message(e: ValidationError) -> str:
      """Render a pydantic ValidationError as one human-readable line per error.

      Each entry of ``e.errors()`` is turned into a line holding the field
      location, the error description and the offending input value.

      Args:
          e (ValidationError): The validation error instance containing error details

      Returns:
          str: Formatted error messages joined by newlines. Each line contains:
              - Field path (dot-separated)
              - Error message
              - Input value, truncated to at most 128 characters (ending in "...")

      Example:
          >>> try:
          ...     UserModel(name=123, email="invalid")
          ... except ValidationError as e:
          ...     print(format_validation_error_message(e))
          Field: <name> - Message: <Input should be a valid string> - Value: <123>
          Field: <email> - Message: <value is not a valid email address> - Value: <invalid>
      """
      value_limit = 128
      suffix = "..."

      def _render(detail) -> str:
          # Location parts may be ints (list indices), hence the str() conversion.
          location = ".".join(str(part) for part in detail["loc"])
          message = detail["msg"]
          shown = str(detail["input"])
          if len(shown) > value_limit:
              # Keep the rendered value at exactly `value_limit` characters, suffix included.
              shown = shown[: value_limit - len(suffix)] + suffix
          return f"Field: <{location}> - Message: <{message}> - Value: <{shown}>"

      return "\n".join(_render(detail) for detail in e.errors())
  103. class PermissionEnum(StrEnum):
  104. me = auto()
  105. team = auto()
  106. class ChunkMethodnEnum(StrEnum):
  107. naive = auto()
  108. book = auto()
  109. email = auto()
  110. laws = auto()
  111. manual = auto()
  112. one = auto()
  113. paper = auto()
  114. picture = auto()
  115. presentation = auto()
  116. qa = auto()
  117. table = auto()
  118. tag = auto()
  119. class GraphragMethodEnum(StrEnum):
  120. light = auto()
  121. general = auto()
  class Base(BaseModel):
      """Common base class for the request models defined in this module.

      Configuration shared by every subclass:
          - ``extra="forbid"``: unknown fields in the input are rejected.
          - ``json_schema_extra``: adds ``charset`` / ``collation`` entries to the
            generated JSON schema.
      """

      # Pydantic v2 configuration. The nested ``class Config`` form is the v1
      # style and is deprecated in v2; ``model_config`` carries the same settings.
      # A plain dict is used (ConfigDict is a TypedDict), so no extra import is needed.
      model_config = {
          "extra": "forbid",
          "json_schema_extra": {"charset": "utf8mb4", "collation": "utf8mb4_0900_ai_ci"},
      }
  class RaptorConfig(Base):
      """Options accepted under ``ParserConfig.raptor``.

      Every field is optional and has a default; numeric fields are range-checked.
      """

      use_raptor: bool = False
      # Surrounding whitespace is stripped; the result must not be empty.
      prompt: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1)] = Field(
          default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n {cluster_content}\nThe above is the content you need to summarize."
      )
      max_token: Annotated[int, Field(ge=1, le=2048)] = 256
      threshold: Annotated[float, Field(ge=0.0, le=1.0)] = 0.1
      max_cluster: Annotated[int, Field(ge=1, le=1024)] = 64
      random_seed: Annotated[int, Field(ge=0)] = 0
  class GraphragConfig(Base):
      """Options accepted under ``ParserConfig.graphrag``.

      Every field is optional and has a default.
      """

      use_graphrag: bool = False
      # A factory is used so each instance gets its own list.
      entity_types: list[str] = Field(
          default_factory=lambda: ["organization", "person", "geo", "event", "category"],
      )
      method: GraphragMethodEnum = GraphragMethodEnum.light
      community: bool = False
      resolution: bool = False
  class ParserConfig(Base):
      """Parser options accepted in the ``parser_config`` field of CreateDatasetReq.

      Every field is optional; numeric fields are range-checked.
      """

      auto_keywords: Annotated[int, Field(ge=0, le=32)] = 0
      auto_questions: Annotated[int, Field(ge=0, le=10)] = 0
      chunk_token_num: Annotated[int, Field(ge=1, le=2048)] = 128
      # The default is the two-character sequence backslash + "n" (raw string), not a newline.
      delimiter: Annotated[str, Field(min_length=1)] = r"\n"
      graphrag: GraphragConfig | None = None
      html4excel: bool = False
      layout_recognize: str = "DeepDOC"
      raptor: RaptorConfig | None = None
      tag_kb_ids: list[str] = Field(default_factory=list)
      topn_tags: Annotated[int, Field(ge=1, le=10)] = 1
      filename_embd_weight: Annotated[float | None, Field(ge=0.0, le=1.0)] = None
      task_page_size: Annotated[int | None, Field(ge=1)] = None
      pages: list[list[int]] | None = None
  class CreateDatasetReq(Base):
      """Request schema for creating a dataset.

      Only ``name`` is required. ``embedding_model`` and ``chunk_method`` are
      serialized under the aliases ``embd_id`` and ``parser_id`` when the model
      is dumped with ``by_alias=True``.
      """

      name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
      avatar: str | None = Field(default=None, max_length=65535)
      description: str | None = Field(default=None, max_length=65535)
      embedding_model: Annotated[str | None, StringConstraints(strip_whitespace=True, max_length=255), Field(default=None, serialization_alias="embd_id")]
      permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
      chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
      pagerank: int = Field(default=0, ge=0, le=100)
      parser_config: ParserConfig | None = Field(default=None)

      @field_validator("avatar")
      @classmethod
      def validate_avatar_base64(cls, v: str | None) -> str | None:
          """Validate the data-URI prefix and MIME type of the avatar string.

          Validation steps:
              1. MIME prefix existence check (a "," must separate prefix and data)
              2. MIME prefix format validation (must start with "data:")
              3. Supported type verification (image/jpeg or image/png)

          The Base64 payload after the comma is not decoded or inspected.

          Args:
              v (str | None): Raw avatar field value

          Returns:
              str | None: The unchanged value; None passes through untouched

          Raises:
              ValueError: For structural errors in these cases:
                  - Missing MIME prefix header
                  - Invalid MIME prefix format
                  - Unsupported image MIME type

          Example:
              ```python
              # Valid case
              CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")
              # Invalid cases
              CreateDatasetReq(avatar="image/jpeg;base64,...")  # Missing 'data:' prefix
              CreateDatasetReq(avatar="data:video/mp4;base64,...")  # Unsupported MIME type
              ```
          """
          if v is None:
              return v

          prefix, separator, _ = v.partition(",")
          if not separator:
              raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")
          if not prefix.startswith("data:"):
              raise ValueError("Invalid MIME prefix format. Must start with 'data:'")

          # Drop the "data:" scheme, then any ";base64"-style parameters.
          mime_type = prefix[len("data:"):].split(";")[0]
          supported_mime_types = ["image/jpeg", "image/png"]
          if mime_type not in supported_mime_types:
              raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")
          return v

      @field_validator("embedding_model", mode="after")
      @classmethod
      def validate_embedding_model(cls, v: str | None) -> str | None:
          """Validate that the embedding model identifier is ``<model_name>@<provider>``.

          Validation steps:
              1. Structural format verification (an "@" separator is present)
              2. Component non-empty check
              3. Component whitespace-only check

          The value is returned unchanged; only the first "@" is treated as the
          separator, so the provider part may itself contain "@".

          Args:
              v (str | None): Model identifier, or None when explicitly passed as null

          Returns:
              str | None: The validated identifier; None passes through untouched

          Raises:
              ValueError: For these violations:
                  - Missing @ separator
                  - Empty model_name/provider
                  - Whitespace-only model_name/provider

          Examples:
              Valid: "text-embedding-3-large@openai"
              Invalid: "invalid_model" (no @)
              Invalid: "@openai" (empty model_name)
              Invalid: "text-embedding-3-large@" (empty provider)
          """
          # The field is `str | None`: an explicit null reaches this validator.
          # Without this guard the `in` test below raises TypeError, which
          # pydantic does not turn into a ValidationError.
          if v is None:
              return v

          if "@" not in v:
              raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")

          model_name, _, provider = v.partition("@")
          if not model_name or not provider:
              raise ValueError("Both model_name and provider must be non-empty strings")
          if not model_name.strip() or not provider.strip():
              raise ValueError("Model name and provider cannot be whitespace-only strings")
          return v

      @field_validator("permission", mode="before")
      @classmethod
      def permission_auto_lowercase(cls, v: Any) -> Any:
          """Normalize permission input to lowercase for consistent PermissionEnum matching.

          Args:
              v (Any): Raw input value for the permission field

          Returns:
              Lowercase string if input is string type, otherwise returns original value

          Behavior:
              - Converts string inputs to lowercase (e.g., "ME" → "me")
              - Non-string values pass through unchanged
              - Works in validation pre-processing stage (before enum conversion)
          """
          return v.lower() if isinstance(v, str) else v

      @field_validator("parser_config", mode="after")
      @classmethod
      def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
          """Enforce a maximum serialized JSON length on the parser configuration.

          Validation steps:
              1. Null check - bypass validation for empty configurations
              2. Model serialization - convert Pydantic model to JSON string
              3. Size verification - enforce maximum allowed payload size

          Args:
              v (ParserConfig | None): Already-validated parser configuration object

          Returns:
              ParserConfig | None: The unchanged configuration object

          Raises:
              ValueError: When serialized JSON exceeds 65,535 characters
          """
          if v is None:
              return v

          json_length = len(v.model_dump_json())
          if json_length > 65535:
              raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {json_length:,}")
          return v