Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. import logging
  2. import re
  3. import threading
  4. from collections import deque
  5. from dataclasses import dataclass
  6. from typing import Any, Optional, Union
  7. from core.schemas.registry import SchemaRegistry
  8. logger = logging.getLogger(__name__)
  9. # Type aliases for better clarity
  10. SchemaType = Union[dict[str, Any], list[Any], str, int, float, bool, None]
  11. SchemaDict = dict[str, Any]
  12. # Pre-compiled pattern for better performance
  13. _DIFY_SCHEMA_PATTERN = re.compile(r"^https://dify\.ai/schemas/(v\d+)/(.+)\.json$")
  class SchemaResolutionError(Exception):
      """Common base class for every error raised while resolving schemas"""
  class CircularReferenceError(SchemaResolutionError):
      """
      Error describing a reference chain that loops back onto itself
      Attributes:
          ref_uri: The reference that closed the loop
          ref_path: The chain of references that led to ref_uri
      """

      def __init__(self, ref_uri: str, ref_path: list[str]):
          self.ref_uri = ref_uri
          self.ref_path = ref_path
          chain = " -> ".join(ref_path)
          super().__init__(f"Circular reference detected: {ref_uri} in path {chain}")
  class MaxDepthExceededError(SchemaResolutionError):
      """
      Error raised once resolution goes deeper than the configured limit
      Attributes:
          max_depth: The limit that was exceeded
      """

      def __init__(self, max_depth: int):
          message = f"Maximum resolution depth ({max_depth}) exceeded"
          super().__init__(message)
          self.max_depth = max_depth
  class SchemaNotFoundError(SchemaResolutionError):
      """
      Error describing a reference whose target schema is missing
      Attributes:
          ref_uri: The reference that could not be looked up
      """

      def __init__(self, ref_uri: str):
          message = f"Schema not found: {ref_uri}"
          super().__init__(message)
          self.ref_uri = ref_uri
  @dataclass
  class QueueItem:
      """
      One pending node of the breadth-first schema traversal
      Attributes:
          current: The node to inspect
          parent: The container holding the node, or None for the root
          key: The dict key or list index of the node inside parent, or None for the root
          depth: How many levels below the root the node sits
          ref_path: Reference URIs already expanded on the way to this node
      """

      current: Any
      parent: Optional[Any]
      key: Union[str, int, None]
      depth: int
      ref_path: set[str]
  class SchemaResolver:
      """
      Resolver for Dify schema references with caching and optimizations
      The schema is walked breadth-first; every dict whose "$ref" is a Dify schema
      URI is replaced by the schema the registry returns for that URI (with its
      metadata fields removed). Looked-up schemas are memoized in a class-level
      cache keyed by the reference URI.
      """

      # NOTE(review): the cache is shared by all instances and keyed by URI only,
      # so resolvers built on different registries see each other's entries.
      _cache: dict[str, SchemaDict] = {}
      _cache_lock = threading.Lock()

      def __init__(self, registry: Optional[SchemaRegistry] = None, max_depth: int = 10):
          """
          Initialize the schema resolver
          Args:
              registry: Schema registry to use (defaults to default registry)
              max_depth: Maximum depth for reference resolution
          """
          self.registry = registry or SchemaRegistry.default_registry()
          self.max_depth = max_depth

      @classmethod
      def clear_cache(cls) -> None:
          """Clear the global schema cache"""
          with cls._cache_lock:
              cls._cache.clear()

      def resolve(self, schema: SchemaType) -> SchemaType:
          """
          Resolve all Dify $ref references in the schema
          Performance optimization: quickly checks for $ref presence before processing.
          A reference that is already being expanded on the current path is not
          expanded again: the referring dict is tagged with "$circular_ref": True and
          a warning is logged. A reference the registry cannot supply is left
          untouched and a warning is logged.
          Args:
              schema: Schema to resolve
          Returns:
              The input object itself when it is not a dict/list or contains no Dify
              references; otherwise a deep copy with the references expanded
          Raises:
              MaxDepthExceededError: If nesting reaches max_depth
          """
          if not isinstance(schema, (dict, list)):
              return schema

          # Fast path: nothing to resolve, so skip the deepcopy and the traversal
          if not _has_dify_refs(schema):
              return schema

          # Slow path: work on a private copy so the caller's schema is never mutated
          import copy

          result = copy.deepcopy(schema)
          queue = deque([QueueItem(current=result, parent=None, key=None, depth=0, ref_path=set())])
          while queue:
              self._process_queue_item(queue, queue.popleft())
          return result

      def _next_depth(self, item: QueueItem) -> int:
          """Return the depth of a child of item, raising MaxDepthExceededError at the limit"""
          next_depth = item.depth + 1
          if next_depth >= self.max_depth:
              raise MaxDepthExceededError(self.max_depth)
          return next_depth

      def _process_queue_item(self, queue: deque, item: QueueItem) -> None:
          """Process a single queue item; anything that is not a dict or list is ignored"""
          if isinstance(item.current, dict):
              self._process_dict(queue, item)
          elif isinstance(item.current, list):
              self._process_list(queue, item)

      def _process_dict(self, queue: deque, item: QueueItem) -> None:
          """Expand the dict if it is a Dify reference, otherwise enqueue its container values"""
          ref_uri = item.current.get("$ref")
          if ref_uri and _is_dify_schema_ref(ref_uri):
              self._resolve_ref(queue, item, ref_uri)
              return

          for key, value in item.current.items():
              if isinstance(value, (dict, list)):
                  queue.append(
                      QueueItem(
                          current=value,
                          parent=item.current,
                          key=key,
                          depth=self._next_depth(item),
                          ref_path=item.ref_path,
                      )
                  )

      def _process_list(self, queue: deque, item: QueueItem) -> None:
          """Enqueue every dict or list element of the list"""
          for idx, value in enumerate(item.current):
              if isinstance(value, (dict, list)):
                  queue.append(
                      QueueItem(
                          current=value,
                          parent=item.current,
                          key=idx,
                          depth=self._next_depth(item),
                          ref_path=item.ref_path,
                      )
                  )

      def _resolve_ref(self, queue: deque, item: QueueItem, ref_uri: str) -> None:
          """
          Replace the referring dict with the schema ref_uri points to
          The replacement is queued again so references inside it are expanded too.
          """
          # A URI already on this path would expand forever: tag it and stop
          if ref_uri in item.ref_path:
              item.current["$circular_ref"] = True
              logger.warning("Circular reference detected: %s", ref_uri)
              return

          # Private deep copy (from cache or registry); safe to splice in and mutate
          resolved_schema = self._get_resolved_schema(ref_uri)
          if not resolved_schema:
              logger.warning("Schema not found: %s", ref_uri)
              return

          # A fresh set per branch keeps sibling branches independent
          new_ref_path = item.ref_path | {ref_uri}
          next_depth = self._next_depth(item)

          if item.parent is None:
              # The root has no container to update, so rewrite the dict in place
              item.current.clear()
              item.current.update(resolved_schema)
              replacement = item.current
          else:
              item.parent[item.key] = resolved_schema
              replacement = resolved_schema

          queue.append(
              QueueItem(
                  current=replacement,
                  parent=item.parent,
                  key=item.key,
                  depth=next_depth,
                  ref_path=new_ref_path,
              )
          )

      def _get_resolved_schema(self, ref_uri: str) -> Optional[SchemaDict]:
          """
          Get the schema for ref_uri from the cache, falling back to the registry
          Returns:
              A deep copy the caller may freely mutate, or None when the registry
              returns nothing (or an empty schema) for ref_uri
          """
          import copy

          with self._cache_lock:
              cached = self._cache.get(ref_uri)
          if cached is not None:
              # Deep copy: the traversal rewrites nested containers in place, and a
              # shallow copy would let those edits leak back into the shared cache
              return copy.deepcopy(cached)

          schema = self.registry.get_schema(ref_uri)
          if not schema:
              return None

          # Detach the cached entry from the registry's own object as well
          cleaned = copy.deepcopy(_remove_metadata_fields(schema))
          with self._cache_lock:
              self._cache[ref_uri] = cleaned
          return copy.deepcopy(cleaned)
  def resolve_dify_schema_refs(
      schema: SchemaType, registry: Optional[SchemaRegistry] = None, max_depth: int = 30
  ) -> SchemaType:
      """
      Expand the Dify $ref references of a schema into the referenced content
      Convenience wrapper: builds a SchemaResolver and delegates to its resolve().
      The resolver (and therefore the default registry) is only created when the
      schema actually contains a Dify reference.
      Args:
          schema: Schema object that may contain $ref references
          registry: Optional schema registry, defaults to default registry
          max_depth: Depth limit handed to the resolver (default: 30)
      Returns:
          The schema itself when it holds no Dify references, otherwise whatever
          SchemaResolver.resolve returns for it
      Raises:
          MaxDepthExceededError: If maximum depth exceeded
      """
      if _has_dify_refs(schema):
          # Slow path: at least one Dify reference needs expanding
          return SchemaResolver(registry, max_depth).resolve(schema)
      # Fast path: nothing to expand, hand the input back untouched
      return schema
  198. def _remove_metadata_fields(schema: dict) -> dict:
  199. """
  200. Remove metadata fields from schema that shouldn't be included in resolved output
  201. Args:
  202. schema: Schema dictionary
  203. Returns:
  204. Cleaned schema without metadata fields
  205. """
  206. # Create a copy and remove metadata fields
  207. cleaned = schema.copy()
  208. metadata_fields = ["$id", "$schema", "version"]
  209. for field in metadata_fields:
  210. cleaned.pop(field, None)
  211. return cleaned
  def _is_dify_schema_ref(ref_uri: Any) -> bool:
      """
      Tell whether a value is a Dify schema reference URI
      Args:
          ref_uri: Candidate value; anything that is not a string is rejected
      Returns:
          True if the value is a string matching the Dify schema URI pattern
      """
      # The pattern is compiled once at module level
      return isinstance(ref_uri, str) and _DIFY_SCHEMA_PATTERN.match(ref_uri) is not None
  def _has_dify_refs_recursive(schema: SchemaType) -> bool:
      """
      Check every nested dict and list of a schema for a Dify $ref reference
      This is the fallback method when string-based detection is not possible.
      The walk uses an explicit stack and remembers the containers it has already
      visited, so self-referential or very deeply nested schemas terminate
      instead of exhausting the interpreter's recursion limit.
      Args:
          schema: Schema to check for references
      Returns:
          True if any Dify $ref is found, False otherwise
      """
      pending: list[Any] = [schema]
      # Containers are tracked by identity; all of them stay alive through the root
      visited: set[int] = set()

      while pending:
          node = pending.pop()
          if isinstance(node, dict):
              children = node.values()
          elif isinstance(node, list):
              children = node
          else:
              # Primitive types don't contain refs
              continue

          if id(node) in visited:
              continue
          visited.add(id(node))

          if isinstance(node, dict):
              ref_uri = node.get("$ref")
              if ref_uri and _is_dify_schema_ref(ref_uri):
                  return True

          pending.extend(child for child in children if isinstance(child, (dict, list)))

      return False
  def _has_dify_refs_hybrid(schema: SchemaType) -> bool:
      """
      Two-phase detection of Dify $ref references
      1. Serialize the schema to JSON and look for the "$ref" key and the Dify
         schema URL prefix; if either is missing the answer is False
      2. Otherwise (or when serialization fails) run the precise structural check
      Args:
          schema: Schema to check for references
      Returns:
          True if any Dify $ref is found, False otherwise
      """
      import json

      try:
          serialized = json.dumps(schema, separators=(",", ":"))
      except (TypeError, ValueError, OverflowError):
          # Not serializable (e.g. circular references or unsupported objects):
          # skip the string pre-filter and rely on the structural check alone
          logger.debug("JSON serialization failed for schema, using recursive detection")
      else:
          # Quick elimination: both markers must be present for a Dify ref to exist
          if '"$ref"' not in serialized:
              return False
          if "https://dify.ai/schemas/" not in serialized:
              return False

      # The substrings may appear in unrelated places, so confirm structurally
      return _has_dify_refs_recursive(schema)
  def _has_dify_refs(schema: SchemaType) -> bool:
      """
      Report whether a schema holds at least one Dify $ref reference
      Single entry point for reference detection; the work is delegated to the
      hybrid strategy (string pre-filter followed by a structural check).
      Args:
          schema: Schema to check for references
      Returns:
          True if any Dify $ref is found, False otherwise
      """
      found = _has_dify_refs_hybrid(schema)
      return found
  def parse_dify_schema_uri(uri: str) -> tuple[str, str]:
      """
      Split a Dify schema URI into its version and schema name
      Args:
          uri: Schema URI to parse
      Returns:
          Tuple of (version, schema_name) or ("", "") if invalid
      """
      parsed = _DIFY_SCHEMA_PATTERN.match(uri)
      return parsed.group(1, 2) if parsed else ("", "")
  301. return match.group(1), match.group(2)