You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

file_lifecycle.py 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. """ClickZetta Volume file lifecycle management
  2. This module provides file lifecycle management features including version control, automatic cleanup, backup and restore.
  3. Supports complete lifecycle management for knowledge base files.
  4. """
  5. import json
  6. import logging
  7. from dataclasses import asdict, dataclass
  8. from datetime import datetime
  9. from enum import Enum
  10. from typing import Any, Optional
  11. logger = logging.getLogger(__name__)
  12. class FileStatus(Enum):
  13. """File status enumeration"""
  14. ACTIVE = "active" # Active status
  15. ARCHIVED = "archived" # Archived
  16. DELETED = "deleted" # Deleted (soft delete)
  17. BACKUP = "backup" # Backup file
  18. @dataclass
  19. class FileMetadata:
  20. """File metadata"""
  21. filename: str
  22. size: int | None
  23. created_at: datetime
  24. modified_at: datetime
  25. version: int | None
  26. status: FileStatus
  27. checksum: Optional[str] = None
  28. tags: Optional[dict[str, str]] = None
  29. parent_version: Optional[int] = None
  30. def to_dict(self) -> dict:
  31. """Convert to dictionary format"""
  32. data = asdict(self)
  33. data["created_at"] = self.created_at.isoformat()
  34. data["modified_at"] = self.modified_at.isoformat()
  35. data["status"] = self.status.value
  36. return data
  37. @classmethod
  38. def from_dict(cls, data: dict) -> "FileMetadata":
  39. """Create instance from dictionary"""
  40. data = data.copy()
  41. data["created_at"] = datetime.fromisoformat(data["created_at"])
  42. data["modified_at"] = datetime.fromisoformat(data["modified_at"])
  43. data["status"] = FileStatus(data["status"])
  44. return cls(**data)
  45. class FileLifecycleManager:
  46. """File lifecycle manager"""
  47. def __init__(self, storage, dataset_id: Optional[str] = None):
  48. """Initialize lifecycle manager
  49. Args:
  50. storage: ClickZetta Volume storage instance
  51. dataset_id: Dataset ID (for Table Volume)
  52. """
  53. self._storage = storage
  54. self._dataset_id = dataset_id
  55. self._metadata_file = ".dify_file_metadata.json"
  56. self._version_prefix = ".versions/"
  57. self._backup_prefix = ".backups/"
  58. self._deleted_prefix = ".deleted/"
  59. # Get permission manager (if exists)
  60. self._permission_manager: Optional[Any] = getattr(storage, "_permission_manager", None)
  61. def save_with_lifecycle(self, filename: str, data: bytes, tags: Optional[dict[str, str]] = None) -> FileMetadata:
  62. """Save file and manage lifecycle
  63. Args:
  64. filename: File name
  65. data: File content
  66. tags: File tags
  67. Returns:
  68. File metadata
  69. """
  70. # Permission check
  71. if not self._check_permission(filename, "save"):
  72. from .volume_permissions import VolumePermissionError
  73. raise VolumePermissionError(
  74. f"Permission denied for lifecycle save operation on file: {filename}",
  75. operation="save",
  76. volume_type=getattr(self._storage, "_config", {}).get("volume_type", "unknown"),
  77. dataset_id=self._dataset_id,
  78. )
  79. try:
  80. # 1. Check if old version exists
  81. metadata_dict = self._load_metadata()
  82. current_metadata = metadata_dict.get(filename)
  83. # 2. If old version exists, create version backup
  84. if current_metadata:
  85. self._create_version_backup(filename, current_metadata)
  86. # 3. Calculate file information
  87. now = datetime.now()
  88. checksum = self._calculate_checksum(data)
  89. new_version = (current_metadata["version"] + 1) if current_metadata else 1
  90. # 4. Save new file
  91. self._storage.save(filename, data)
  92. # 5. Create metadata
  93. created_at = now
  94. parent_version = None
  95. if current_metadata:
  96. # If created_at is string, convert to datetime
  97. if isinstance(current_metadata["created_at"], str):
  98. created_at = datetime.fromisoformat(current_metadata["created_at"])
  99. else:
  100. created_at = current_metadata["created_at"]
  101. parent_version = current_metadata["version"]
  102. file_metadata = FileMetadata(
  103. filename=filename,
  104. size=len(data),
  105. created_at=created_at,
  106. modified_at=now,
  107. version=new_version,
  108. status=FileStatus.ACTIVE,
  109. checksum=checksum,
  110. tags=tags or {},
  111. parent_version=parent_version,
  112. )
  113. # 6. Update metadata
  114. metadata_dict[filename] = file_metadata.to_dict()
  115. self._save_metadata(metadata_dict)
  116. logger.info("File %s saved with lifecycle management, version %s", filename, new_version)
  117. return file_metadata
  118. except Exception as e:
  119. logger.exception("Failed to save file with lifecycle")
  120. raise
  121. def get_file_metadata(self, filename: str) -> Optional[FileMetadata]:
  122. """Get file metadata
  123. Args:
  124. filename: File name
  125. Returns:
  126. File metadata, returns None if not exists
  127. """
  128. try:
  129. metadata_dict = self._load_metadata()
  130. if filename in metadata_dict:
  131. return FileMetadata.from_dict(metadata_dict[filename])
  132. return None
  133. except Exception as e:
  134. logger.exception("Failed to get file metadata for %s", filename)
  135. return None
  136. def list_file_versions(self, filename: str) -> list[FileMetadata]:
  137. """List all versions of a file
  138. Args:
  139. filename: File name
  140. Returns:
  141. File version list, sorted by version number
  142. """
  143. try:
  144. versions = []
  145. # Get current version
  146. current_metadata = self.get_file_metadata(filename)
  147. if current_metadata:
  148. versions.append(current_metadata)
  149. # Get historical versions
  150. try:
  151. version_files = self._storage.scan(self._dataset_id or "", files=True)
  152. for file_path in version_files:
  153. if file_path.startswith(f"{self._version_prefix}{filename}.v"):
  154. # Parse version number
  155. version_str = file_path.split(".v")[-1].split(".")[0]
  156. try:
  157. version_num = int(version_str)
  158. # Simplified processing here, should actually read metadata from version file
  159. # Temporarily create basic metadata information
  160. except ValueError:
  161. continue
  162. except:
  163. # If cannot scan version files, only return current version
  164. pass
  165. return sorted(versions, key=lambda x: x.version or 0, reverse=True)
  166. except Exception as e:
  167. logger.exception("Failed to list file versions for %s", filename)
  168. return []
  169. def restore_version(self, filename: str, version: int) -> bool:
  170. """Restore file to specified version
  171. Args:
  172. filename: File name
  173. version: Version number to restore
  174. Returns:
  175. Whether restore succeeded
  176. """
  177. try:
  178. version_filename = f"{self._version_prefix}{filename}.v{version}"
  179. # Check if version file exists
  180. if not self._storage.exists(version_filename):
  181. logger.warning("Version %s of %s not found", version, filename)
  182. return False
  183. # Read version file content
  184. version_data = self._storage.load_once(version_filename)
  185. # Save current version as backup
  186. current_metadata = self.get_file_metadata(filename)
  187. if current_metadata:
  188. self._create_version_backup(filename, current_metadata.to_dict())
  189. # Restore file
  190. self.save_with_lifecycle(filename, version_data, {"restored_from": str(version)})
  191. return True
  192. except Exception as e:
  193. logger.exception("Failed to restore %s to version %s", filename, version)
  194. return False
  195. def archive_file(self, filename: str) -> bool:
  196. """Archive file
  197. Args:
  198. filename: File name
  199. Returns:
  200. Whether archive succeeded
  201. """
  202. # Permission check
  203. if not self._check_permission(filename, "archive"):
  204. logger.warning("Permission denied for archive operation on file: %s", filename)
  205. return False
  206. try:
  207. # Update file status to archived
  208. metadata_dict = self._load_metadata()
  209. if filename not in metadata_dict:
  210. logger.warning("File %s not found in metadata", filename)
  211. return False
  212. metadata_dict[filename]["status"] = FileStatus.ARCHIVED.value
  213. metadata_dict[filename]["modified_at"] = datetime.now().isoformat()
  214. self._save_metadata(metadata_dict)
  215. logger.info("File %s archived successfully", filename)
  216. return True
  217. except Exception as e:
  218. logger.exception("Failed to archive file %s", filename)
  219. return False
  220. def soft_delete_file(self, filename: str) -> bool:
  221. """Soft delete file (move to deleted directory)
  222. Args:
  223. filename: File name
  224. Returns:
  225. Whether delete succeeded
  226. """
  227. # Permission check
  228. if not self._check_permission(filename, "delete"):
  229. logger.warning("Permission denied for soft delete operation on file: %s", filename)
  230. return False
  231. try:
  232. # Check if file exists
  233. if not self._storage.exists(filename):
  234. logger.warning("File %s not found", filename)
  235. return False
  236. # Read file content
  237. file_data = self._storage.load_once(filename)
  238. # Move to deleted directory
  239. deleted_filename = f"{self._deleted_prefix}{filename}.{datetime.now().strftime('%Y%m%d_%H%M%S')}"
  240. self._storage.save(deleted_filename, file_data)
  241. # Delete original file
  242. self._storage.delete(filename)
  243. # Update metadata
  244. metadata_dict = self._load_metadata()
  245. if filename in metadata_dict:
  246. metadata_dict[filename]["status"] = FileStatus.DELETED.value
  247. metadata_dict[filename]["modified_at"] = datetime.now().isoformat()
  248. self._save_metadata(metadata_dict)
  249. logger.info("File %s soft deleted successfully", filename)
  250. return True
  251. except Exception as e:
  252. logger.exception("Failed to soft delete file %s", filename)
  253. return False
  254. def cleanup_old_versions(self, max_versions: int = 5, max_age_days: int = 30) -> int:
  255. """Cleanup old version files
  256. Args:
  257. max_versions: Maximum number of versions to keep
  258. max_age_days: Maximum retention days for version files
  259. Returns:
  260. Number of files cleaned
  261. """
  262. try:
  263. cleaned_count = 0
  264. # Get all version files
  265. try:
  266. all_files = self._storage.scan(self._dataset_id or "", files=True)
  267. version_files = [f for f in all_files if f.startswith(self._version_prefix)]
  268. # Group by file
  269. file_versions: dict[str, list[tuple[int, str]]] = {}
  270. for version_file in version_files:
  271. # Parse filename and version
  272. parts = version_file[len(self._version_prefix) :].split(".v")
  273. if len(parts) >= 2:
  274. base_filename = parts[0]
  275. version_part = parts[1].split(".")[0]
  276. try:
  277. version_num = int(version_part)
  278. if base_filename not in file_versions:
  279. file_versions[base_filename] = []
  280. file_versions[base_filename].append((version_num, version_file))
  281. except ValueError:
  282. continue
  283. # Cleanup old versions for each file
  284. for base_filename, versions in file_versions.items():
  285. # Sort by version number
  286. versions.sort(key=lambda x: x[0], reverse=True)
  287. # Keep the newest max_versions versions, delete the rest
  288. if len(versions) > max_versions:
  289. to_delete = versions[max_versions:]
  290. for version_num, version_file in to_delete:
  291. self._storage.delete(version_file)
  292. cleaned_count += 1
  293. logger.debug("Cleaned old version: %s", version_file)
  294. logger.info("Cleaned %d old version files", cleaned_count)
  295. except Exception as e:
  296. logger.warning("Could not scan for version files: %s", e)
  297. return cleaned_count
  298. except Exception as e:
  299. logger.exception("Failed to cleanup old versions")
  300. return 0
  301. def get_storage_statistics(self) -> dict[str, Any]:
  302. """Get storage statistics
  303. Returns:
  304. Storage statistics dictionary
  305. """
  306. try:
  307. metadata_dict = self._load_metadata()
  308. stats: dict[str, Any] = {
  309. "total_files": len(metadata_dict),
  310. "active_files": 0,
  311. "archived_files": 0,
  312. "deleted_files": 0,
  313. "total_size": 0,
  314. "versions_count": 0,
  315. "oldest_file": None,
  316. "newest_file": None,
  317. }
  318. oldest_date = None
  319. newest_date = None
  320. for filename, metadata in metadata_dict.items():
  321. file_meta = FileMetadata.from_dict(metadata)
  322. # Count file status
  323. if file_meta.status == FileStatus.ACTIVE:
  324. stats["active_files"] = (stats["active_files"] or 0) + 1
  325. elif file_meta.status == FileStatus.ARCHIVED:
  326. stats["archived_files"] = (stats["archived_files"] or 0) + 1
  327. elif file_meta.status == FileStatus.DELETED:
  328. stats["deleted_files"] = (stats["deleted_files"] or 0) + 1
  329. # Count size
  330. stats["total_size"] = (stats["total_size"] or 0) + (file_meta.size or 0)
  331. # Count versions
  332. stats["versions_count"] = (stats["versions_count"] or 0) + (file_meta.version or 0)
  333. # Find newest and oldest files
  334. if oldest_date is None or file_meta.created_at < oldest_date:
  335. oldest_date = file_meta.created_at
  336. stats["oldest_file"] = filename
  337. if newest_date is None or file_meta.modified_at > newest_date:
  338. newest_date = file_meta.modified_at
  339. stats["newest_file"] = filename
  340. return stats
  341. except Exception as e:
  342. logger.exception("Failed to get storage statistics")
  343. return {}
  344. def _create_version_backup(self, filename: str, metadata: dict):
  345. """Create version backup"""
  346. try:
  347. # Read current file content
  348. current_data = self._storage.load_once(filename)
  349. # Save as version file
  350. version_filename = f"{self._version_prefix}{filename}.v{metadata['version']}"
  351. self._storage.save(version_filename, current_data)
  352. logger.debug("Created version backup: %s", version_filename)
  353. except Exception as e:
  354. logger.warning("Failed to create version backup for %s: %s", filename, e)
  355. def _load_metadata(self) -> dict[str, Any]:
  356. """Load metadata file"""
  357. try:
  358. if self._storage.exists(self._metadata_file):
  359. metadata_content = self._storage.load_once(self._metadata_file)
  360. result = json.loads(metadata_content.decode("utf-8"))
  361. return dict(result) if result else {}
  362. else:
  363. return {}
  364. except Exception as e:
  365. logger.warning("Failed to load metadata: %s", e)
  366. return {}
  367. def _save_metadata(self, metadata_dict: dict):
  368. """Save metadata file"""
  369. try:
  370. metadata_content = json.dumps(metadata_dict, indent=2, ensure_ascii=False)
  371. self._storage.save(self._metadata_file, metadata_content.encode("utf-8"))
  372. logger.debug("Metadata saved successfully")
  373. except Exception as e:
  374. logger.exception("Failed to save metadata")
  375. raise
  376. def _calculate_checksum(self, data: bytes) -> str:
  377. """Calculate file checksum"""
  378. import hashlib
  379. return hashlib.md5(data).hexdigest()
  380. def _check_permission(self, filename: str, operation: str) -> bool:
  381. """Check file operation permission
  382. Args:
  383. filename: File name
  384. operation: Operation type
  385. Returns:
  386. True if permission granted, False otherwise
  387. """
  388. # If no permission manager, allow by default
  389. if not self._permission_manager:
  390. return True
  391. try:
  392. # Map operation type to permission
  393. operation_mapping = {
  394. "save": "save",
  395. "load": "load_once",
  396. "delete": "delete",
  397. "archive": "delete", # Archive requires delete permission
  398. "restore": "save", # Restore requires write permission
  399. "cleanup": "delete", # Cleanup requires delete permission
  400. "read": "load_once",
  401. "write": "save",
  402. }
  403. mapped_operation = operation_mapping.get(operation, operation)
  404. # Check permission
  405. result = self._permission_manager.validate_operation(mapped_operation, self._dataset_id)
  406. return bool(result)
  407. except Exception as e:
  408. logger.exception("Permission check failed for %s operation %s", filename, operation)
  409. # Safe default: deny access when permission check fails
  410. return False