You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

file_lifecycle.py 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. """ClickZetta Volume文件生命周期管理
  2. 该模块提供文件版本控制、自动清理、备份和恢复等生命周期管理功能。
  3. 支持知识库文件的完整生命周期管理。
  4. """
  5. import json
  6. import logging
  7. from dataclasses import asdict, dataclass
  8. from datetime import datetime, timedelta
  9. from enum import Enum
  10. from typing import Any, Optional
  11. logger = logging.getLogger(__name__)
  12. class FileStatus(Enum):
  13. """文件状态枚举"""
  14. ACTIVE = "active" # 活跃状态
  15. ARCHIVED = "archived" # 已归档
  16. DELETED = "deleted" # 已删除(软删除)
  17. BACKUP = "backup" # 备份文件
  18. @dataclass
  19. class FileMetadata:
  20. """文件元数据"""
  21. filename: str
  22. size: int | None
  23. created_at: datetime
  24. modified_at: datetime
  25. version: int | None
  26. status: FileStatus
  27. checksum: Optional[str] = None
  28. tags: Optional[dict[str, str]] = None
  29. parent_version: Optional[int] = None
  30. def to_dict(self) -> dict:
  31. """转换为字典格式"""
  32. data = asdict(self)
  33. data["created_at"] = self.created_at.isoformat()
  34. data["modified_at"] = self.modified_at.isoformat()
  35. data["status"] = self.status.value
  36. return data
  37. @classmethod
  38. def from_dict(cls, data: dict) -> "FileMetadata":
  39. """从字典创建实例"""
  40. data = data.copy()
  41. data["created_at"] = datetime.fromisoformat(data["created_at"])
  42. data["modified_at"] = datetime.fromisoformat(data["modified_at"])
  43. data["status"] = FileStatus(data["status"])
  44. return cls(**data)
  45. class FileLifecycleManager:
  46. """文件生命周期管理器"""
  47. def __init__(self, storage, dataset_id: Optional[str] = None):
  48. """初始化生命周期管理器
  49. Args:
  50. storage: ClickZetta Volume存储实例
  51. dataset_id: 数据集ID(用于Table Volume)
  52. """
  53. self._storage = storage
  54. self._dataset_id = dataset_id
  55. self._metadata_file = ".dify_file_metadata.json"
  56. self._version_prefix = ".versions/"
  57. self._backup_prefix = ".backups/"
  58. self._deleted_prefix = ".deleted/"
  59. # 获取权限管理器(如果存在)
  60. self._permission_manager: Optional[Any] = getattr(storage, "_permission_manager", None)
  61. def save_with_lifecycle(self, filename: str, data: bytes, tags: Optional[dict[str, str]] = None) -> FileMetadata:
  62. """保存文件并管理生命周期
  63. Args:
  64. filename: 文件名
  65. data: 文件内容
  66. tags: 文件标签
  67. Returns:
  68. 文件元数据
  69. """
  70. # 权限检查
  71. if not self._check_permission(filename, "save"):
  72. from .volume_permissions import VolumePermissionError
  73. raise VolumePermissionError(
  74. f"Permission denied for lifecycle save operation on file: {filename}",
  75. operation="save",
  76. volume_type=getattr(self._storage, "_config", {}).get("volume_type", "unknown"),
  77. dataset_id=self._dataset_id,
  78. )
  79. try:
  80. # 1. 检查是否存在旧版本
  81. metadata_dict = self._load_metadata()
  82. current_metadata = metadata_dict.get(filename)
  83. # 2. 如果存在旧版本,创建版本备份
  84. if current_metadata:
  85. self._create_version_backup(filename, current_metadata)
  86. # 3. 计算文件信息
  87. now = datetime.now()
  88. checksum = self._calculate_checksum(data)
  89. new_version = (current_metadata["version"] + 1) if current_metadata else 1
  90. # 4. 保存新文件
  91. self._storage.save(filename, data)
  92. # 5. 创建元数据
  93. created_at = now
  94. parent_version = None
  95. if current_metadata:
  96. # 如果created_at是字符串,转换为datetime
  97. if isinstance(current_metadata["created_at"], str):
  98. created_at = datetime.fromisoformat(current_metadata["created_at"])
  99. else:
  100. created_at = current_metadata["created_at"]
  101. parent_version = current_metadata["version"]
  102. file_metadata = FileMetadata(
  103. filename=filename,
  104. size=len(data),
  105. created_at=created_at,
  106. modified_at=now,
  107. version=new_version,
  108. status=FileStatus.ACTIVE,
  109. checksum=checksum,
  110. tags=tags or {},
  111. parent_version=parent_version,
  112. )
  113. # 6. 更新元数据
  114. metadata_dict[filename] = file_metadata.to_dict()
  115. self._save_metadata(metadata_dict)
  116. logger.info("File %s saved with lifecycle management, version %s", filename, new_version)
  117. return file_metadata
  118. except Exception as e:
  119. logger.exception("Failed to save file with lifecycle")
  120. raise
  121. def get_file_metadata(self, filename: str) -> Optional[FileMetadata]:
  122. """获取文件元数据
  123. Args:
  124. filename: 文件名
  125. Returns:
  126. 文件元数据,如果不存在返回None
  127. """
  128. try:
  129. metadata_dict = self._load_metadata()
  130. if filename in metadata_dict:
  131. return FileMetadata.from_dict(metadata_dict[filename])
  132. return None
  133. except Exception as e:
  134. logger.exception("Failed to get file metadata for %s", filename)
  135. return None
  136. def list_file_versions(self, filename: str) -> list[FileMetadata]:
  137. """列出文件的所有版本
  138. Args:
  139. filename: 文件名
  140. Returns:
  141. 文件版本列表,按版本号排序
  142. """
  143. try:
  144. versions = []
  145. # 获取当前版本
  146. current_metadata = self.get_file_metadata(filename)
  147. if current_metadata:
  148. versions.append(current_metadata)
  149. # 获取历史版本
  150. version_pattern = f"{self._version_prefix}{filename}.v*"
  151. try:
  152. version_files = self._storage.scan(self._dataset_id or "", files=True)
  153. for file_path in version_files:
  154. if file_path.startswith(f"{self._version_prefix}{filename}.v"):
  155. # 解析版本号
  156. version_str = file_path.split(".v")[-1].split(".")[0]
  157. try:
  158. version_num = int(version_str)
  159. # 这里简化处理,实际应该从版本文件中读取元数据
  160. # 暂时创建基本的元数据信息
  161. except ValueError:
  162. continue
  163. except:
  164. # 如果无法扫描版本文件,只返回当前版本
  165. pass
  166. return sorted(versions, key=lambda x: x.version or 0, reverse=True)
  167. except Exception as e:
  168. logger.exception("Failed to list file versions for %s", filename)
  169. return []
  170. def restore_version(self, filename: str, version: int) -> bool:
  171. """恢复文件到指定版本
  172. Args:
  173. filename: 文件名
  174. version: 要恢复的版本号
  175. Returns:
  176. 恢复是否成功
  177. """
  178. try:
  179. version_filename = f"{self._version_prefix}{filename}.v{version}"
  180. # 检查版本文件是否存在
  181. if not self._storage.exists(version_filename):
  182. logger.warning("Version %s of %s not found", version, filename)
  183. return False
  184. # 读取版本文件内容
  185. version_data = self._storage.load_once(version_filename)
  186. # 保存当前版本为备份
  187. current_metadata = self.get_file_metadata(filename)
  188. if current_metadata:
  189. self._create_version_backup(filename, current_metadata.to_dict())
  190. # 恢复文件
  191. self.save_with_lifecycle(filename, version_data, {"restored_from": str(version)})
  192. return True
  193. except Exception as e:
  194. logger.exception("Failed to restore %s to version %s", filename, version)
  195. return False
  196. def archive_file(self, filename: str) -> bool:
  197. """归档文件
  198. Args:
  199. filename: 文件名
  200. Returns:
  201. 归档是否成功
  202. """
  203. # 权限检查
  204. if not self._check_permission(filename, "archive"):
  205. logger.warning("Permission denied for archive operation on file: %s", filename)
  206. return False
  207. try:
  208. # 更新文件状态为归档
  209. metadata_dict = self._load_metadata()
  210. if filename not in metadata_dict:
  211. logger.warning("File %s not found in metadata", filename)
  212. return False
  213. metadata_dict[filename]["status"] = FileStatus.ARCHIVED.value
  214. metadata_dict[filename]["modified_at"] = datetime.now().isoformat()
  215. self._save_metadata(metadata_dict)
  216. logger.info("File %s archived successfully", filename)
  217. return True
  218. except Exception as e:
  219. logger.exception("Failed to archive file %s", filename)
  220. return False
  221. def soft_delete_file(self, filename: str) -> bool:
  222. """软删除文件(移动到删除目录)
  223. Args:
  224. filename: 文件名
  225. Returns:
  226. 删除是否成功
  227. """
  228. # 权限检查
  229. if not self._check_permission(filename, "delete"):
  230. logger.warning("Permission denied for soft delete operation on file: %s", filename)
  231. return False
  232. try:
  233. # 检查文件是否存在
  234. if not self._storage.exists(filename):
  235. logger.warning("File %s not found", filename)
  236. return False
  237. # 读取文件内容
  238. file_data = self._storage.load_once(filename)
  239. # 移动到删除目录
  240. deleted_filename = f"{self._deleted_prefix}{filename}.{datetime.now().strftime('%Y%m%d_%H%M%S')}"
  241. self._storage.save(deleted_filename, file_data)
  242. # 删除原文件
  243. self._storage.delete(filename)
  244. # 更新元数据
  245. metadata_dict = self._load_metadata()
  246. if filename in metadata_dict:
  247. metadata_dict[filename]["status"] = FileStatus.DELETED.value
  248. metadata_dict[filename]["modified_at"] = datetime.now().isoformat()
  249. self._save_metadata(metadata_dict)
  250. logger.info("File %s soft deleted successfully", filename)
  251. return True
  252. except Exception as e:
  253. logger.exception("Failed to soft delete file %s", filename)
  254. return False
  255. def cleanup_old_versions(self, max_versions: int = 5, max_age_days: int = 30) -> int:
  256. """清理旧版本文件
  257. Args:
  258. max_versions: 保留的最大版本数
  259. max_age_days: 版本文件的最大保留天数
  260. Returns:
  261. 清理的文件数量
  262. """
  263. try:
  264. cleaned_count = 0
  265. cutoff_date = datetime.now() - timedelta(days=max_age_days)
  266. # 获取所有版本文件
  267. try:
  268. all_files = self._storage.scan(self._dataset_id or "", files=True)
  269. version_files = [f for f in all_files if f.startswith(self._version_prefix)]
  270. # 按文件分组
  271. file_versions: dict[str, list[tuple[int, str]]] = {}
  272. for version_file in version_files:
  273. # 解析文件名和版本
  274. parts = version_file[len(self._version_prefix) :].split(".v")
  275. if len(parts) >= 2:
  276. base_filename = parts[0]
  277. version_part = parts[1].split(".")[0]
  278. try:
  279. version_num = int(version_part)
  280. if base_filename not in file_versions:
  281. file_versions[base_filename] = []
  282. file_versions[base_filename].append((version_num, version_file))
  283. except ValueError:
  284. continue
  285. # 清理每个文件的旧版本
  286. for base_filename, versions in file_versions.items():
  287. # 按版本号排序
  288. versions.sort(key=lambda x: x[0], reverse=True)
  289. # 保留最新的max_versions个版本,删除其余的
  290. if len(versions) > max_versions:
  291. to_delete = versions[max_versions:]
  292. for version_num, version_file in to_delete:
  293. self._storage.delete(version_file)
  294. cleaned_count += 1
  295. logger.debug("Cleaned old version: %s", version_file)
  296. logger.info("Cleaned %d old version files", cleaned_count)
  297. except Exception as e:
  298. logger.warning("Could not scan for version files: %s", e)
  299. return cleaned_count
  300. except Exception as e:
  301. logger.exception("Failed to cleanup old versions")
  302. return 0
  303. def get_storage_statistics(self) -> dict[str, Any]:
  304. """获取存储统计信息
  305. Returns:
  306. 存储统计字典
  307. """
  308. try:
  309. metadata_dict = self._load_metadata()
  310. stats: dict[str, Any] = {
  311. "total_files": len(metadata_dict),
  312. "active_files": 0,
  313. "archived_files": 0,
  314. "deleted_files": 0,
  315. "total_size": 0,
  316. "versions_count": 0,
  317. "oldest_file": None,
  318. "newest_file": None,
  319. }
  320. oldest_date = None
  321. newest_date = None
  322. for filename, metadata in metadata_dict.items():
  323. file_meta = FileMetadata.from_dict(metadata)
  324. # 统计文件状态
  325. if file_meta.status == FileStatus.ACTIVE:
  326. stats["active_files"] = (stats["active_files"] or 0) + 1
  327. elif file_meta.status == FileStatus.ARCHIVED:
  328. stats["archived_files"] = (stats["archived_files"] or 0) + 1
  329. elif file_meta.status == FileStatus.DELETED:
  330. stats["deleted_files"] = (stats["deleted_files"] or 0) + 1
  331. # 统计大小
  332. stats["total_size"] = (stats["total_size"] or 0) + (file_meta.size or 0)
  333. # 统计版本
  334. stats["versions_count"] = (stats["versions_count"] or 0) + (file_meta.version or 0)
  335. # 找出最新和最旧的文件
  336. if oldest_date is None or file_meta.created_at < oldest_date:
  337. oldest_date = file_meta.created_at
  338. stats["oldest_file"] = filename
  339. if newest_date is None or file_meta.modified_at > newest_date:
  340. newest_date = file_meta.modified_at
  341. stats["newest_file"] = filename
  342. return stats
  343. except Exception as e:
  344. logger.exception("Failed to get storage statistics")
  345. return {}
  346. def _create_version_backup(self, filename: str, metadata: dict):
  347. """创建版本备份"""
  348. try:
  349. # 读取当前文件内容
  350. current_data = self._storage.load_once(filename)
  351. # 保存为版本文件
  352. version_filename = f"{self._version_prefix}{filename}.v{metadata['version']}"
  353. self._storage.save(version_filename, current_data)
  354. logger.debug("Created version backup: %s", version_filename)
  355. except Exception as e:
  356. logger.warning("Failed to create version backup for %s: %s", filename, e)
  357. def _load_metadata(self) -> dict[str, Any]:
  358. """加载元数据文件"""
  359. try:
  360. if self._storage.exists(self._metadata_file):
  361. metadata_content = self._storage.load_once(self._metadata_file)
  362. result = json.loads(metadata_content.decode("utf-8"))
  363. return dict(result) if result else {}
  364. else:
  365. return {}
  366. except Exception as e:
  367. logger.warning("Failed to load metadata: %s", e)
  368. return {}
  369. def _save_metadata(self, metadata_dict: dict):
  370. """保存元数据文件"""
  371. try:
  372. metadata_content = json.dumps(metadata_dict, indent=2, ensure_ascii=False)
  373. self._storage.save(self._metadata_file, metadata_content.encode("utf-8"))
  374. logger.debug("Metadata saved successfully")
  375. except Exception as e:
  376. logger.exception("Failed to save metadata")
  377. raise
  378. def _calculate_checksum(self, data: bytes) -> str:
  379. """计算文件校验和"""
  380. import hashlib
  381. return hashlib.md5(data).hexdigest()
  382. def _check_permission(self, filename: str, operation: str) -> bool:
  383. """检查文件操作权限
  384. Args:
  385. filename: 文件名
  386. operation: 操作类型
  387. Returns:
  388. True if permission granted, False otherwise
  389. """
  390. # 如果没有权限管理器,默认允许
  391. if not self._permission_manager:
  392. return True
  393. try:
  394. # 根据操作类型映射到权限
  395. operation_mapping = {
  396. "save": "save",
  397. "load": "load_once",
  398. "delete": "delete",
  399. "archive": "delete", # 归档需要删除权限
  400. "restore": "save", # 恢复需要写权限
  401. "cleanup": "delete", # 清理需要删除权限
  402. "read": "load_once",
  403. "write": "save",
  404. }
  405. mapped_operation = operation_mapping.get(operation, operation)
  406. # 检查权限
  407. result = self._permission_manager.validate_operation(mapped_operation, self._dataset_id)
  408. return bool(result)
  409. except Exception as e:
  410. logger.exception("Permission check failed for %s operation %s", filename, operation)
  411. # 安全默认:权限检查失败时拒绝访问
  412. return False