You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

opendal_storage.py 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. import logging
  2. import os
  3. from collections.abc import Generator
  4. from pathlib import Path
  5. from dotenv import dotenv_values
  6. from opendal import Operator
  7. from opendal.layers import RetryLayer
  8. from extensions.storage.base_storage import BaseStorage
  9. logger = logging.getLogger(__name__)
  10. def _get_opendal_kwargs(*, scheme: str, env_file_path: str = ".env", prefix: str = "OPENDAL_"):
  11. kwargs = {}
  12. config_prefix = prefix + scheme.upper() + "_"
  13. for key, value in os.environ.items():
  14. if key.startswith(config_prefix):
  15. kwargs[key[len(config_prefix) :].lower()] = value
  16. file_env_vars: dict = dotenv_values(env_file_path) or {}
  17. for key, value in file_env_vars.items():
  18. if key.startswith(config_prefix) and key[len(config_prefix) :].lower() not in kwargs and value:
  19. kwargs[key[len(config_prefix) :].lower()] = value
  20. return kwargs
  21. class OpenDALStorage(BaseStorage):
  22. def __init__(self, scheme: str, **kwargs):
  23. kwargs = kwargs or _get_opendal_kwargs(scheme=scheme)
  24. if scheme == "fs":
  25. root = kwargs.get("root", "storage")
  26. Path(root).mkdir(parents=True, exist_ok=True)
  27. retry_layer = RetryLayer(max_times=3, factor=2.0, jitter=True)
  28. self.op = Operator(scheme=scheme, **kwargs).layer(retry_layer)
  29. logger.debug("opendal operator created with scheme %s", scheme)
  30. logger.debug("added retry layer to opendal operator")
  31. def save(self, filename: str, data: bytes):
  32. self.op.write(path=filename, bs=data)
  33. logger.debug("file %s saved", filename)
  34. def load_once(self, filename: str) -> bytes:
  35. if not self.exists(filename):
  36. raise FileNotFoundError("File not found")
  37. content: bytes = self.op.read(path=filename)
  38. logger.debug("file %s loaded", filename)
  39. return content
  40. def load_stream(self, filename: str) -> Generator:
  41. if not self.exists(filename):
  42. raise FileNotFoundError("File not found")
  43. batch_size = 4096
  44. with self.op.open(
  45. path=filename,
  46. mode="rb",
  47. chunck=batch_size,
  48. ) as file:
  49. while chunk := file.read(batch_size):
  50. yield chunk
  51. logger.debug("file %s loaded as stream", filename)
  52. def download(self, filename: str, target_filepath: str):
  53. if not self.exists(filename):
  54. raise FileNotFoundError("File not found")
  55. Path(target_filepath).write_bytes(self.op.read(path=filename))
  56. logger.debug("file %s downloaded to %s", filename, target_filepath)
  57. def exists(self, filename: str) -> bool:
  58. return self.op.exists(path=filename)
  59. def delete(self, filename: str):
  60. if self.exists(filename):
  61. self.op.delete(path=filename)
  62. logger.debug("file %s deleted", filename)
  63. return
  64. logger.debug("file %s not found, skip delete", filename)
  65. def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]:
  66. if not self.exists(path):
  67. raise FileNotFoundError("Path not found")
  68. all_files = self.op.list(path=path)
  69. if files and directories:
  70. logger.debug("files and directories on %s scanned", path)
  71. return [f.path for f in all_files]
  72. if files:
  73. logger.debug("files on %s scanned", path)
  74. return [f.path for f in all_files if not f.path.endswith("/")]
  75. elif directories:
  76. logger.debug("directories on %s scanned", path)
  77. return [f.path for f in all_files if f.path.endswith("/")]
  78. else:
  79. raise ValueError("At least one of files or directories must be True")