Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

batch_clean_document_task.py 3.2KB

10 месяцев назад
2 месяца назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
2 месяца назад
10 месяцев назад
7 месяцев назад
10 месяцев назад
10 месяцев назад
2 месяца назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
10 месяцев назад
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. import logging
  2. import time
  3. from typing import Optional
  4. import click
  5. from celery import shared_task
  6. from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
  7. from core.tools.utils.web_reader_tool import get_image_upload_file_ids
  8. from extensions.ext_database import db
  9. from extensions.ext_storage import storage
  10. from models.dataset import Dataset, DocumentSegment
  11. from models.model import UploadFile
  12. logger = logging.getLogger(__name__)
  13. @shared_task(queue="dataset")
  14. def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form: Optional[str], file_ids: list[str]):
  15. """
  16. Clean document when document deleted.
  17. :param document_ids: document ids
  18. :param dataset_id: dataset id
  19. :param doc_form: doc_form
  20. :param file_ids: file ids
  21. Usage: batch_clean_document_task.delay(document_ids, dataset_id)
  22. """
  23. logger.info(click.style("Start batch clean documents when documents deleted", fg="green"))
  24. start_at = time.perf_counter()
  25. try:
  26. if not doc_form:
  27. raise ValueError("doc_form is required")
  28. dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
  29. if not dataset:
  30. raise Exception("Document has no dataset")
  31. segments = db.session.query(DocumentSegment).where(DocumentSegment.document_id.in_(document_ids)).all()
  32. # check segment is exist
  33. if segments:
  34. index_node_ids = [segment.index_node_id for segment in segments]
  35. index_processor = IndexProcessorFactory(doc_form).init_index_processor()
  36. index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
  37. for segment in segments:
  38. image_upload_file_ids = get_image_upload_file_ids(segment.content)
  39. for upload_file_id in image_upload_file_ids:
  40. image_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
  41. try:
  42. if image_file and image_file.key:
  43. storage.delete(image_file.key)
  44. except Exception:
  45. logger.exception(
  46. "Delete image_files failed when storage deleted, \
  47. image_upload_file_is: %s",
  48. upload_file_id,
  49. )
  50. db.session.delete(image_file)
  51. db.session.delete(segment)
  52. db.session.commit()
  53. if file_ids:
  54. files = db.session.query(UploadFile).where(UploadFile.id.in_(file_ids)).all()
  55. for file in files:
  56. try:
  57. storage.delete(file.key)
  58. except Exception:
  59. logger.exception("Delete file failed when document deleted, file_id: %s", file.id)
  60. db.session.delete(file)
  61. db.session.commit()
  62. end_at = time.perf_counter()
  63. logger.info(
  64. click.style(
  65. f"Cleaned documents when documents deleted latency: {end_at - start_at}",
  66. fg="green",
  67. )
  68. )
  69. except Exception:
  70. logger.exception("Cleaned documents when documents deleted failed")
  71. finally:
  72. db.session.close()