您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel, Field
  5. class ChildDocument(BaseModel):
  6. """Class for storing a piece of text and associated metadata."""
  7. page_content: str
  8. vector: Optional[list[float]] = None
  9. """Arbitrary metadata about the page content (e.g., source, relationships to other
  10. documents, etc.).
  11. """
  12. metadata: dict = {}
  13. class Document(BaseModel):
  14. """Class for storing a piece of text and associated metadata."""
  15. page_content: str
  16. vector: Optional[list[float]] = None
  17. """Arbitrary metadata about the page content (e.g., source, relationships to other
  18. documents, etc.).
  19. """
  20. metadata: dict = {}
  21. provider: Optional[str] = "dify"
  22. children: Optional[list[ChildDocument]] = None
  23. class BaseDocumentTransformer(ABC):
  24. """Abstract base class for document transformation systems.
  25. A document transformation system takes a sequence of Documents and returns a
  26. sequence of transformed Documents.
  27. Example:
  28. .. code-block:: python
  29. class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
  30. model_config = ConfigDict(arbitrary_types_allowed=True)
  31. embeddings: Embeddings
  32. similarity_fn: Callable = cosine_similarity
  33. similarity_threshold: float = 0.95
  34. def transform_documents(
  35. self, documents: Sequence[Document], **kwargs: Any
  36. ) -> Sequence[Document]:
  37. stateful_documents = get_stateful_documents(documents)
  38. embedded_documents = _get_embeddings_from_stateful_docs(
  39. self.embeddings, stateful_documents
  40. )
  41. included_idxs = _filter_similar_embeddings(
  42. embedded_documents, self.similarity_fn, self.similarity_threshold
  43. )
  44. return [stateful_documents[i] for i in sorted(included_idxs)]
  45. async def atransform_documents(
  46. self, documents: Sequence[Document], **kwargs: Any
  47. ) -> Sequence[Document]:
  48. raise NotImplementedError
  49. """
  50. @abstractmethod
  51. def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  52. """Transform a list of documents.
  53. Args:
  54. documents: A sequence of Documents to be transformed.
  55. Returns:
  56. A list of transformed Documents.
  57. """
  58. @abstractmethod
  59. async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  60. """Asynchronously transform a list of documents.
  61. Args:
  62. documents: A sequence of Documents to be transformed.
  63. Returns:
  64. A list of transformed Documents.
  65. """