| 
                        12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 | 
from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel, Field
 - 
 - 
 - class ChildDocument(BaseModel):
 -     """Class for storing a piece of text and associated metadata."""
 - 
 -     page_content: str
 - 
 -     vector: Optional[list[float]] = None
 - 
 -     """Arbitrary metadata about the page content (e.g., source, relationships to other
 -         documents, etc.).
 -     """
 -     metadata: Optional[dict] = Field(default_factory=dict)
 - 
 - 
 - class Document(BaseModel):
 -     """Class for storing a piece of text and associated metadata."""
 - 
 -     page_content: str
 - 
 -     vector: Optional[list[float]] = None
 - 
 -     """Arbitrary metadata about the page content (e.g., source, relationships to other
 -         documents, etc.).
 -     """
 -     metadata: Optional[dict] = Field(default_factory=dict)
 - 
 -     provider: Optional[str] = "dify"
 - 
 -     children: Optional[list[ChildDocument]] = None
 - 
 - 
 - class BaseDocumentTransformer(ABC):
 -     """Abstract base class for document transformation systems.
 - 
 -     A document transformation system takes a sequence of Documents and returns a
 -     sequence of transformed Documents.
 - 
 -     Example:
 -         .. code-block:: python
 - 
 -             class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
 -                 embeddings: Embeddings
 -                 similarity_fn: Callable = cosine_similarity
 -                 similarity_threshold: float = 0.95
 - 
 -                 class Config:
 -                     arbitrary_types_allowed = True
 - 
 -                 def transform_documents(
 -                     self, documents: Sequence[Document], **kwargs: Any
 -                 ) -> Sequence[Document]:
 -                     stateful_documents = get_stateful_documents(documents)
 -                     embedded_documents = _get_embeddings_from_stateful_docs(
 -                         self.embeddings, stateful_documents
 -                     )
 -                     included_idxs = _filter_similar_embeddings(
 -                         embedded_documents, self.similarity_fn, self.similarity_threshold
 -                     )
 -                     return [stateful_documents[i] for i in sorted(included_idxs)]
 - 
 -                 async def atransform_documents(
 -                     self, documents: Sequence[Document], **kwargs: Any
 -                 ) -> Sequence[Document]:
 -                     raise NotImplementedError
 - 
 -     """
 - 
 -     @abstractmethod
 -     def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
 -         """Transform a list of documents.
 - 
 -         Args:
 -             documents: A sequence of Documents to be transformed.
 - 
 -         Returns:
 -             A list of transformed Documents.
 -         """
 - 
 -     @abstractmethod
 -     async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
 -         """Asynchronously transform a list of documents.
 - 
 -         Args:
 -             documents: A sequence of Documents to be transformed.
 - 
 -         Returns:
 -             A list of transformed Documents.
 -         """
 
 
  |