You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

document.py 3.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any, Optional
  4. from pydantic import BaseModel
  5. class ChildDocument(BaseModel):
  6. """Class for storing a piece of text and associated metadata."""
  7. page_content: str
  8. vector: Optional[list[float]] = None
  9. """Arbitrary metadata about the page content (e.g., source, relationships to other
  10. documents, etc.).
  11. """
  12. metadata: dict = {}
  13. class Document(BaseModel):
  14. """Class for storing a piece of text and associated metadata."""
  15. page_content: str
  16. vector: Optional[list[float]] = None
  17. """Arbitrary metadata about the page content (e.g., source, relationships to other
  18. documents, etc.).
  19. """
  20. metadata: dict = {}
  21. provider: Optional[str] = "dify"
  22. children: Optional[list[ChildDocument]] = None
  23. class GeneralStructureChunk(BaseModel):
  24. """
  25. General Structure Chunk.
  26. """
  27. general_chunks: list[str]
  28. class ParentChildChunk(BaseModel):
  29. """
  30. Parent Child Chunk.
  31. """
  32. parent_content: str
  33. child_contents: list[str]
  34. class ParentChildStructureChunk(BaseModel):
  35. """
  36. Parent Child Structure Chunk.
  37. """
  38. parent_child_chunks: list[ParentChildChunk]
  39. class QAChunk(BaseModel):
  40. """
  41. QA Chunk.
  42. """
  43. question: str
  44. answer: str
  45. class QAStructureChunk(BaseModel):
  46. """
  47. QAStructureChunk.
  48. """
  49. qa_chunks: list[QAChunk]
  50. class BaseDocumentTransformer(ABC):
  51. """Abstract base class for document transformation systems.
  52. A document transformation system takes a sequence of Documents and returns a
  53. sequence of transformed Documents.
  54. Example:
  55. .. code-block:: python
  56. class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
  57. model_config = ConfigDict(arbitrary_types_allowed=True)
  58. embeddings: Embeddings
  59. similarity_fn: Callable = cosine_similarity
  60. similarity_threshold: float = 0.95
  61. def transform_documents(
  62. self, documents: Sequence[Document], **kwargs: Any
  63. ) -> Sequence[Document]:
  64. stateful_documents = get_stateful_documents(documents)
  65. embedded_documents = _get_embeddings_from_stateful_docs(
  66. self.embeddings, stateful_documents
  67. )
  68. included_idxs = _filter_similar_embeddings(
  69. embedded_documents, self.similarity_fn, self.similarity_threshold
  70. )
  71. return [stateful_documents[i] for i in sorted(included_idxs)]
  72. async def atransform_documents(
  73. self, documents: Sequence[Document], **kwargs: Any
  74. ) -> Sequence[Document]:
  75. raise NotImplementedError
  76. """
  77. @abstractmethod
  78. def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  79. """Transform a list of documents.
  80. Args:
  81. documents: A sequence of Documents to be transformed.
  82. Returns:
  83. A list of transformed Documents.
  84. """
  85. @abstractmethod
  86. async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  87. """Asynchronously transform a list of documents.
  88. Args:
  89. documents: A sequence of Documents to be transformed.
  90. Returns:
  91. A list of transformed Documents.
  92. """