You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

document.py 3.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any
  4. from pydantic import BaseModel, Field
  class ChildDocument(BaseModel):
      """A child piece of text together with its associated metadata.

      Instances of this model are what ``Document.children`` holds.
      """

      # Text content of this child document.
      page_content: str

      # Optional list of floats stored with the content; ``None`` when absent.
      # NOTE(review): presumably the embedding of ``page_content`` -- confirm.
      vector: list[float] | None = None

      # Arbitrary metadata about the page content (e.g., source, relationships
      # to other documents, etc.). Each instance gets its own fresh dict.
      metadata: dict = Field(default_factory=dict)
  class Document(BaseModel):
      """A piece of text together with its associated metadata."""

      # Text content of the document.
      page_content: str

      # Optional list of floats stored with the content; ``None`` when absent.
      # NOTE(review): presumably the embedding of ``page_content`` -- confirm.
      vector: list[float] | None = None

      # Arbitrary metadata about the page content (e.g., source, relationships
      # to other documents, etc.). Each instance gets its own fresh dict.
      metadata: dict = Field(default_factory=dict)

      # Provider label for the document; defaults to "dify" and may be None.
      provider: str | None = "dify"

      # Optional child documents belonging to this document; None when absent.
      children: list[ChildDocument] | None = None
  class GeneralStructureChunk(BaseModel):
      """General structure chunk: a flat list of chunk strings."""

      # The chunk strings, one list entry per chunk.
      general_chunks: list[str]
  class ParentChildChunk(BaseModel):
      """Parent-child chunk: one parent string plus its child strings."""

      # Content of the parent.
      parent_content: str

      # Contents of the children, one list entry per child.
      child_contents: list[str]
  class ParentChildStructureChunk(BaseModel):
      """Parent-child structure chunk: a list of ``ParentChildChunk`` items and a mode."""

      # The parent/child chunk entries.
      parent_child_chunks: list[ParentChildChunk]

      # Parent mode as a plain string; defaults to "paragraph".
      parent_mode: str = "paragraph"
  class QAChunk(BaseModel):
      """QA chunk: a single question with its answer."""

      # Question text.
      question: str

      # Answer text.
      answer: str
  class QAStructureChunk(BaseModel):
      """QA structure chunk: a list of ``QAChunk`` items."""

      # The question/answer entries.
      qa_chunks: list[QAChunk]
  class BaseDocumentTransformer(ABC):
      """Interface for components that transform documents.

      An implementation receives a sequence of Documents and hands back a
      sequence of transformed Documents. Both a synchronous and an asynchronous
      entry point are abstract and must be provided by subclasses.

      Example:
          .. code-block:: python

              class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                  model_config = ConfigDict(arbitrary_types_allowed=True)

                  embeddings: Embeddings
                  similarity_fn: Callable = cosine_similarity
                  similarity_threshold: float = 0.95

                  def transform_documents(
                      self, documents: Sequence[Document], **kwargs: Any
                  ) -> Sequence[Document]:
                      stateful_documents = get_stateful_documents(documents)
                      embedded_documents = _get_embeddings_from_stateful_docs(
                          self.embeddings, stateful_documents
                      )
                      included_idxs = _filter_similar_embeddings(
                          embedded_documents, self.similarity_fn, self.similarity_threshold
                      )
                      return [stateful_documents[i] for i in sorted(included_idxs)]

                  async def atransform_documents(
                      self, documents: Sequence[Document], **kwargs: Any
                  ) -> Sequence[Document]:
                      raise NotImplementedError

      """

      @abstractmethod
      def transform_documents(
          self,
          documents: Sequence[Document],
          **kwargs: Any,
      ) -> Sequence[Document]:
          """Transform a sequence of documents.

          Args:
              documents: The Documents to be transformed.
              **kwargs: Extra keyword arguments; their meaning is left to the
                  implementing subclass.

          Returns:
              A sequence of transformed Documents.
          """

      @abstractmethod
      async def atransform_documents(
          self,
          documents: Sequence[Document],
          **kwargs: Any,
      ) -> Sequence[Document]:
          """Asynchronously transform a sequence of documents.

          Args:
              documents: The Documents to be transformed.
              **kwargs: Extra keyword arguments; their meaning is left to the
                  implementing subclass.

          Returns:
              A sequence of transformed Documents.
          """