| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220 | 
							- import base64
 - import enum
 - import hashlib
 - import hmac
 - import json
 - import logging
 - import os
 - import pickle
 - import re
 - import time
 - from json import JSONDecodeError
 - from typing import Any, cast
 - 
 - from sqlalchemy import func
 - from sqlalchemy.dialects.postgresql import JSONB
 - from sqlalchemy.orm import Mapped
 - 
 - from configs import dify_config
 - from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
 - from core.rag.retrieval.retrieval_methods import RetrievalMethod
 - from extensions.ext_storage import storage
 - from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
 - 
 - from .account import Account
 - from .base import Base
 - from .engine import db
 - from .model import App, Tag, TagBinding, UploadFile
 - from .types import StringUUID
 - 
 - 
 - class DatasetPermissionEnum(enum.StrEnum):
 -     ONLY_ME = "only_me"
 -     ALL_TEAM = "all_team_members"
 -     PARTIAL_TEAM = "partial_members"
 - 
 - 
 - class Dataset(Base):
 -     __tablename__ = "datasets"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_pkey"),
 -         db.Index("dataset_tenant_idx", "tenant_id"),
 -         db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
 -     )
 - 
 -     INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
 -     PROVIDER_LIST = ["vendor", "external", None]
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     name = db.Column(db.String(255), nullable=False)
 -     description = db.Column(db.Text, nullable=True)
 -     provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying"))
 -     permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying"))
 -     data_source_type = db.Column(db.String(255))
 -     indexing_technique = db.Column(db.String(255), nullable=True)
 -     index_struct = db.Column(db.Text, nullable=True)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     embedding_model = db.Column(db.String(255), nullable=True)
 -     embedding_model_provider = db.Column(db.String(255), nullable=True)
 -     keyword_number = db.Column(db.Integer, nullable=True, server_default=db.text("10"))
 -     collection_binding_id = db.Column(StringUUID, nullable=True)
 -     retrieval_model = db.Column(JSONB, nullable=True)
 -     built_in_field_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     icon_info = db.Column(JSONB, nullable=True)
 -     runtime_mode = db.Column(db.String(255), nullable=True, server_default=db.text("'general'::character varying"))
 -     pipeline_id = db.Column(StringUUID, nullable=True)
 -     chunk_structure = db.Column(db.String(255), nullable=True)
 - 
 -     @property
 -     def dataset_keyword_table(self):
 -         dataset_keyword_table = (
 -             db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first()
 -         )
 -         if dataset_keyword_table:
 -             return dataset_keyword_table
 - 
 -         return None
 - 
 -     @property
 -     def index_struct_dict(self):
 -         return json.loads(self.index_struct) if self.index_struct else None
 - 
 -     @property
 -     def external_retrieval_model(self):
 -         default_retrieval_model = {
 -             "top_k": 2,
 -             "score_threshold": 0.0,
 -         }
 -         return self.retrieval_model or default_retrieval_model
 - 
 -     @property
 -     def created_by_account(self):
 -         return db.session.get(Account, self.created_by)
 - 
 -     @property
 -     def latest_process_rule(self):
 -         return (
 -             db.session.query(DatasetProcessRule)
 -             .filter(DatasetProcessRule.dataset_id == self.id)
 -             .order_by(DatasetProcessRule.created_at.desc())
 -             .first()
 -         )
 - 
 -     @property
 -     def app_count(self):
 -         return (
 -             db.session.query(func.count(AppDatasetJoin.id))
 -             .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
 -             .scalar()
 -         )
 - 
 -     @property
 -     def document_count(self):
 -         return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()
 - 
 -     @property
 -     def available_document_count(self):
 -         return (
 -             db.session.query(func.count(Document.id))
 -             .filter(
 -                 Document.dataset_id == self.id,
 -                 Document.indexing_status == "completed",
 -                 Document.enabled == True,
 -                 Document.archived == False,
 -             )
 -             .scalar()
 -         )
 - 
 -     @property
 -     def available_segment_count(self):
 -         return (
 -             db.session.query(func.count(DocumentSegment.id))
 -             .filter(
 -                 DocumentSegment.dataset_id == self.id,
 -                 DocumentSegment.status == "completed",
 -                 DocumentSegment.enabled == True,
 -             )
 -             .scalar()
 -         )
 - 
 -     @property
 -     def word_count(self):
 -         return (
 -             db.session.query(Document)
 -             .with_entities(func.coalesce(func.sum(Document.word_count)))
 -             .filter(Document.dataset_id == self.id)
 -             .scalar()
 -         )
 - 
 -     @property
 -     def doc_form(self):
 -         if self.chunk_structure:
 -             return self.chunk_structure
 -         document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
 -         if document:
 -             return document.doc_form
 -         return None
 - 
 -     @property
 -     def retrieval_model_dict(self):
 -         default_retrieval_model = {
 -             "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
 -             "reranking_enable": False,
 -             "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
 -             "top_k": 2,
 -             "score_threshold_enabled": False,
 -         }
 -         return self.retrieval_model or default_retrieval_model
 - 
 -     @property
 -     def tags(self):
 -         tags = (
 -             db.session.query(Tag)
 -             .join(TagBinding, Tag.id == TagBinding.tag_id)
 -             .filter(
 -                 TagBinding.target_id == self.id,
 -                 TagBinding.tenant_id == self.tenant_id,
 -                 Tag.tenant_id == self.tenant_id,
 -                 Tag.type == "knowledge",
 -             )
 -             .all()
 -         )
 - 
 -         return tags or []
 - 
 -     @property
 -     def external_knowledge_info(self):
 -         if self.provider != "external":
 -             return None
 -         external_knowledge_binding = (
 -             db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
 -         )
 -         if not external_knowledge_binding:
 -             return None
 -         external_knowledge_api = (
 -             db.session.query(ExternalKnowledgeApis)
 -             .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
 -             .first()
 -         )
 -         if not external_knowledge_api:
 -             return None
 -         return {
 -             "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
 -             "external_knowledge_api_id": external_knowledge_api.id,
 -             "external_knowledge_api_name": external_knowledge_api.name,
 -             "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
 -         }
 -     @property
 -     def is_published(self):
 -         if self.pipeline_id:
 -             pipeline = db.session.query(Pipeline).filter(Pipeline.id == self.pipeline_id).first()
 -             if pipeline:
 -                 return pipeline.is_published
 -         return False
 - 
 -     @property
 -     def doc_metadata(self):
 -         dataset_metadatas = db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == self.id).all()
 - 
 -         doc_metadata = [
 -             {
 -                 "id": dataset_metadata.id,
 -                 "name": dataset_metadata.name,
 -                 "type": dataset_metadata.type,
 -             }
 -             for dataset_metadata in dataset_metadatas
 -         ]
 -         if self.built_in_field_enabled:
 -             doc_metadata.append(
 -                 {
 -                     "id": "built-in",
 -                     "name": BuiltInField.document_name.value,
 -                     "type": "string",
 -                 }
 -             )
 -             doc_metadata.append(
 -                 {
 -                     "id": "built-in",
 -                     "name": BuiltInField.uploader.value,
 -                     "type": "string",
 -                 }
 -             )
 -             doc_metadata.append(
 -                 {
 -                     "id": "built-in",
 -                     "name": BuiltInField.upload_date.value,
 -                     "type": "time",
 -                 }
 -             )
 -             doc_metadata.append(
 -                 {
 -                     "id": "built-in",
 -                     "name": BuiltInField.last_update_date.value,
 -                     "type": "time",
 -                 }
 -             )
 -             doc_metadata.append(
 -                 {
 -                     "id": "built-in",
 -                     "name": BuiltInField.source.value,
 -                     "type": "string",
 -                 }
 -             )
 -         return doc_metadata
 - 
 -     @staticmethod
 -     def gen_collection_name_by_id(dataset_id: str) -> str:
 -         normalized_dataset_id = dataset_id.replace("-", "_")
 -         return f"Vector_index_{normalized_dataset_id}_Node"
 - 
 - 
 - class DatasetProcessRule(Base):
 -     __tablename__ = "dataset_process_rules"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
 -         db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
 -     rules = db.Column(db.Text, nullable=True)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 -     MODES = ["automatic", "custom", "hierarchical"]
 -     PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
 -     AUTOMATIC_RULES: dict[str, Any] = {
 -         "pre_processing_rules": [
 -             {"id": "remove_extra_spaces", "enabled": True},
 -             {"id": "remove_urls_emails", "enabled": False},
 -         ],
 -         "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
 -     }
 - 
 -     def to_dict(self):
 -         return {
 -             "id": self.id,
 -             "dataset_id": self.dataset_id,
 -             "mode": self.mode,
 -             "rules": self.rules_dict,
 -         }
 - 
 -     @property
 -     def rules_dict(self):
 -         try:
 -             return json.loads(self.rules) if self.rules else None
 -         except JSONDecodeError:
 -             return None
 - 
 - 
 - class Document(Base):
 -     __tablename__ = "documents"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="document_pkey"),
 -         db.Index("document_dataset_id_idx", "dataset_id"),
 -         db.Index("document_is_paused_idx", "is_paused"),
 -         db.Index("document_tenant_idx", "tenant_id"),
 -         db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
 -     )
 - 
 -     # initial fields
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     position = db.Column(db.Integer, nullable=False)
 -     data_source_type = db.Column(db.String(255), nullable=False)
 -     data_source_info = db.Column(db.Text, nullable=True)
 -     dataset_process_rule_id = db.Column(StringUUID, nullable=True)
 -     batch = db.Column(db.String(255), nullable=False)
 -     name = db.Column(db.String(255), nullable=False)
 -     created_from = db.Column(db.String(255), nullable=False)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_api_request_id = db.Column(StringUUID, nullable=True)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 -     # start processing
 -     processing_started_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # parsing
 -     file_id = db.Column(db.Text, nullable=True)
 -     word_count = db.Column(db.Integer, nullable=True)
 -     parsing_completed_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # cleaning
 -     cleaning_completed_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # split
 -     splitting_completed_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # indexing
 -     tokens = db.Column(db.Integer, nullable=True)
 -     indexing_latency = db.Column(db.Float, nullable=True)
 -     completed_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # pause
 -     is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
 -     paused_by = db.Column(StringUUID, nullable=True)
 -     paused_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # error
 -     error = db.Column(db.Text, nullable=True)
 -     stopped_at = db.Column(db.DateTime, nullable=True)
 - 
 -     # basic fields
 -     indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
 -     enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
 -     disabled_at = db.Column(db.DateTime, nullable=True)
 -     disabled_by = db.Column(StringUUID, nullable=True)
 -     archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     archived_reason = db.Column(db.String(255), nullable=True)
 -     archived_by = db.Column(StringUUID, nullable=True)
 -     archived_at = db.Column(db.DateTime, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     doc_type = db.Column(db.String(40), nullable=True)
 -     doc_metadata = db.Column(JSONB, nullable=True)
 -     doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
 -     doc_language = db.Column(db.String(255), nullable=True)
 - 
 -     DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]
 - 
 -     @property
 -     def display_status(self):
 -         status = None
 -         if self.indexing_status == "waiting":
 -             status = "queuing"
 -         elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
 -             status = "paused"
 -         elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
 -             status = "indexing"
 -         elif self.indexing_status == "error":
 -             status = "error"
 -         elif self.indexing_status == "completed" and not self.archived and self.enabled:
 -             status = "available"
 -         elif self.indexing_status == "completed" and not self.archived and not self.enabled:
 -             status = "disabled"
 -         elif self.indexing_status == "completed" and self.archived:
 -             status = "archived"
 -         return status
 - 
 -     @property
 -     def data_source_info_dict(self):
 -         if self.data_source_info:
 -             try:
 -                 data_source_info_dict = json.loads(self.data_source_info)
 -             except JSONDecodeError:
 -                 data_source_info_dict = {}
 - 
 -             return data_source_info_dict
 -         return None
 - 
 -     @property
 -     def data_source_detail_dict(self):
 -         if self.data_source_info:
 -             if self.data_source_type == "upload_file":
 -                 data_source_info_dict = json.loads(self.data_source_info)
 -                 file_detail = (
 -                     db.session.query(UploadFile)
 -                     .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
 -                     .one_or_none()
 -                 )
 -                 if file_detail:
 -                     return {
 -                         "upload_file": {
 -                             "id": file_detail.id,
 -                             "name": file_detail.name,
 -                             "size": file_detail.size,
 -                             "extension": file_detail.extension,
 -                             "mime_type": file_detail.mime_type,
 -                             "created_by": file_detail.created_by,
 -                             "created_at": file_detail.created_at.timestamp(),
 -                         }
 -                     }
 -             elif self.data_source_type in {"notion_import", "website_crawl"}:
 -                 return json.loads(self.data_source_info)
 -         return {}
 - 
 -     @property
 -     def average_segment_length(self):
 -         if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
 -             return self.word_count // self.segment_count
 -         return 0
 - 
 -     @property
 -     def dataset_process_rule(self):
 -         if self.dataset_process_rule_id:
 -             return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
 -         return None
 - 
 -     @property
 -     def dataset(self):
 -         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
 - 
 -     @property
 -     def segment_count(self):
 -         return db.session.query(DocumentSegment).filter(DocumentSegment.document_id == self.id).count()
 - 
 -     @property
 -     def hit_count(self):
 -         return (
 -             db.session.query(DocumentSegment)
 -             .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
 -             .filter(DocumentSegment.document_id == self.id)
 -             .scalar()
 -         )
 - 
 -     @property
 -     def uploader(self):
 -         user = db.session.query(Account).filter(Account.id == self.created_by).first()
 -         return user.name if user else None
 - 
 -     @property
 -     def upload_date(self):
 -         return self.created_at
 - 
 -     @property
 -     def last_update_date(self):
 -         return self.updated_at
 - 
 -     @property
 -     def doc_metadata_details(self):
 -         if self.doc_metadata:
 -             document_metadatas = (
 -                 db.session.query(DatasetMetadata)
 -                 .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
 -                 .filter(
 -                     DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
 -                 )
 -                 .all()
 -             )
 -             metadata_list = []
 -             for metadata in document_metadatas:
 -                 metadata_dict = {
 -                     "id": metadata.id,
 -                     "name": metadata.name,
 -                     "type": metadata.type,
 -                     "value": self.doc_metadata.get(metadata.name),
 -                 }
 -                 metadata_list.append(metadata_dict)
 -             # deal built-in fields
 -             metadata_list.extend(self.get_built_in_fields())
 - 
 -             return metadata_list
 -         return None
 - 
 -     @property
 -     def process_rule_dict(self):
 -         if self.dataset_process_rule_id:
 -             return self.dataset_process_rule.to_dict()
 -         return None
 - 
 -     def get_built_in_fields(self):
 -         built_in_fields = []
 -         built_in_fields.append(
 -             {
 -                 "id": "built-in",
 -                 "name": BuiltInField.document_name,
 -                 "type": "string",
 -                 "value": self.name,
 -             }
 -         )
 -         built_in_fields.append(
 -             {
 -                 "id": "built-in",
 -                 "name": BuiltInField.uploader,
 -                 "type": "string",
 -                 "value": self.uploader,
 -             }
 -         )
 -         built_in_fields.append(
 -             {
 -                 "id": "built-in",
 -                 "name": BuiltInField.upload_date,
 -                 "type": "time",
 -                 "value": self.created_at.timestamp(),
 -             }
 -         )
 -         built_in_fields.append(
 -             {
 -                 "id": "built-in",
 -                 "name": BuiltInField.last_update_date,
 -                 "type": "time",
 -                 "value": self.updated_at.timestamp(),
 -             }
 -         )
 -         built_in_fields.append(
 -             {
 -                 "id": "built-in",
 -                 "name": BuiltInField.source,
 -                 "type": "string",
 -                 "value": MetadataDataSource[self.data_source_type].value,
 -             }
 -         )
 -         return built_in_fields
 - 
 -     def to_dict(self):
 -         return {
 -             "id": self.id,
 -             "tenant_id": self.tenant_id,
 -             "dataset_id": self.dataset_id,
 -             "position": self.position,
 -             "data_source_type": self.data_source_type,
 -             "data_source_info": self.data_source_info,
 -             "dataset_process_rule_id": self.dataset_process_rule_id,
 -             "batch": self.batch,
 -             "name": self.name,
 -             "created_from": self.created_from,
 -             "created_by": self.created_by,
 -             "created_api_request_id": self.created_api_request_id,
 -             "created_at": self.created_at,
 -             "processing_started_at": self.processing_started_at,
 -             "file_id": self.file_id,
 -             "word_count": self.word_count,
 -             "parsing_completed_at": self.parsing_completed_at,
 -             "cleaning_completed_at": self.cleaning_completed_at,
 -             "splitting_completed_at": self.splitting_completed_at,
 -             "tokens": self.tokens,
 -             "indexing_latency": self.indexing_latency,
 -             "completed_at": self.completed_at,
 -             "is_paused": self.is_paused,
 -             "paused_by": self.paused_by,
 -             "paused_at": self.paused_at,
 -             "error": self.error,
 -             "stopped_at": self.stopped_at,
 -             "indexing_status": self.indexing_status,
 -             "enabled": self.enabled,
 -             "disabled_at": self.disabled_at,
 -             "disabled_by": self.disabled_by,
 -             "archived": self.archived,
 -             "archived_reason": self.archived_reason,
 -             "archived_by": self.archived_by,
 -             "archived_at": self.archived_at,
 -             "updated_at": self.updated_at,
 -             "doc_type": self.doc_type,
 -             "doc_metadata": self.doc_metadata,
 -             "doc_form": self.doc_form,
 -             "doc_language": self.doc_language,
 -             "display_status": self.display_status,
 -             "data_source_info_dict": self.data_source_info_dict,
 -             "average_segment_length": self.average_segment_length,
 -             "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
 -             "dataset": self.dataset.to_dict() if self.dataset else None,
 -             "segment_count": self.segment_count,
 -             "hit_count": self.hit_count,
 -         }
 - 
 -     @classmethod
 -     def from_dict(cls, data: dict):
 -         return cls(
 -             id=data.get("id"),
 -             tenant_id=data.get("tenant_id"),
 -             dataset_id=data.get("dataset_id"),
 -             position=data.get("position"),
 -             data_source_type=data.get("data_source_type"),
 -             data_source_info=data.get("data_source_info"),
 -             dataset_process_rule_id=data.get("dataset_process_rule_id"),
 -             batch=data.get("batch"),
 -             name=data.get("name"),
 -             created_from=data.get("created_from"),
 -             created_by=data.get("created_by"),
 -             created_api_request_id=data.get("created_api_request_id"),
 -             created_at=data.get("created_at"),
 -             processing_started_at=data.get("processing_started_at"),
 -             file_id=data.get("file_id"),
 -             word_count=data.get("word_count"),
 -             parsing_completed_at=data.get("parsing_completed_at"),
 -             cleaning_completed_at=data.get("cleaning_completed_at"),
 -             splitting_completed_at=data.get("splitting_completed_at"),
 -             tokens=data.get("tokens"),
 -             indexing_latency=data.get("indexing_latency"),
 -             completed_at=data.get("completed_at"),
 -             is_paused=data.get("is_paused"),
 -             paused_by=data.get("paused_by"),
 -             paused_at=data.get("paused_at"),
 -             error=data.get("error"),
 -             stopped_at=data.get("stopped_at"),
 -             indexing_status=data.get("indexing_status"),
 -             enabled=data.get("enabled"),
 -             disabled_at=data.get("disabled_at"),
 -             disabled_by=data.get("disabled_by"),
 -             archived=data.get("archived"),
 -             archived_reason=data.get("archived_reason"),
 -             archived_by=data.get("archived_by"),
 -             archived_at=data.get("archived_at"),
 -             updated_at=data.get("updated_at"),
 -             doc_type=data.get("doc_type"),
 -             doc_metadata=data.get("doc_metadata"),
 -             doc_form=data.get("doc_form"),
 -             doc_language=data.get("doc_language"),
 -         )
 - 
 - 
 - class DocumentSegment(Base):
 -     __tablename__ = "document_segments"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
 -         db.Index("document_segment_dataset_id_idx", "dataset_id"),
 -         db.Index("document_segment_document_id_idx", "document_id"),
 -         db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
 -         db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
 -         db.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
 -         db.Index("document_segment_tenant_idx", "tenant_id"),
 -     )
 - 
 -     # initial fields
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     document_id = db.Column(StringUUID, nullable=False)
 -     position: Mapped[int]
 -     content = db.Column(db.Text, nullable=False)
 -     answer = db.Column(db.Text, nullable=True)
 -     word_count = db.Column(db.Integer, nullable=False)
 -     tokens = db.Column(db.Integer, nullable=False)
 - 
 -     # indexing fields
 -     keywords = db.Column(db.JSON, nullable=True)
 -     index_node_id = db.Column(db.String(255), nullable=True)
 -     index_node_hash = db.Column(db.String(255), nullable=True)
 - 
 -     # basic fields
 -     hit_count = db.Column(db.Integer, nullable=False, default=0)
 -     enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
 -     disabled_at = db.Column(db.DateTime, nullable=True)
 -     disabled_by = db.Column(StringUUID, nullable=True)
 -     status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     indexing_at = db.Column(db.DateTime, nullable=True)
 -     completed_at = db.Column(db.DateTime, nullable=True)
 -     error = db.Column(db.Text, nullable=True)
 -     stopped_at = db.Column(db.DateTime, nullable=True)
 - 
 -     @property
 -     def dataset(self):
 -         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
 - 
 -     @property
 -     def document(self):
 -         return db.session.query(Document).filter(Document.id == self.document_id).first()
 - 
 -     @property
 -     def previous_segment(self):
 -         return (
 -             db.session.query(DocumentSegment)
 -             .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1)
 -             .first()
 -         )
 - 
 -     @property
 -     def next_segment(self):
 -         return (
 -             db.session.query(DocumentSegment)
 -             .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1)
 -             .first()
 -         )
 - 
 -     @property
 -     def child_chunks(self):
 -         process_rule = self.document.dataset_process_rule
 -         if process_rule.mode == "hierarchical":
 -             rules = Rule(**process_rule.rules_dict)
 -             if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
 -                 child_chunks = (
 -                     db.session.query(ChildChunk)
 -                     .filter(ChildChunk.segment_id == self.id)
 -                     .order_by(ChildChunk.position.asc())
 -                     .all()
 -                 )
 -                 return child_chunks or []
 -             else:
 -                 return []
 -         else:
 -             return []
 - 
 -     def get_child_chunks(self):
 -         process_rule = self.document.dataset_process_rule
 -         if process_rule.mode == "hierarchical":
 -             rules = Rule(**process_rule.rules_dict)
 -             if rules.parent_mode:
 -                 child_chunks = (
 -                     db.session.query(ChildChunk)
 -                     .filter(ChildChunk.segment_id == self.id)
 -                     .order_by(ChildChunk.position.asc())
 -                     .all()
 -                 )
 -                 return child_chunks or []
 -             else:
 -                 return []
 -         else:
 -             return []
 - 
 -     @property
 -     def sign_content(self):
 -         return self.get_sign_content()
 - 
 -     def get_sign_content(self):
 -         signed_urls = []
 -         text = self.content
 - 
 -         # For data before v0.10.0
 -         pattern = r"/files/([a-f0-9\-]+)/image-preview"
 -         matches = re.finditer(pattern, text)
 -         for match in matches:
 -             upload_file_id = match.group(1)
 -             nonce = os.urandom(16).hex()
 -             timestamp = str(int(time.time()))
 -             data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
 -             secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
 -             sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
 -             encoded_sign = base64.urlsafe_b64encode(sign).decode()
 - 
 -             params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
 -             signed_url = f"{match.group(0)}?{params}"
 -             signed_urls.append((match.start(), match.end(), signed_url))
 - 
 -         # For data after v0.10.0
 -         pattern = r"/files/([a-f0-9\-]+)/file-preview"
 -         matches = re.finditer(pattern, text)
 -         for match in matches:
 -             upload_file_id = match.group(1)
 -             nonce = os.urandom(16).hex()
 -             timestamp = str(int(time.time()))
 -             data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
 -             secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
 -             sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
 -             encoded_sign = base64.urlsafe_b64encode(sign).decode()
 - 
 -             params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
 -             signed_url = f"{match.group(0)}?{params}"
 -             signed_urls.append((match.start(), match.end(), signed_url))
 - 
 -         # Reconstruct the text with signed URLs
 -         offset = 0
 -         for start, end, signed_url in signed_urls:
 -             text = text[: start + offset] + signed_url + text[end + offset :]
 -             offset += len(signed_url) - (end - start)
 - 
 -         return text
 - 
 - 
 - class ChildChunk(Base):
 -     """Row of the ``child_chunks`` table: a sub-chunk tied to a document segment.
 - 
 -     The ``dataset``, ``document`` and ``segment`` properties each run a query
 -     through ``db.session`` on every access and return ``None`` when no row
 -     matches.
 -     """
 - 
 -     __tablename__ = "child_chunks"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
 -         db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
 -         db.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
 -         db.Index("child_chunks_segment_idx", "segment_id"),
 -     )
 - 
 -     # initial fields
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     document_id = db.Column(StringUUID, nullable=False)
 -     segment_id = db.Column(StringUUID, nullable=False)
 -     position = db.Column(db.Integer, nullable=False)
 -     content = db.Column(db.Text, nullable=False)
 -     word_count = db.Column(db.Integer, nullable=False)
 -     # indexing fields
 -     index_node_id = db.Column(db.String(255), nullable=True)
 -     index_node_hash = db.Column(db.String(255), nullable=True)
 -     type = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 -     indexing_at = db.Column(db.DateTime, nullable=True)
 -     completed_at = db.Column(db.DateTime, nullable=True)
 -     error = db.Column(db.Text, nullable=True)
 - 
 -     @property
 -     def dataset(self):
 -         """Return the ``Dataset`` whose id equals ``dataset_id``, or ``None``."""
 -         return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
 - 
 -     @property
 -     def document(self):
 -         """Return the ``Document`` whose id equals ``document_id``, or ``None``."""
 -         return db.session.query(Document).filter(Document.id == self.document_id).first()
 - 
 -     @property
 -     def segment(self):
 -         """Return the ``DocumentSegment`` whose id equals ``segment_id``, or ``None``."""
 -         return db.session.query(DocumentSegment).filter(DocumentSegment.id == self.segment_id).first()
 - 
 - 
 - class AppDatasetJoin(Base):
 -     """Row of the ``app_dataset_joins`` table pairing an ``app_id`` with a ``dataset_id``."""
 - 
 -     __tablename__ = "app_dataset_joins"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
 -         db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     app_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
 - 
 -     @property
 -     def app(self):
 -         """Look up the ``App`` by primary key ``app_id`` via ``db.session.get``."""
 -         return db.session.get(App, self.app_id)
 - 
 - 
 - class DatasetQuery(Base):
 -     """Row of the ``dataset_queries`` table: query content, source and creator per dataset."""
 - 
 -     __tablename__ = "dataset_queries"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
 -         db.Index("dataset_query_dataset_id_idx", "dataset_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     content = db.Column(db.Text, nullable=False)
 -     source = db.Column(db.String(255), nullable=False)
 -     source_app_id = db.Column(StringUUID, nullable=True)
 -     created_by_role = db.Column(db.String, nullable=False)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
 - 
 - 
 - class DatasetKeywordTable(Base):
 -     __tablename__ = "dataset_keyword_tables"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
 -         db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
 -     dataset_id = db.Column(StringUUID, nullable=False, unique=True)
 -     keyword_table = db.Column(db.Text, nullable=False)
 -     data_source_type = db.Column(
 -         db.String(255), nullable=False, server_default=db.text("'database'::character varying")
 -     )
 - 
 -     @property
 -     def keyword_table_dict(self):
 -         class SetDecoder(json.JSONDecoder):
 -             def __init__(self, *args, **kwargs):
 -                 super().__init__(object_hook=self.object_hook, *args, **kwargs)
 - 
 -             def object_hook(self, dct):
 -                 if isinstance(dct, dict):
 -                     for keyword, node_idxs in dct.items():
 -                         if isinstance(node_idxs, list):
 -                             dct[keyword] = set(node_idxs)
 -                 return dct
 - 
 -         # get dataset
 -         dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
 -         if not dataset:
 -             return None
 -         if self.data_source_type == "database":
 -             return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
 -         else:
 -             file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
 -             try:
 -                 keyword_table_text = storage.load_once(file_key)
 -                 if keyword_table_text:
 -                     return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
 -                 return None
 -             except Exception as e:
 -                 logging.exception(f"Failed to load keyword table from file: {file_key}")
 -                 return None
 - 
 - 
 - class Embedding(Base):
 -     """Row of the ``embeddings`` table holding a pickled embedding vector.
 - 
 -     Rows are unique per ``(model_name, hash, provider_name)``.
 -     """
 - 
 -     __tablename__ = "embeddings"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="embedding_pkey"),
 -         db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
 -         db.Index("created_at_idx", "created_at"),
 -     )
 - 
 -     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
 -     model_name = db.Column(
 -         db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
 -     )
 -     hash = db.Column(db.String(64), nullable=False)
 -     embedding = db.Column(db.LargeBinary, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying"))
 - 
 -     def set_embedding(self, embedding_data: list[float]):
 -         """Store ``embedding_data`` in ``embedding`` as a pickle (highest protocol)."""
 -         self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
 - 
 -     def get_embedding(self) -> list[float]:
 -         """Unpickle and return the vector stored in ``embedding``."""
 -         # NOTE(review): pickle.loads can execute arbitrary code if the stored
 -         # bytes are attacker-controlled; presumably this column is only ever
 -         # written through set_embedding - confirm.
 -         return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
 - 
 - 
 - class DatasetCollectionBinding(Base):
 -     """Row of the ``dataset_collection_bindings`` table.
 - 
 -     Stores a ``collection_name`` together with the ``provider_name``,
 -     ``model_name`` and binding ``type`` it belongs to.
 -     """
 - 
 -     __tablename__ = "dataset_collection_bindings"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
 -         db.Index("provider_model_name_idx", "provider_name", "model_name"),
 -     )
 - 
 -     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
 -     provider_name = db.Column(db.String(255), nullable=False)
 -     model_name = db.Column(db.String(255), nullable=False)
 -     type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
 -     collection_name = db.Column(db.String(64), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class TidbAuthBinding(Base):
 -     __tablename__ = "tidb_auth_bindings"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
 -         db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
 -         db.Index("tidb_auth_bindings_active_idx", "active"),
 -         db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
 -         db.Index("tidb_auth_bindings_status_idx", "status"),
 -     )
 -     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=True)
 -     cluster_id = db.Column(db.String(255), nullable=False)
 -     cluster_name = db.Column(db.String(255), nullable=False)
 -     active = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     status = db.Column(db.String(255), nullable=False, server_default=db.text("CREATING"))
 -     account = db.Column(db.String(255), nullable=False)
 -     password = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class Whitelist(Base):
 -     """Row of the ``whitelists`` table: a ``category`` entry, optionally tied to a tenant."""
 - 
 -     __tablename__ = "whitelists"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
 -         db.Index("whitelists_tenant_idx", "tenant_id"),
 -     )
 -     id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=True)
 -     category = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class DatasetPermission(Base):
 -     """Row of the ``dataset_permissions`` table.
 - 
 -     Holds a ``has_permission`` flag for one account on one dataset within a
 -     tenant.
 -     """
 - 
 -     __tablename__ = "dataset_permissions"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
 -         db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
 -         db.Index("idx_dataset_permissions_account_id", "account_id"),
 -         db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     account_id = db.Column(StringUUID, nullable=False)
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class ExternalKnowledgeApis(Base):
 -     __tablename__ = "external_knowledge_apis"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
 -         db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
 -         db.Index("external_knowledge_apis_name_idx", "name"),
 -     )
 - 
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     name = db.Column(db.String(255), nullable=False)
 -     description = db.Column(db.String(255), nullable=False)
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     settings = db.Column(db.Text, nullable=True)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 -     def to_dict(self):
 -         return {
 -             "id": self.id,
 -             "tenant_id": self.tenant_id,
 -             "name": self.name,
 -             "description": self.description,
 -             "settings": self.settings_dict,
 -             "dataset_bindings": self.dataset_bindings,
 -             "created_by": self.created_by,
 -             "created_at": self.created_at.isoformat(),
 -         }
 - 
 -     @property
 -     def settings_dict(self):
 -         try:
 -             return json.loads(self.settings) if self.settings else None
 -         except JSONDecodeError:
 -             return None
 - 
 -     @property
 -     def dataset_bindings(self):
 -         external_knowledge_bindings = (
 -             db.session.query(ExternalKnowledgeBindings)
 -             .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
 -             .all()
 -         )
 -         dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
 -         datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
 -         dataset_bindings = []
 -         for dataset in datasets:
 -             dataset_bindings.append({"id": dataset.id, "name": dataset.name})
 - 
 -         return dataset_bindings
 - 
 - 
 - class ExternalKnowledgeBindings(Base):
 -     """Row of the ``external_knowledge_bindings`` table.
 - 
 -     Ties a dataset (``dataset_id``) to an external knowledge API record
 -     (``external_knowledge_api_id``) and an ``external_knowledge_id``.
 -     """
 - 
 -     __tablename__ = "external_knowledge_bindings"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
 -         db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
 -         db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
 -         db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
 -         db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     external_knowledge_api_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     external_knowledge_id = db.Column(db.Text, nullable=False)
 -     created_by = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class DatasetAutoDisableLog(Base):
 -     """Row of the ``dataset_auto_disable_logs`` table.
 - 
 -     References a tenant, dataset and document and carries a ``notified`` flag
 -     that defaults to false.
 -     """
 - 
 -     __tablename__ = "dataset_auto_disable_logs"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
 -         db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
 -         db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
 -         # NOTE(review): the name ends in "atx" where the sibling indexes use
 -         # "_idx" - looks like a typo; renaming it would need a migration.
 -         db.Index("dataset_auto_disable_log_created_atx", "created_at"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     document_id = db.Column(StringUUID, nullable=False)
 -     notified = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 - 
 - 
 - class RateLimitLog(Base):
 -     """Row of the ``rate_limit_logs`` table: an ``operation`` with its tenant and subscription plan."""
 - 
 -     __tablename__ = "rate_limit_logs"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
 -         db.Index("rate_limit_log_tenant_idx", "tenant_id"),
 -         db.Index("rate_limit_log_operation_idx", "operation"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     subscription_plan = db.Column(db.String(255), nullable=False)
 -     operation = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 - 
 - 
 - class DatasetMetadata(Base):
 -     """Row of the ``dataset_metadatas`` table: a metadata entry (``type``, ``name``) on a dataset."""
 - 
 -     __tablename__ = "dataset_metadatas"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
 -         db.Index("dataset_metadata_tenant_idx", "tenant_id"),
 -         db.Index("dataset_metadata_dataset_idx", "dataset_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     type = db.Column(db.String(255), nullable=False)
 -     name = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
 -     created_by = db.Column(StringUUID, nullable=False)
 -     updated_by = db.Column(StringUUID, nullable=True)
 - 
 - 
 - class DatasetMetadataBinding(Base):
 -     """Row of the ``dataset_metadata_bindings`` table.
 - 
 -     Ties a metadata entry (``metadata_id``) to a document (``document_id``)
 -     within a dataset.
 -     """
 - 
 -     __tablename__ = "dataset_metadata_bindings"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
 -         db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
 -         db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
 -         db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
 -         db.Index("dataset_metadata_binding_document_idx", "document_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     dataset_id = db.Column(StringUUID, nullable=False)
 -     metadata_id = db.Column(StringUUID, nullable=False)
 -     document_id = db.Column(StringUUID, nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     created_by = db.Column(StringUUID, nullable=False)
 - 
 - 
 - class PipelineBuiltInTemplate(Base):  # type: ignore[name-defined]
 -     """Row of the ``pipeline_built_in_templates`` table.
 - 
 -     Holds a template's YAML text (``yaml_content``) along with its display
 -     fields, ordering ``position``, ``language`` and ``install_count``.
 -     """
 - 
 -     __tablename__ = "pipeline_built_in_templates"
 -     __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_built_in_template_pkey"),)
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     name = db.Column(db.String(255), nullable=False)
 -     description = db.Column(db.Text, nullable=False)
 -     chunk_structure = db.Column(db.String(255), nullable=False)
 -     icon = db.Column(db.JSON, nullable=False)
 -     yaml_content = db.Column(db.Text, nullable=False)
 -     copyright = db.Column(db.String(255), nullable=False)
 -     privacy_policy = db.Column(db.String(255), nullable=False)
 -     position = db.Column(db.Integer, nullable=False)
 -     install_count = db.Column(db.Integer, nullable=False, default=0)
 -     language = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - 
 - class PipelineCustomizedTemplate(Base):  # type: ignore[name-defined]
 -     """Row of the ``pipeline_customized_templates`` table.
 - 
 -     Same kind of template record as the built-in one, but scoped to a tenant
 -     (``tenant_id``) and without copyright / privacy policy columns.
 -     """
 - 
 -     __tablename__ = "pipeline_customized_templates"
 -     __table_args__ = (
 -         db.PrimaryKeyConstraint("id", name="pipeline_customized_template_pkey"),
 -         db.Index("pipeline_customized_template_tenant_idx", "tenant_id"),
 -     )
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id = db.Column(StringUUID, nullable=False)
 -     name = db.Column(db.String(255), nullable=False)
 -     description = db.Column(db.Text, nullable=False)
 -     chunk_structure = db.Column(db.String(255), nullable=False)
 -     icon = db.Column(db.JSON, nullable=False)
 -     position = db.Column(db.Integer, nullable=False)
 -     yaml_content = db.Column(db.Text, nullable=False)
 -     install_count = db.Column(db.Integer, nullable=False, default=0)
 -     language = db.Column(db.String(255), nullable=False)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 - 
 - class Pipeline(Base):  # type: ignore[name-defined]
 -     """Row of the ``pipelines`` table: a tenant's pipeline with public/published flags."""
 - 
 -     __tablename__ = "pipelines"
 -     __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_pkey"),)
 - 
 -     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
 -     tenant_id: Mapped[str] = db.Column(StringUUID, nullable=False)
 -     name = db.Column(db.String(255), nullable=False)
 -     description = db.Column(db.Text, nullable=False, server_default=db.text("''::character varying"))
 -     workflow_id = db.Column(StringUUID, nullable=True)
 -     is_public = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     is_published = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
 -     created_by = db.Column(StringUUID, nullable=True)
 -     created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 -     updated_by = db.Column(StringUUID, nullable=True)
 -     updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
 - 
 -     @property
 -     def dataset(self):
 -         """Return the first ``Dataset`` whose ``pipeline_id`` equals this id, or ``None``."""
 -         return db.session.query(Dataset).filter(Dataset.pipeline_id == self.id).first()
 
 
  |