
dataset.py

import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from datetime import datetime
from json import JSONDecodeError
from typing import Any, Optional, cast

import sqlalchemy as sa
from sqlalchemy import DateTime, String, func, select
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_pkey"),
        sa.Index("dataset_tenant_idx", "tenant_id"),
        sa.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = mapped_column(StringUUID)
    name: Mapped[str] = mapped_column(String(255))
    description = mapped_column(sa.Text, nullable=True)
    provider: Mapped[str] = mapped_column(String(255), server_default=sa.text("'vendor'::character varying"))
    permission: Mapped[str] = mapped_column(String(255), server_default=sa.text("'only_me'::character varying"))
    data_source_type = mapped_column(String(255))
    indexing_technique: Mapped[Optional[str]] = mapped_column(String(255))
    index_struct = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(String(255), nullable=True)  # TODO: mapped_column
    embedding_model_provider = db.Column(String(255), nullable=True)  # TODO: mapped_column
    collection_binding_id = mapped_column(StringUUID, nullable=True)
    retrieval_model = mapped_column(JSONB, nullable=True)
    built_in_field_enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).where(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .where(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .where(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .where(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .where(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count), 0))
            .where(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).where(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .where(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = db.session.scalar(
            select(ExternalKnowledgeApis).where(
                ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id
            )
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).where(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]

        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"
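
    # A minimal usage sketch (the dataset id below is made up, and the actual
    # prefix comes from dify_config.VECTOR_INDEX_NAME_PREFIX at runtime):
    #   Dataset.gen_collection_name_by_id("2d3e4f5a-1b2c-4d5e-8f9a-0b1c2d3e4f5a")
    #   -> "<VECTOR_INDEX_NAME_PREFIX>_2d3e4f5a_1b2c_4d5e_8f9a_0b1c2d3e4f5a_Node"
    # Dashes are replaced with underscores so the id is safe to embed in vector
    # store collection names.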


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        sa.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    mode = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    rules = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
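
    # A sketch of what a stored custom-mode rules payload looks like once parsed
    # by rules_dict (values are illustrative; the shape mirrors AUTOMATIC_RULES
    # above, and pre-processing rule ids come from PRE_PROCESSING_RULES):
    #   {
    #       "pre_processing_rules": [{"id": "remove_extra_spaces", "enabled": True}],
    #       "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    #   }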


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_pkey"),
        sa.Index("document_dataset_id_idx", "dataset_id"),
        sa.Index("document_is_paused_idx", "is_paused"),
        sa.Index("document_tenant_idx", "tenant_id"),
        sa.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    data_source_type: Mapped[str] = mapped_column(String(255), nullable=False)
    data_source_info = mapped_column(sa.Text, nullable=True)
    dataset_process_rule_id = mapped_column(StringUUID, nullable=True)
    batch: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_from: Mapped[str] = mapped_column(String(255), nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_api_request_id = mapped_column(StringUUID, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # parsing
    file_id = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)  # TODO: make this not nullable
    parsing_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # cleaning
    cleaning_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # split
    splitting_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # indexing
    tokens: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)
    indexing_latency: Mapped[Optional[float]] = mapped_column(sa.Float, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # pause
    is_paused: Mapped[Optional[bool]] = mapped_column(sa.Boolean, nullable=True, server_default=sa.text("false"))
    paused_by = mapped_column(StringUUID, nullable=True)
    paused_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # error
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # basic fields
    indexing_status = mapped_column(String(255), nullable=False, server_default=sa.text("'waiting'::character varying"))
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    archived: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    archived_reason = mapped_column(String(255), nullable=True)
    archived_by = mapped_column(StringUUID, nullable=True)
    archived_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = mapped_column(String(40), nullable=True)
    doc_metadata = mapped_column(JSONB, nullable=True)
    doc_form = mapped_column(String(255), nullable=False, server_default=sa.text("'text_model'::character varying"))
    doc_language = mapped_column(String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
            .where(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).where(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .where(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # append the built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": str(self.created_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": str(self.updated_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        sa.Index("document_segment_dataset_id_idx", "dataset_id"),
        sa.Index("document_segment_document_id_idx", "document_id"),
        sa.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        sa.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int]
    content = mapped_column(sa.Text, nullable=False)
    answer = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[int]
    tokens: Mapped[int]

    # indexing fields
    keywords = mapped_column(sa.JSON, nullable=True)
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)

    # basic fields
    hit_count: Mapped[int] = mapped_column(sa.Integer, nullable=False, default=0)
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    status: Mapped[str] = mapped_column(String(255), server_default=sa.text("'waiting'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.scalar(select(Dataset).where(Dataset.id == self.dataset_id))

    @property
    def document(self):
        return db.session.scalar(select(Document).where(Document.id == self.document_id))

    @property
    def previous_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1
            )
        )

    @property
    def next_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1
            )
        )

    @property
    def child_chunks(self):
        # only hierarchical datasets with a non-FULL_DOC parent mode keep
        # per-segment child chunks
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .where(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        # unlike the child_chunks property, this also returns children when the
        # parent mode is FULL_DOC
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .where(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)
        return text
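
    # A sketch of how a receiving endpoint might verify one of these signed
    # URLs (illustrative only; the real verification lives elsewhere in the
    # codebase and may also enforce a timestamp expiry window):
    #
    #   def verify_preview_sign(prefix: str, upload_file_id: str, timestamp: str, nonce: str, sign: str) -> bool:
    #       secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
    #       data_to_sign = f"{prefix}|{upload_file_id}|{timestamp}|{nonce}"
    #       expected = base64.urlsafe_b64encode(
    #           hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
    #       ).decode()
    #       return hmac.compare_digest(expected, sign)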


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        sa.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        sa.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        sa.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    segment_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)

    # indexing fields
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)
    type = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).where(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        sa.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    app_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        sa.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    source: Mapped[str] = mapped_column(String(255), nullable=False)
    source_app_id = mapped_column(StringUUID, nullable=True)
    created_by_role = mapped_column(String, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        sa.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False, unique=True)
    keyword_table = mapped_column(sa.Text, nullable=False)
    data_source_type = mapped_column(
        String(255), nullable=False, server_default=sa.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            # restores the keyword table's node-id lists to sets, since sets
            # are serialized to JSON arrays when the table is stored
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
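
    # A minimal round-trip sketch of the SetDecoder defined in
    # keyword_table_dict above (the keyword and node ids are made up):
    #   json.loads('{"alpha": ["node-1", "node-2"]}', cls=SetDecoder)
    #   -> {"alpha": {"node-1", "node-2"}}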


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="embedding_pkey"),
        sa.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        sa.Index("created_at_idx", "created_at"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    model_name = mapped_column(
        String(255), nullable=False, server_default=sa.text("'text-embedding-ada-002'::character varying")
    )
    hash = mapped_column(String(64), nullable=False)
    embedding = mapped_column(sa.LargeBinary, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = mapped_column(String(255), nullable=False, server_default=sa.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
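
    # A usage sketch of the pickle round trip (field values are made up):
    #   embedding = Embedding(model_name="text-embedding-ada-002", hash="abc123", provider_name="openai")
    #   embedding.set_embedding([0.12, -0.07, 0.33])
    #   embedding.get_embedding()  # -> [0.12, -0.07, 0.33]
    # Unpickling is acceptable here only because the blob is written exclusively
    # by set_embedding, never from untrusted input.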


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        sa.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    provider_name: Mapped[str] = mapped_column(String(255), nullable=False)
    model_name: Mapped[str] = mapped_column(String(255), nullable=False)
    type = mapped_column(String(40), server_default=sa.text("'dataset'::character varying"), nullable=False)
    collection_name = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        sa.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        sa.Index("tidb_auth_bindings_active_idx", "active"),
        sa.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        sa.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    cluster_id: Mapped[str] = mapped_column(String(255), nullable=False)
    cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
    active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=db.text("false"))
    status = mapped_column(String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account: Mapped[str] = mapped_column(String(255), nullable=False)
    password: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        sa.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        sa.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        sa.Index("idx_dataset_permissions_account_id", "account_id"),
        sa.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = mapped_column(StringUUID, nullable=False)
    account_id = mapped_column(StringUUID, nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    has_permission: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        sa.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    description: Mapped[str] = mapped_column(String(255), nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    settings = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .where(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).where(Dataset.id.in_(dataset_ids)).all()
        dataset_bindings = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
        return dataset_bindings


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        sa.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_api_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_id = mapped_column(sa.Text, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        sa.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        sa.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        sa.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    notified: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        sa.Index("rate_limit_log_tenant_idx", "tenant_id"),
        sa.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    subscription_plan: Mapped[str] = mapped_column(String(255), nullable=False)
    operation: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        sa.Index("dataset_metadata_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    type: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    created_by = mapped_column(StringUUID, nullable=False)
    updated_by = mapped_column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        sa.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        sa.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        sa.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    metadata_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = mapped_column(StringUUID, nullable=False)