import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from datetime import datetime
from json import JSONDecodeError
from typing import Any, Optional, cast

import sqlalchemy as sa
from sqlalchemy import DateTime, String, func, select
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID

logger = logging.getLogger(__name__)


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_pkey"),
        sa.Index("dataset_tenant_idx", "tenant_id"),
        sa.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = mapped_column(StringUUID)
    name: Mapped[str] = mapped_column(String(255))
    description = mapped_column(sa.Text, nullable=True)
    provider: Mapped[str] = mapped_column(String(255), server_default=sa.text("'vendor'::character varying"))
    permission: Mapped[str] = mapped_column(String(255), server_default=sa.text("'only_me'::character varying"))
    data_source_type = mapped_column(String(255))
    indexing_technique: Mapped[Optional[str]] = mapped_column(String(255))
    index_struct = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = mapped_column(String(255), nullable=True)
    embedding_model_provider = mapped_column(String(255), nullable=True)
    collection_binding_id = mapped_column(StringUUID, nullable=True)
    retrieval_model = mapped_column(JSONB, nullable=True)
    built_in_field_enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).where(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .where(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .where(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .where(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .where(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count), 0))
            .where(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).where(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .where(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = db.session.scalar(
            select(ExternalKnowledgeApis).where(
                ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id
            )
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }
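
    # Combines dataset-defined metadata fields with the built-in fields
    # (document name, uploader, upload/update dates, source) when enabled.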
    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.scalars(
            select(DatasetMetadata).where(DatasetMetadata.dataset_id == self.id)
        ).all()
        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata
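
    # The vector store collection name is derived from the dataset id, with
    # dashes replaced by underscores, presumably to keep the name valid across
    # the supported vector database backends.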
    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        sa.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    mode = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    rules = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self) -> dict[str, Any]:
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }
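
    # Rules are persisted as a JSON string; malformed or missing JSON yields
    # None rather than raising, so callers must handle the absent-rules case.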
    @property
    def rules_dict(self) -> dict[str, Any] | None:
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_pkey"),
        sa.Index("document_dataset_id_idx", "dataset_id"),
        sa.Index("document_is_paused_idx", "is_paused"),
        sa.Index("document_tenant_idx", "tenant_id"),
        sa.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    data_source_type: Mapped[str] = mapped_column(String(255), nullable=False)
    data_source_info = mapped_column(sa.Text, nullable=True)
    dataset_process_rule_id = mapped_column(StringUUID, nullable=True)
    batch: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_from: Mapped[str] = mapped_column(String(255), nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_api_request_id = mapped_column(StringUUID, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # parsing
    file_id = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)  # TODO: make this not nullable
    parsing_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # cleaning
    cleaning_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # split
    splitting_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # indexing
    tokens: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)
    indexing_latency: Mapped[Optional[float]] = mapped_column(sa.Float, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # pause
    is_paused: Mapped[Optional[bool]] = mapped_column(sa.Boolean, nullable=True, server_default=sa.text("false"))
    paused_by = mapped_column(StringUUID, nullable=True)
    paused_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # error
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # basic fields
    indexing_status = mapped_column(String(255), nullable=False, server_default=sa.text("'waiting'::character varying"))
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    archived: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    archived_reason = mapped_column(String(255), nullable=True)
    archived_by = mapped_column(StringUUID, nullable=True)
    archived_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = mapped_column(String(40), nullable=True)
    doc_metadata = mapped_column(JSONB, nullable=True)
    doc_form = mapped_column(String(255), nullable=False, server_default=sa.text("'text_model'::character varying"))
    doc_language = mapped_column(String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]
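
    # The user-facing status is derived from indexing_status combined with the
    # pause/enabled/archived flags rather than being stored directly.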
    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self) -> dict[str, Any] | None:
        if self.data_source_info:
            try:
                data_source_info_dict: dict[str, Any] = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self) -> dict[str, Any]:
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict: dict[str, Any] = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                result: dict[str, Any] = json.loads(self.data_source_info)
                return result
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
            .where(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).where(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self) -> list[dict[str, Any]] | None:
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .where(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list: list[dict[str, Any]] = []
            for metadata in document_metadatas:
                metadata_dict: dict[str, Any] = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal with built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self) -> dict[str, Any] | None:
        if self.dataset_process_rule_id and self.dataset_process_rule:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self) -> list[dict[str, Any]]:
        built_in_fields: list[dict[str, Any]] = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": str(self.created_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": str(self.updated_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self) -> dict[str, Any]:
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": None,  # Dataset class doesn't have a to_dict method
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        sa.Index("document_segment_dataset_id_idx", "dataset_id"),
        sa.Index("document_segment_document_id_idx", "document_id"),
        sa.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        sa.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int]
    content = mapped_column(sa.Text, nullable=False)
    answer = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[int]
    tokens: Mapped[int]

    # indexing fields
    keywords = mapped_column(sa.JSON, nullable=True)
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)

    # basic fields
    hit_count: Mapped[int] = mapped_column(sa.Integer, nullable=False, default=0)
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    status: Mapped[str] = mapped_column(String(255), server_default=sa.text("'waiting'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.scalar(select(Dataset).where(Dataset.id == self.dataset_id))

    @property
    def document(self):
        return db.session.scalar(select(Document).where(Document.id == self.document_id))

    @property
    def previous_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1
            )
        )

    @property
    def next_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1
            )
        )
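
    # Child chunks only exist for datasets processed in "hierarchical" mode.
    # The child_chunks property skips full-doc parent mode, whereas
    # get_child_chunks() returns children for any configured parent mode.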
    @property
    def child_chunks(self) -> list[Any]:
        if not self.document:
            return []
        process_rule = self.document.dataset_process_rule
        if process_rule and process_rule.mode == "hierarchical":
            rules_dict = process_rule.rules_dict
            if rules_dict:
                rules = Rule(**rules_dict)
                if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                    child_chunks = (
                        db.session.query(ChildChunk)
                        .where(ChildChunk.segment_id == self.id)
                        .order_by(ChildChunk.position.asc())
                        .all()
                    )
                    return child_chunks or []
        return []

    def get_child_chunks(self) -> list[Any]:
        if not self.document:
            return []
        process_rule = self.document.dataset_process_rule
        if process_rule and process_rule.mode == "hierarchical":
            rules_dict = process_rule.rules_dict
            if rules_dict:
                rules = Rule(**rules_dict)
                if rules.parent_mode:
                    child_chunks = (
                        db.session.query(ChildChunk)
                        .where(ChildChunk.segment_id == self.id)
                        .order_by(ChildChunk.position.asc())
                        .all()
                    )
                    return child_chunks or []
        return []

    @property
    def sign_content(self) -> str:
        return self.get_sign_content()
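
    # Rewrites embedded file-preview URLs in the segment content into signed
    # URLs: each match gains a timestamp, a random nonce, and an HMAC-SHA256
    # signature computed with the application's SECRET_KEY.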
    def get_sign_content(self) -> str:
        signed_urls: list[tuple[int, int, str]] = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)

        return text


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        sa.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        sa.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        sa.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    segment_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)

    # indexing fields
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)
    type = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).where(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        sa.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    app_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        sa.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    source: Mapped[str] = mapped_column(String(255), nullable=False)
    source_app_id = mapped_column(StringUUID, nullable=True)
    created_by_role = mapped_column(String, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        sa.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False, unique=True)
    keyword_table = mapped_column(sa.Text, nullable=False)
    data_source_type = mapped_column(
        String(255), nullable=False, server_default=sa.text("'database'::character varying")
    )
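
    # The keyword table maps each keyword to the set of index node ids that
    # contain it. Depending on data_source_type it is stored inline in the
    # keyword_table column ("database") or as a JSON file in object storage;
    # SetDecoder restores JSON lists back into Python sets on load.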
    @property
    def keyword_table_dict(self) -> dict[str, set[Any]] | None:
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args: Any, **kwargs: Any) -> None:
                def object_hook(dct: Any) -> Any:
                    if isinstance(dct, dict):
                        result: dict[str, Any] = {}
                        items = cast(dict[str, Any], dct).items()
                        for keyword, node_idxs in items:
                            if isinstance(node_idxs, list):
                                result[keyword] = set(cast(list[Any], node_idxs))
                            else:
                                result[keyword] = node_idxs
                        return result
                    return dct

                super().__init__(object_hook=object_hook, *args, **kwargs)

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logger.exception("Failed to load keyword table from file: %s", file_key)
                return None


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="embedding_pkey"),
        sa.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        sa.Index("created_at_idx", "created_at"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    model_name = mapped_column(
        String(255), nullable=False, server_default=sa.text("'text-embedding-ada-002'::character varying")
    )
    hash = mapped_column(String(64), nullable=False)
    embedding = mapped_column(sa.LargeBinary, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = mapped_column(String(255), nullable=False, server_default=sa.text("''::character varying"))
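
    # Embedding vectors are cached as pickled bytes in the LargeBinary column.
    # pickle.loads below is tolerated (noqa: S301) because this column is
    # written exclusively by set_embedding(), never from user-supplied input.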
    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        sa.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    provider_name: Mapped[str] = mapped_column(String(255), nullable=False)
    model_name: Mapped[str] = mapped_column(String(255), nullable=False)
    type = mapped_column(String(40), server_default=sa.text("'dataset'::character varying"), nullable=False)
    collection_name = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        sa.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        sa.Index("tidb_auth_bindings_active_idx", "active"),
        sa.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        sa.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    cluster_id: Mapped[str] = mapped_column(String(255), nullable=False)
    cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
    active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=db.text("false"))
    status = mapped_column(String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account: Mapped[str] = mapped_column(String(255), nullable=False)
    password: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        sa.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        sa.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        sa.Index("idx_dataset_permissions_account_id", "account_id"),
        sa.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = mapped_column(StringUUID, nullable=False)
    account_id = mapped_column(StringUUID, nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    has_permission: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        sa.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    description: Mapped[str] = mapped_column(String(255), nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    settings = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self) -> dict[str, Any]:
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self) -> dict[str, Any] | None:
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self) -> list[dict[str, Any]]:
        external_knowledge_bindings = db.session.scalars(
            select(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
        ).all()
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.scalars(select(Dataset).where(Dataset.id.in_(dataset_ids))).all()
        dataset_bindings: list[dict[str, Any]] = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
        return dataset_bindings


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        sa.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_api_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_id = mapped_column(sa.Text, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        sa.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        sa.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        sa.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    notified: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        sa.Index("rate_limit_log_tenant_idx", "tenant_id"),
        sa.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    subscription_plan: Mapped[str] = mapped_column(String(255), nullable=False)
    operation: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        sa.Index("dataset_metadata_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    type: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    created_by = mapped_column(StringUUID, nullable=False)
    updated_by = mapped_column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        sa.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        sa.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        sa.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    metadata_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = mapped_column(StringUUID, nullable=False)