import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from json import JSONDecodeError
from typing import Any, cast

from sqlalchemy import func
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_pkey"),
        db.Index("dataset_tenant_idx", "tenant_id"),
        db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=True)
    provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying"))
    permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying"))
    data_source_type = db.Column(db.String(255))
    indexing_technique = db.Column(db.String(255), nullable=True)
    index_struct = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(db.String(255), nullable=True)
    embedding_model_provider = db.Column(db.String(255), nullable=True)
    collection_binding_id = db.Column(StringUUID, nullable=True)
    retrieval_model = db.Column(JSONB, nullable=True)
    built_in_field_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .filter(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .filter(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count), 0))
            .filter(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = (
            db.session.query(ExternalKnowledgeApis)
            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
            .first()
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"
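

# Illustrative sketch (not part of the model): gen_collection_name_by_id swaps
# hyphens for underscores and wraps the id in the configured prefix. Assuming
# VECTOR_INDEX_NAME_PREFIX were "Vector_index" (a hypothetical value):
#   Dataset.gen_collection_name_by_id("2d4e6f80-1234-5678-9abc-def012345678")
#   -> "Vector_index_2d4e6f80_1234_5678_9abc_def012345678_Node"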


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    rules = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
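

# Usage sketch (assumed, mirroring AUTOMATIC_RULES above): the `rules` column
# stores a JSON document of the same shape, so a round trip looks like:
#   rule = DatasetProcessRule(mode="custom", rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES))
#   rule.rules_dict["segmentation"]["max_tokens"]  # -> 500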


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pkey"),
        db.Index("document_dataset_id_idx", "dataset_id"),
        db.Index("document_is_paused_idx", "is_paused"),
        db.Index("document_tenant_idx", "tenant_id"),
        db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status
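
    # display_status collapses the raw indexing pipeline state into the label the
    # UI shows: "waiting" -> "queuing"; any in-flight status while is_paused ->
    # "paused"; parsing/cleaning/splitting/indexing -> "indexing"; "error" ->
    # "error"; "completed" -> "available"/"disabled"/"archived" depending on the
    # enabled and archived flags.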

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal with built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )
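

# Round-trip sketch (assumed usage): from_dict() reads only the column fields,
# so derived keys emitted by to_dict() (e.g. "display_status", "segment_count")
# are simply ignored when rebuilding:
#   clone = Document.from_dict(payload)  # payload: a dict shaped like to_dict() output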


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        db.Index("document_segment_dataset_id_idx", "dataset_id"),
        db.Index("document_segment_document_id_idx", "document_id"),
        db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        db.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        db.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    position: Mapped[int]
    content = db.Column(db.Text, nullable=False)
    answer = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=False)
    tokens = db.Column(db.Integer, nullable=False)

    # indexing fields
    keywords = db.Column(db.JSON, nullable=True)
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)

    # basic fields
    hit_count = db.Column(db.Integer, nullable=False, default=0)
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def previous_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1)
            .first()
        )

    @property
    def next_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1)
            .first()
        )

    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []
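
    # Note the asymmetry between the two child-chunk lookups above: the
    # child_chunks property returns children only when parent_mode is set and is
    # not FULL_DOC, while get_child_chunks() accepts any configured parent_mode.
    # Both fall back to an empty list for non-hierarchical rules.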

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()

            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()

            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)

        return text
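

# Verification sketch for the signed preview URLs built in get_sign_content
# (assumed consumer-side check; the actual check lives in the file routes, not
# here): recompute the MAC over the same "<op>|<file_id>|<timestamp>|<nonce>"
# string and compare in constant time:
#   expected = hmac.new(secret_key, f"file-preview|{file_id}|{timestamp}|{nonce}".encode(),
#                       hashlib.sha256).digest()
#   valid = hmac.compare_digest(base64.urlsafe_b64encode(expected).decode(), sign)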


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        db.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        db.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    segment_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    content = db.Column(db.Text, nullable=False)
    word_count = db.Column(db.Integer, nullable=False)

    # indexing fields
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)
    type = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))

    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    app_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        db.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    content = db.Column(db.Text, nullable=False)
    source = db.Column(db.String(255), nullable=False)
    source_app_id = db.Column(StringUUID, nullable=True)
    created_by_role = db.Column(db.String, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False, unique=True)
    keyword_table = db.Column(db.Text, nullable=False)
    data_source_type = db.Column(
        db.String(255), nullable=False, server_default=db.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
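

# SetDecoder sketch: keyword tables are stored as JSON with list values, and the
# decoder above restores them to sets on read, e.g.
#   json.loads('{"keyword": ["node-1", "node-2"]}', cls=SetDecoder)
#   -> {"keyword": {"node-1", "node-2"}}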


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="embedding_pkey"),
        db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        db.Index("created_at_idx", "created_at"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    model_name = db.Column(
        db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
    )
    hash = db.Column(db.String(64), nullable=False)
    embedding = db.Column(db.LargeBinary, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
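

# Cache sketch (assumed usage): Embedding rows memoize vectors per
# (provider_name, model_name, hash), as enforced by embedding_hash_idx:
#   e = Embedding(provider_name="openai", model_name="text-embedding-ada-002", hash=text_hash)
#   e.set_embedding([0.1, 0.2, 0.3])
#   e.get_embedding()  # -> [0.1, 0.2, 0.3]; pickle.loads, so only trusted data belongs here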


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        db.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    provider_name = db.Column(db.String(255), nullable=False)
    model_name = db.Column(db.String(255), nullable=False)
    type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
    collection_name = db.Column(db.String(64), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        db.Index("tidb_auth_bindings_active_idx", "active"),
        db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        db.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    cluster_id = db.Column(db.String(255), nullable=False)
    cluster_name = db.Column(db.String(255), nullable=False)
    active = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account = db.Column(db.String(255), nullable=False)
    password = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        db.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    category = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_account_id", "account_id"),
        db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.String(255), nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    settings = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
        dataset_bindings = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
        return dataset_bindings


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    external_knowledge_id = db.Column(db.Text, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        db.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    notified = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        db.Index("rate_limit_log_tenant_idx", "tenant_id"),
        db.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    subscription_plan = db.Column(db.String(255), nullable=False)
    operation = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        db.Index("dataset_metadata_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    type = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        db.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    metadata_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)