import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from datetime import datetime
from json import JSONDecodeError
from typing import Any, Optional, cast

from sqlalchemy import DateTime, String, func, select
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"
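
    # enum.StrEnum (Python 3.11+) members compare equal to their string values,
    # so DatasetPermissionEnum.ONLY_ME == "only_me" holds and members can be
    # assigned directly to the String "permission" column on Dataset below.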


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_pkey"),
        db.Index("dataset_tenant_idx", "tenant_id"),
        db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = mapped_column(StringUUID)
    name: Mapped[str] = mapped_column(String(255))
    description = mapped_column(db.Text, nullable=True)
    provider: Mapped[str] = mapped_column(String(255), server_default=db.text("'vendor'::character varying"))
    permission: Mapped[str] = mapped_column(String(255), server_default=db.text("'only_me'::character varying"))
    data_source_type = mapped_column(String(255))
    indexing_technique: Mapped[Optional[str]] = mapped_column(String(255))
    index_struct = mapped_column(db.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(String(255), nullable=True)  # TODO: mapped_column
    embedding_model_provider = db.Column(String(255), nullable=True)  # TODO: mapped_column
    collection_binding_id = mapped_column(StringUUID, nullable=True)
    retrieval_model = mapped_column(JSONB, nullable=True)
    built_in_field_enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))

    @property
    def dataset_keyword_table(self):
        # .first() already returns None when no row matches.
        return db.session.query(DatasetKeywordTable).where(DatasetKeywordTable.dataset_id == self.id).first()

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .where(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .where(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .where(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .where(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count), 0))
            .where(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        document = db.session.query(Document).where(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model
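
    # Fallback behavior: a dataset persisted without an explicit retrieval
    # configuration resolves to the defaults above, e.g. (illustrative):
    #
    #     dataset.retrieval_model_dict["search_method"]
    #     # -> RetrievalMethod.SEMANTIC_SEARCH.value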

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .where(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = db.session.scalar(
            select(ExternalKnowledgeApis).where(
                ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id
            )
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).where(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]

        if self.built_in_field_enabled:
            doc_metadata.extend(
                [
                    {
                        "id": "built-in",
                        "name": BuiltInField.document_name.value,
                        "type": "string",
                    },
                    {
                        "id": "built-in",
                        "name": BuiltInField.uploader.value,
                        "type": "string",
                    },
                    {
                        "id": "built-in",
                        "name": BuiltInField.upload_date.value,
                        "type": "time",
                    },
                    {
                        "id": "built-in",
                        "name": BuiltInField.last_update_date.value,
                        "type": "time",
                    },
                    {
                        "id": "built-in",
                        "name": BuiltInField.source.value,
                        "type": "string",
                    },
                ]
            )

        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"
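
    # Example (illustrative values): with VECTOR_INDEX_NAME_PREFIX set to
    # "vector_index", gen_collection_name_by_id("aabbccdd-1122-3344-5566-77889900aabb")
    # returns "vector_index_aabbccdd_1122_3344_5566_77889900aabb_Node" — hyphens
    # are replaced with underscores to keep the name valid across vector stores.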


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    mode = mapped_column(String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    rules = mapped_column(db.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
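
    # Hypothetical sketch of persisting a custom rule set (the `dataset` and
    # `account` objects are assumed to come from elsewhere); the JSON payload
    # matches the shape of AUTOMATIC_RULES above:
    #
    #     rule = DatasetProcessRule(
    #         dataset_id=dataset.id,
    #         mode="custom",
    #         rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
    #         created_by=account.id,
    #     )
    #     rule.rules_dict["segmentation"]["max_tokens"]  # -> 500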


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pkey"),
        db.Index("document_dataset_id_idx", "dataset_id"),
        db.Index("document_is_paused_idx", "is_paused"),
        db.Index("document_tenant_idx", "tenant_id"),
        db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(db.Integer, nullable=False)
    data_source_type: Mapped[str] = mapped_column(String(255), nullable=False)
    data_source_info = mapped_column(db.Text, nullable=True)
    dataset_process_rule_id = mapped_column(StringUUID, nullable=True)
    batch: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_from: Mapped[str] = mapped_column(String(255), nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_api_request_id = mapped_column(StringUUID, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # parsing
    file_id = mapped_column(db.Text, nullable=True)
    word_count: Mapped[Optional[int]] = mapped_column(db.Integer, nullable=True)  # TODO: make this not nullable
    parsing_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # cleaning
    cleaning_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # split
    splitting_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # indexing
    tokens: Mapped[Optional[int]] = mapped_column(db.Integer, nullable=True)
    indexing_latency: Mapped[Optional[float]] = mapped_column(db.Float, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # pause
    is_paused: Mapped[Optional[bool]] = mapped_column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = mapped_column(StringUUID, nullable=True)
    paused_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # error
    error = mapped_column(db.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # basic fields
    indexing_status = mapped_column(String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    archived: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = mapped_column(String(255), nullable=True)
    archived_by = mapped_column(StringUUID, nullable=True)
    archived_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = mapped_column(String(40), nullable=True)
    doc_metadata = mapped_column(JSONB, nullable=True)
    doc_form = mapped_column(String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = mapped_column(String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status
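
    # The derived display states above, summarized:
    #   "waiting"                                   -> "queuing"
    #   in-flight and is_paused                     -> "paused"
    #   "parsing"/"cleaning"/"splitting"/"indexing" -> "indexing"
    #   "error"                                     -> "error"
    #   "completed", not archived, enabled          -> "available"
    #   "completed", not archived, not enabled      -> "disabled"
    #   "completed", archived                       -> "archived"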

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
            .where(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).where(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .where(
                    DatasetMetadataBinding.dataset_id == self.dataset_id,
                    DatasetMetadataBinding.document_id == self.id,
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # append the built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        return [
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            },
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            },
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": str(self.created_at.timestamp()),
            },
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": str(self.updated_at.timestamp()),
            },
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            },
        ]

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )
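
    # Illustrative round trip for a hypothetical `document` instance:
    #
    #     payload = document.to_dict()
    #     clone = Document.from_dict(payload)
    #
    # to_dict() also emits derived keys such as "display_status" and
    # "segment_count"; from_dict() reads only the column fields and simply
    # ignores the rest.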


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        db.Index("document_segment_dataset_id_idx", "dataset_id"),
        db.Index("document_segment_document_id_idx", "document_id"),
        db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        db.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        db.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int]
    content = mapped_column(db.Text, nullable=False)
    answer = mapped_column(db.Text, nullable=True)
    word_count: Mapped[int]
    tokens: Mapped[int]

    # indexing fields
    keywords = mapped_column(db.JSON, nullable=True)
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)

    # basic fields
    hit_count: Mapped[int] = mapped_column(db.Integer, nullable=False, default=0)
    enabled: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    status: Mapped[str] = mapped_column(String(255), server_default=db.text("'waiting'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(db.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.scalar(select(Dataset).where(Dataset.id == self.dataset_id))

    @property
    def document(self):
        return db.session.scalar(select(Document).where(Document.id == self.document_id))

    @property
    def previous_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1
            )
        )

    @property
    def next_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1
            )
        )

    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        # Guard against documents that have no process rule attached.
        if not process_rule or process_rule.mode != "hierarchical":
            return []
        rules = Rule(**process_rule.rules_dict)
        if not rules.parent_mode or rules.parent_mode == ParentMode.FULL_DOC:
            return []
        child_chunks = (
            db.session.query(ChildChunk)
            .where(ChildChunk.segment_id == self.id)
            .order_by(ChildChunk.position.asc())
            .all()
        )
        return child_chunks or []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        # Guard against documents that have no process rule attached.
        if not process_rule or process_rule.mode != "hierarchical":
            return []
        rules = Rule(**process_rule.rules_dict)
        if not rules.parent_mode:
            return []
        child_chunks = (
            db.session.query(ChildChunk)
            .where(ChildChunk.segment_id == self.id)
            .order_by(ChildChunk.position.asc())
            .all()
        )
        return child_chunks or []
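
    # Note the asymmetry: the child_chunks property returns [] when the rules'
    # parent_mode is ParentMode.FULL_DOC, while get_child_chunks() returns the
    # chunks for any hierarchical rule set that has a parent_mode at all.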

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content
        # Sign both URL formats: "image-preview" (data before v0.10.0) and
        # "file-preview" (data after v0.10.0).
        for route in ("image-preview", "file-preview"):
            pattern = rf"/files/([a-f0-9\-]+)/{route}"
            for match in re.finditer(pattern, text):
                upload_file_id = match.group(1)
                nonce = os.urandom(16).hex()
                timestamp = str(int(time.time()))
                data_to_sign = f"{route}|{upload_file_id}|{timestamp}|{nonce}"
                secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
                sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
                encoded_sign = base64.urlsafe_b64encode(sign).decode()
                params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
                signed_url = f"{match.group(0)}?{params}"
                signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with the signed URLs. Sort by match position so
        # the running offset stays correct when both URL formats occur in the
        # same content.
        signed_urls.sort(key=lambda item: item[0])
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)
        return text
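
    # Shape of a signed URL produced above (illustrative values only):
    #
    #     /files/aabbccdd-1122-3344-5566-77889900aabb/file-preview
    #         ?timestamp=1700000000&nonce=<32 hex chars>&sign=<base64url HMAC-SHA256>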


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        db.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        db.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    segment_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(db.Integer, nullable=False)
    content = mapped_column(db.Text, nullable=False)
    word_count: Mapped[int] = mapped_column(db.Integer, nullable=False)

    # indexing fields
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)
    type = mapped_column(String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(db.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).where(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    app_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        db.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    content = mapped_column(db.Text, nullable=False)
    source: Mapped[str] = mapped_column(String(255), nullable=False)
    source_app_id = mapped_column(StringUUID, nullable=True)
    created_by_role = mapped_column(String, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False, unique=True)
    keyword_table = mapped_column(db.Text, nullable=False)
    data_source_type = mapped_column(
        String(255), nullable=False, server_default=db.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # Look up the owning dataset to resolve the tenant-scoped file key.
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = f"keyword_files/{dataset.tenant_id}/{self.dataset_id}.txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
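
    # JSON has no set type, so keyword tables are serialized as
    # {keyword: [node_id, ...]} and SetDecoder restores each list of node ids
    # to a set on load.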


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="embedding_pkey"),
        db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        db.Index("created_at_idx", "created_at"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    model_name = mapped_column(
        String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
    )
    hash = mapped_column(String(64), nullable=False)
    embedding = mapped_column(db.LargeBinary, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = mapped_column(String(255), nullable=False, server_default=db.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
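
    # Round trip (illustrative):
    #
    #     cache_entry = Embedding(model_name="m", hash="h", provider_name="p")
    #     cache_entry.set_embedding([0.1, 0.2, 0.3])
    #     cache_entry.get_embedding()  # -> [0.1, 0.2, 0.3]
    #
    # The vector is stored pickled, so get_embedding() must only be fed data
    # this application wrote itself (hence the S301 suppression).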


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        db.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    provider_name: Mapped[str] = mapped_column(String(255), nullable=False)
    model_name: Mapped[str] = mapped_column(String(255), nullable=False)
    type = mapped_column(String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
    collection_name = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        db.Index("tidb_auth_bindings_active_idx", "active"),
        db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        db.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    cluster_id: Mapped[str] = mapped_column(String(255), nullable=False)
    cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
    active: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
    status = mapped_column(String(255), nullable=False, server_default=db.text("CREATING"))
    account: Mapped[str] = mapped_column(String(255), nullable=False)
    password: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        db.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_account_id", "account_id"),
        db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = mapped_column(StringUUID, nullable=False)
    account_id = mapped_column(StringUUID, nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    has_permission: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    description: Mapped[str] = mapped_column(String(255), nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    settings = mapped_column(db.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .where(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).where(Dataset.id.in_(dataset_ids)).all()
        return [{"id": dataset.id, "name": dataset.name} for dataset in datasets]


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_api_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_id = mapped_column(db.Text, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        db.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    notified: Mapped[bool] = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        db.Index("rate_limit_log_tenant_idx", "tenant_id"),
        db.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    subscription_plan: Mapped[str] = mapped_column(String(255), nullable=False)
    operation: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        db.Index("dataset_metadata_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    type: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)")
    )
    created_by = mapped_column(StringUUID, nullable=False)
    updated_by = mapped_column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        db.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = mapped_column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    metadata_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = mapped_column(StringUUID, nullable=False)