
dataset.py

import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from json import JSONDecodeError
from typing import Any, cast

from sqlalchemy import func
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_pkey"),
        db.Index("dataset_tenant_idx", "tenant_id"),
        db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=True)
    provider = db.Column(db.String(255), nullable=False, server_default=db.text("'vendor'::character varying"))
    permission = db.Column(db.String(255), nullable=False, server_default=db.text("'only_me'::character varying"))
    data_source_type = db.Column(db.String(255))
    indexing_technique = db.Column(db.String(255), nullable=True)
    index_struct = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(db.String(255), nullable=True)
    embedding_model_provider = db.Column(db.String(255), nullable=True)
    keyword_number = db.Column(db.Integer, nullable=True, server_default=db.text("10"))
    collection_binding_id = db.Column(StringUUID, nullable=True)
    retrieval_model = db.Column(JSONB, nullable=True)
    built_in_field_enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    icon_info = db.Column(JSONB, nullable=True)
    runtime_mode = db.Column(db.String(255), nullable=True, server_default=db.text("'general'::character varying"))
    pipeline_id = db.Column(StringUUID, nullable=True)
    chunk_structure = db.Column(db.String(255), nullable=True)

    @property
    def total_documents(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def total_available_documents(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).filter(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .filter(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .filter(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .filter(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count)))
            .filter(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        if self.chunk_structure:
            return self.chunk_structure
        document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .filter(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = (
            db.session.query(ExternalKnowledgeApis)
            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
            .first()
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def is_published(self):
        if self.pipeline_id:
            pipeline = db.session.query(Pipeline).filter(Pipeline.id == self.pipeline_id).first()
            if pipeline:
                return pipeline.is_published
        return False

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).filter(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"Vector_index_{normalized_dataset_id}_Node"


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        db.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    mode = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    rules = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
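

# Illustrative usage sketch: the rules column stores JSON with the same shape
# as AUTOMATIC_RULES, so a custom rule can be built by serializing that
# structure. The helper below is hypothetical; the ids are supplied by the
# caller.
def _example_custom_process_rule(dataset_id: str, account_id: str) -> DatasetProcessRule:
    return DatasetProcessRule(
        dataset_id=dataset_id,
        mode="custom",
        rules=json.dumps(
            {
                "pre_processing_rules": [
                    {"id": "remove_extra_spaces", "enabled": True},
                    {"id": "remove_urls_emails", "enabled": True},
                ],
                "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
            }
        ),
        created_by=account_id,
    )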


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pkey"),
        db.Index("document_dataset_id_idx", "dataset_id"),
        db.Index("document_is_paused_idx", "is_paused"),
        db.Index("document_tenant_idx", "tenant_id"),
        db.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    data_source_type = db.Column(db.String(255), nullable=False)
    data_source_info = db.Column(db.Text, nullable=True)
    dataset_process_rule_id = db.Column(StringUUID, nullable=True)
    batch = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_from = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_api_request_id = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at = db.Column(db.DateTime, nullable=True)

    # parsing
    file_id = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=True)
    parsing_completed_at = db.Column(db.DateTime, nullable=True)

    # cleaning
    cleaning_completed_at = db.Column(db.DateTime, nullable=True)

    # split
    splitting_completed_at = db.Column(db.DateTime, nullable=True)

    # indexing
    tokens = db.Column(db.Integer, nullable=True)
    indexing_latency = db.Column(db.Float, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)

    # pause
    is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text("false"))
    paused_by = db.Column(StringUUID, nullable=True)
    paused_at = db.Column(db.DateTime, nullable=True)

    # error
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    # basic fields
    indexing_status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    archived = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    archived_reason = db.Column(db.String(255), nullable=True)
    archived_by = db.Column(StringUUID, nullable=True)
    archived_at = db.Column(db.DateTime, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = db.Column(db.String(40), nullable=True)
    doc_metadata = db.Column(JSONB, nullable=True)
    doc_form = db.Column(db.String(255), nullable=False, server_default=db.text("'text_model'::character varying"))
    doc_language = db.Column(db.String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count)))
            .filter(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).filter(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .filter(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # deal with built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": self.created_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": self.updated_at.timestamp(),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )
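

# Illustrative usage sketch: to_dict and from_dict are symmetric over the
# persisted columns, so a Document can be serialized and rebuilt (for example
# when handing work to a background task). from_dict simply ignores the derived
# keys that to_dict adds, such as display_status and segment_count. The helper
# below is hypothetical.
def _example_document_round_trip(document_id: str) -> Document:
    document = db.session.query(Document).filter(Document.id == document_id).one()
    payload = document.to_dict()
    return Document.from_dict(payload)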


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        db.Index("document_segment_dataset_id_idx", "dataset_id"),
        db.Index("document_segment_document_id_idx", "document_id"),
        db.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        db.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        db.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        db.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    position: Mapped[int]
    content = db.Column(db.Text, nullable=False)
    answer = db.Column(db.Text, nullable=True)
    word_count = db.Column(db.Integer, nullable=False)
    tokens = db.Column(db.Integer, nullable=False)

    # indexing fields
    keywords = db.Column(db.JSON, nullable=True)
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)

    # basic fields
    hit_count = db.Column(db.Integer, nullable=False, default=0)
    enabled = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    disabled_at = db.Column(db.DateTime, nullable=True)
    disabled_by = db.Column(StringUUID, nullable=True)
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'waiting'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)
    stopped_at = db.Column(db.DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def previous_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1)
            .first()
        )

    @property
    def next_segment(self):
        return (
            db.session.query(DocumentSegment)
            .filter(DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1)
            .first()
        )

    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .filter(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            signed_url = f"{match.group(0)}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)

        return text
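

# Illustrative verification sketch: get_sign_content appends
# ?timestamp=...&nonce=...&sign=... to each /files/<uuid>/file-preview URL. A
# receiving endpoint could recompute the HMAC with the same SECRET_KEY and
# compare in constant time. This helper is hypothetical and omits any
# timestamp-expiry check.
def _example_verify_file_preview_sign(upload_file_id: str, timestamp: str, nonce: str, sign: str) -> bool:
    data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
    secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
    recalculated = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
    return hmac.compare_digest(base64.urlsafe_b64encode(recalculated).decode(), sign)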


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        db.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        db.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        db.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    segment_id = db.Column(StringUUID, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    content = db.Column(db.Text, nullable=False)
    word_count = db.Column(db.Integer, nullable=False)

    # indexing fields
    index_node_id = db.Column(db.String(255), nullable=True)
    index_node_hash = db.Column(db.String(255), nullable=True)
    type = db.Column(db.String(255), nullable=False, server_default=db.text("'automatic'::character varying"))
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    indexing_at = db.Column(db.DateTime, nullable=True)
    completed_at = db.Column(db.DateTime, nullable=True)
    error = db.Column(db.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).filter(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).filter(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        db.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    app_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        db.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False)
    content = db.Column(db.Text, nullable=False)
    source = db.Column(db.String(255), nullable=False)
    source_app_id = db.Column(StringUUID, nullable=True)
    created_by_role = db.Column(db.String, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        db.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    dataset_id = db.Column(StringUUID, nullable=False, unique=True)
    keyword_table = db.Column(db.Text, nullable=False)
    data_source_type = db.Column(
        db.String(255), nullable=False, server_default=db.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
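

# Illustrative decoding sketch: the SetDecoder defined inside keyword_table_dict
# converts every JSON list value back into a Python set, e.g.
# '{"alpha": ["n1", "n2"]}' -> {"alpha": {"n1", "n2"}}. A minimal standalone
# equivalent (hypothetical helper):
def _example_decode_keyword_table(raw: str) -> dict:
    def _object_hook(dct: dict) -> dict:
        for keyword, node_idxs in dct.items():
            if isinstance(node_idxs, list):
                dct[keyword] = set(node_idxs)
        return dct

    return json.loads(raw, object_hook=_object_hook)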


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="embedding_pkey"),
        db.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        db.Index("created_at_idx", "created_at"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    model_name = db.Column(
        db.String(255), nullable=False, server_default=db.text("'text-embedding-ada-002'::character varying")
    )
    hash = db.Column(db.String(64), nullable=False)
    embedding = db.Column(db.LargeBinary, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = db.Column(db.String(255), nullable=False, server_default=db.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
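

# Illustrative usage sketch: the vector is stored as a pickled list of floats,
# so set_embedding/get_embedding round-trip exactly; pickle.loads is tolerable
# here only because the bytes come from this application's own column. The
# helper and field values below are hypothetical.
def _example_embedding_round_trip() -> list[float]:
    cache_entry = Embedding(model_name="text-embedding-ada-002", hash="0" * 64, provider_name="openai")
    cache_entry.set_embedding([0.12, -0.34, 0.56])
    return cache_entry.get_embedding()  # -> [0.12, -0.34, 0.56]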


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        db.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    provider_name = db.Column(db.String(255), nullable=False)
    model_name = db.Column(db.String(255), nullable=False)
    type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
    collection_name = db.Column(db.String(64), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        db.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        db.Index("tidb_auth_bindings_active_idx", "active"),
        db.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        db.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    cluster_id = db.Column(db.String(255), nullable=False)
    cluster_name = db.Column(db.String(255), nullable=False)
    active = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    status = db.Column(db.String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account = db.Column(db.String(255), nullable=False)
    password = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        db.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, primary_key=True, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=True)
    category = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        db.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        db.Index("idx_dataset_permissions_account_id", "account_id"),
        db.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = db.Column(StringUUID, nullable=False)
    account_id = db.Column(StringUUID, nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.String(255), nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    settings = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
        dataset_bindings = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
        return dataset_bindings


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    external_knowledge_id = db.Column(db.Text, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        db.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        db.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        db.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    notified = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        db.Index("rate_limit_log_tenant_idx", "tenant_id"),
        db.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    subscription_plan = db.Column(db.String(255), nullable=False)
    operation = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        db.Index("dataset_metadata_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    type = db.Column(db.String(255), nullable=False)
    name = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        db.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        db.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        db.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        db.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    metadata_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)


class PipelineBuiltInTemplate(Base):  # type: ignore[name-defined]
    __tablename__ = "pipeline_built_in_templates"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_built_in_template_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    yaml_content = db.Column(db.Text, nullable=False)
    copyright = db.Column(db.String(255), nullable=False)
    privacy_policy = db.Column(db.String(255), nullable=False)
    position = db.Column(db.Integer, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)

    @property
    def created_user_name(self):
        account = db.session.query(Account).filter(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""


class PipelineCustomizedTemplate(Base):  # type: ignore[name-defined]
    __tablename__ = "pipeline_customized_templates"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="pipeline_customized_template_pkey"),
        db.Index("pipeline_customized_template_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    yaml_content = db.Column(db.Text, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def created_user_name(self):
        account = db.session.query(Account).filter(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""


class Pipeline(Base):  # type: ignore[name-defined]
    __tablename__ = "pipelines"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False, server_default=db.text("''::character varying"))
    workflow_id = db.Column(StringUUID, nullable=True)
    is_public = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    is_published = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.pipeline_id == self.id).first()


class DocumentPipelineExecutionLog(Base):
    __tablename__ = "document_pipeline_execution_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pipeline_execution_log_pkey"),
        db.Index("document_pipeline_execution_logs_document_id_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    pipeline_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    datasource_type = db.Column(db.String(255), nullable=False)
    datasource_info = db.Column(db.Text, nullable=False)
    input_data = db.Column(db.JSON, nullable=False)
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())