
# dataset.py

import base64
import enum
import hashlib
import hmac
import json
import logging
import os
import pickle
import re
import time
from datetime import datetime
from json import JSONDecodeError
from typing import Any, Optional, cast

import sqlalchemy as sa
from sqlalchemy import DateTime, String, func, select
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column

from configs import dify_config
from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_storage import storage
from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule

from .account import Account
from .base import Base
from .engine import db
from .model import App, Tag, TagBinding, UploadFile
from .types import StringUUID


class DatasetPermissionEnum(enum.StrEnum):
    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"


class Dataset(Base):
    __tablename__ = "datasets"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_pkey"),
        sa.Index("dataset_tenant_idx", "tenant_id"),
        sa.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = mapped_column(StringUUID)
    name: Mapped[str] = mapped_column(String(255))
    description = mapped_column(sa.Text, nullable=True)
    provider: Mapped[str] = mapped_column(String(255), server_default=sa.text("'vendor'::character varying"))
    permission: Mapped[str] = mapped_column(String(255), server_default=sa.text("'only_me'::character varying"))
    data_source_type = mapped_column(String(255))
    indexing_technique: Mapped[Optional[str]] = mapped_column(String(255))
    index_struct = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    embedding_model = db.Column(db.String(255), nullable=True)  # TODO: mapped_column
    embedding_model_provider = db.Column(db.String(255), nullable=True)  # TODO: mapped_column
    keyword_number = db.Column(db.Integer, nullable=True, server_default=db.text("10"))
    collection_binding_id = mapped_column(StringUUID, nullable=True)
    retrieval_model = mapped_column(JSONB, nullable=True)
    built_in_field_enabled = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
    icon_info = db.Column(JSONB, nullable=True)
    runtime_mode = db.Column(db.String(255), nullable=True, server_default=db.text("'general'::character varying"))
    pipeline_id = db.Column(StringUUID, nullable=True)
    chunk_structure = db.Column(db.String(255), nullable=True)

    @property
    def total_documents(self):
        return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()

    @property
    def total_available_documents(self):
        return (
            db.session.query(func.count(Document.id))
            .filter(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def dataset_keyword_table(self):
        dataset_keyword_table = (
            db.session.query(DatasetKeywordTable).where(DatasetKeywordTable.dataset_id == self.id).first()
        )
        if dataset_keyword_table:
            return dataset_keyword_table
        return None

    @property
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)

    @property
    def latest_process_rule(self):
        return (
            db.session.query(DatasetProcessRule)
            .where(DatasetProcessRule.dataset_id == self.id)
            .order_by(DatasetProcessRule.created_at.desc())
            .first()
        )

    @property
    def app_count(self):
        return (
            db.session.query(func.count(AppDatasetJoin.id))
            .where(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
            .scalar()
        )

    @property
    def document_count(self):
        return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()

    @property
    def available_document_count(self):
        return (
            db.session.query(func.count(Document.id))
            .where(
                Document.dataset_id == self.id,
                Document.indexing_status == "completed",
                Document.enabled == True,
                Document.archived == False,
            )
            .scalar()
        )

    @property
    def available_segment_count(self):
        return (
            db.session.query(func.count(DocumentSegment.id))
            .where(
                DocumentSegment.dataset_id == self.id,
                DocumentSegment.status == "completed",
                DocumentSegment.enabled == True,
            )
            .scalar()
        )

    @property
    def word_count(self):
        return (
            db.session.query(Document)
            .with_entities(func.coalesce(func.sum(Document.word_count), 0))
            .where(Document.dataset_id == self.id)
            .scalar()
        )

    @property
    def doc_form(self):
        if self.chunk_structure:
            return self.chunk_structure
        document = db.session.query(Document).filter(Document.dataset_id == self.id).first()
        if document:
            return document.doc_form
        return None

    @property
    def retrieval_model_dict(self):
        default_retrieval_model = {
            "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
            "reranking_enable": False,
            "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
            "top_k": 2,
            "score_threshold_enabled": False,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def tags(self):
        tags = (
            db.session.query(Tag)
            .join(TagBinding, Tag.id == TagBinding.tag_id)
            .where(
                TagBinding.target_id == self.id,
                TagBinding.tenant_id == self.tenant_id,
                Tag.tenant_id == self.tenant_id,
                Tag.type == "knowledge",
            )
            .all()
        )
        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = db.session.scalar(
            select(ExternalKnowledgeApis).where(
                ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id
            )
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @property
    def is_published(self):
        if self.pipeline_id:
            pipeline = db.session.query(Pipeline).filter(Pipeline.id == self.pipeline_id).first()
            if pipeline:
                return pipeline.is_published
        return False

    @property
    def doc_metadata(self):
        dataset_metadatas = db.session.query(DatasetMetadata).where(DatasetMetadata.dataset_id == self.id).all()

        doc_metadata = [
            {
                "id": dataset_metadata.id,
                "name": dataset_metadata.name,
                "type": dataset_metadata.type,
            }
            for dataset_metadata in dataset_metadatas
        ]
        if self.built_in_field_enabled:
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.document_name.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.uploader.value,
                    "type": "string",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.upload_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.last_update_date.value,
                    "type": "time",
                }
            )
            doc_metadata.append(
                {
                    "id": "built-in",
                    "name": BuiltInField.source.value,
                    "type": "string",
                }
            )
        return doc_metadata

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
        return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"
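

# --- Illustrative usage (a sketch, not part of the upstream module) ---
# How the derived members above are typically consumed: look up a Dataset,
# fall back to the default retrieval settings when none are stored, and derive
# the vector-collection name. The function name is hypothetical.
def _example_dataset_summary(dataset_id: str) -> Optional[dict]:
    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        return None
    return {
        # retrieval_model_dict returns the stored JSONB config, or the
        # semantic-search defaults when the column is NULL
        "retrieval": dataset.retrieval_model_dict,
        # hyphens are replaced with underscores so the name is a valid
        # identifier in vector stores that disallow "-"
        "collection": Dataset.gen_collection_name_by_id(dataset.id),
    }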


class DatasetProcessRule(Base):
    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        sa.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    mode = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    rules = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self):
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self):
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
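

# --- Illustrative usage (a sketch, not part of the upstream module) ---
# The `rules` column holds JSON text; rules_dict parses it and returns None
# for empty or malformed content rather than raising. A minimal round-trip
# using the built-in AUTOMATIC_RULES (the UUIDs below are placeholder values):
def _example_process_rule_roundtrip() -> Optional[dict]:
    rule = DatasetProcessRule(
        dataset_id="00000000-0000-0000-0000-000000000000",  # placeholder
        mode="automatic",
        rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
        created_by="00000000-0000-0000-0000-000000000000",  # placeholder
    )
    return rule.rules_dict  # parsed dict; None if `rules` were empty or invalid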


class Document(Base):
    __tablename__ = "documents"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_pkey"),
        sa.Index("document_dataset_id_idx", "dataset_id"),
        sa.Index("document_is_paused_idx", "is_paused"),
        sa.Index("document_tenant_idx", "tenant_id"),
        sa.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    data_source_type: Mapped[str] = mapped_column(String(255), nullable=False)
    data_source_info = mapped_column(sa.Text, nullable=True)
    dataset_process_rule_id = mapped_column(StringUUID, nullable=True)
    batch: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_from: Mapped[str] = mapped_column(String(255), nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_api_request_id = mapped_column(StringUUID, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    # start processing
    processing_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # parsing
    file_id = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)  # TODO: make this not nullable
    parsing_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # cleaning
    cleaning_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # split
    splitting_completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # indexing
    tokens: Mapped[Optional[int]] = mapped_column(sa.Integer, nullable=True)
    indexing_latency: Mapped[Optional[float]] = mapped_column(sa.Float, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # pause
    is_paused: Mapped[Optional[bool]] = mapped_column(sa.Boolean, nullable=True, server_default=sa.text("false"))
    paused_by = mapped_column(StringUUID, nullable=True)
    paused_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # error
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    # basic fields
    indexing_status = mapped_column(String(255), nullable=False, server_default=sa.text("'waiting'::character varying"))
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    archived: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    archived_reason = mapped_column(String(255), nullable=True)
    archived_by = mapped_column(StringUUID, nullable=True)
    archived_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    doc_type = mapped_column(String(40), nullable=True)
    doc_metadata = mapped_column(JSONB, nullable=True)
    doc_form = mapped_column(String(255), nullable=False, server_default=sa.text("'text_model'::character varying"))
    doc_language = mapped_column(String(255), nullable=True)

    DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]

    @property
    def display_status(self):
        status = None
        if self.indexing_status == "waiting":
            status = "queuing"
        elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
            status = "paused"
        elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
            status = "indexing"
        elif self.indexing_status == "error":
            status = "error"
        elif self.indexing_status == "completed" and not self.archived and self.enabled:
            status = "available"
        elif self.indexing_status == "completed" and not self.archived and not self.enabled:
            status = "disabled"
        elif self.indexing_status == "completed" and self.archived:
            status = "archived"
        return status

    @property
    def data_source_info_dict(self):
        if self.data_source_info:
            try:
                data_source_info_dict = json.loads(self.data_source_info)
            except JSONDecodeError:
                data_source_info_dict = {}
            return data_source_info_dict
        return None

    @property
    def data_source_detail_dict(self):
        if self.data_source_info:
            if self.data_source_type == "upload_file":
                data_source_info_dict = json.loads(self.data_source_info)
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.id == data_source_info_dict["upload_file_id"])
                    .one_or_none()
                )
                if file_detail:
                    return {
                        "upload_file": {
                            "id": file_detail.id,
                            "name": file_detail.name,
                            "size": file_detail.size,
                            "extension": file_detail.extension,
                            "mime_type": file_detail.mime_type,
                            "created_by": file_detail.created_by,
                            "created_at": file_detail.created_at.timestamp(),
                        }
                    }
            elif self.data_source_type in {"notion_import", "website_crawl"}:
                return json.loads(self.data_source_info)
        return {}

    @property
    def average_segment_length(self):
        if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
            return self.word_count // self.segment_count
        return 0

    @property
    def dataset_process_rule(self):
        if self.dataset_process_rule_id:
            return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
        return None

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).one_or_none()

    @property
    def segment_count(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.document_id == self.id).count()

    @property
    def hit_count(self):
        return (
            db.session.query(DocumentSegment)
            .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
            .where(DocumentSegment.document_id == self.id)
            .scalar()
        )

    @property
    def uploader(self):
        user = db.session.query(Account).where(Account.id == self.created_by).first()
        return user.name if user else None

    @property
    def upload_date(self):
        return self.created_at

    @property
    def last_update_date(self):
        return self.updated_at

    @property
    def doc_metadata_details(self):
        if self.doc_metadata:
            document_metadatas = (
                db.session.query(DatasetMetadata)
                .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
                .where(
                    DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
                )
                .all()
            )
            metadata_list = []
            for metadata in document_metadatas:
                metadata_dict = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "type": metadata.type,
                    "value": self.doc_metadata.get(metadata.name),
                }
                metadata_list.append(metadata_dict)
            # handle built-in fields
            metadata_list.extend(self.get_built_in_fields())
            return metadata_list
        return None

    @property
    def process_rule_dict(self):
        if self.dataset_process_rule_id:
            return self.dataset_process_rule.to_dict()
        return None

    def get_built_in_fields(self):
        built_in_fields = []
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.document_name,
                "type": "string",
                "value": self.name,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.uploader,
                "type": "string",
                "value": self.uploader,
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.upload_date,
                "type": "time",
                "value": str(self.created_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.last_update_date,
                "type": "time",
                "value": str(self.updated_at.timestamp()),
            }
        )
        built_in_fields.append(
            {
                "id": "built-in",
                "name": BuiltInField.source,
                "type": "string",
                "value": MetadataDataSource[self.data_source_type].value,
            }
        )
        return built_in_fields

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "dataset_id": self.dataset_id,
            "position": self.position,
            "data_source_type": self.data_source_type,
            "data_source_info": self.data_source_info,
            "dataset_process_rule_id": self.dataset_process_rule_id,
            "batch": self.batch,
            "name": self.name,
            "created_from": self.created_from,
            "created_by": self.created_by,
            "created_api_request_id": self.created_api_request_id,
            "created_at": self.created_at,
            "processing_started_at": self.processing_started_at,
            "file_id": self.file_id,
            "word_count": self.word_count,
            "parsing_completed_at": self.parsing_completed_at,
            "cleaning_completed_at": self.cleaning_completed_at,
            "splitting_completed_at": self.splitting_completed_at,
            "tokens": self.tokens,
            "indexing_latency": self.indexing_latency,
            "completed_at": self.completed_at,
            "is_paused": self.is_paused,
            "paused_by": self.paused_by,
            "paused_at": self.paused_at,
            "error": self.error,
            "stopped_at": self.stopped_at,
            "indexing_status": self.indexing_status,
            "enabled": self.enabled,
            "disabled_at": self.disabled_at,
            "disabled_by": self.disabled_by,
            "archived": self.archived,
            "archived_reason": self.archived_reason,
            "archived_by": self.archived_by,
            "archived_at": self.archived_at,
            "updated_at": self.updated_at,
            "doc_type": self.doc_type,
            "doc_metadata": self.doc_metadata,
            "doc_form": self.doc_form,
            "doc_language": self.doc_language,
            "display_status": self.display_status,
            "data_source_info_dict": self.data_source_info_dict,
            "average_segment_length": self.average_segment_length,
            "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
            "dataset": self.dataset.to_dict() if self.dataset else None,
            "segment_count": self.segment_count,
            "hit_count": self.hit_count,
        }

    @classmethod
    def from_dict(cls, data: dict):
        return cls(
            id=data.get("id"),
            tenant_id=data.get("tenant_id"),
            dataset_id=data.get("dataset_id"),
            position=data.get("position"),
            data_source_type=data.get("data_source_type"),
            data_source_info=data.get("data_source_info"),
            dataset_process_rule_id=data.get("dataset_process_rule_id"),
            batch=data.get("batch"),
            name=data.get("name"),
            created_from=data.get("created_from"),
            created_by=data.get("created_by"),
            created_api_request_id=data.get("created_api_request_id"),
            created_at=data.get("created_at"),
            processing_started_at=data.get("processing_started_at"),
            file_id=data.get("file_id"),
            word_count=data.get("word_count"),
            parsing_completed_at=data.get("parsing_completed_at"),
            cleaning_completed_at=data.get("cleaning_completed_at"),
            splitting_completed_at=data.get("splitting_completed_at"),
            tokens=data.get("tokens"),
            indexing_latency=data.get("indexing_latency"),
            completed_at=data.get("completed_at"),
            is_paused=data.get("is_paused"),
            paused_by=data.get("paused_by"),
            paused_at=data.get("paused_at"),
            error=data.get("error"),
            stopped_at=data.get("stopped_at"),
            indexing_status=data.get("indexing_status"),
            enabled=data.get("enabled"),
            disabled_at=data.get("disabled_at"),
            disabled_by=data.get("disabled_by"),
            archived=data.get("archived"),
            archived_reason=data.get("archived_reason"),
            archived_by=data.get("archived_by"),
            archived_at=data.get("archived_at"),
            updated_at=data.get("updated_at"),
            doc_type=data.get("doc_type"),
            doc_metadata=data.get("doc_metadata"),
            doc_form=data.get("doc_form"),
            doc_language=data.get("doc_language"),
        )
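

# --- Illustrative usage (a sketch, not part of the upstream module) ---
# to_dict() and from_dict() are not symmetric: to_dict() also emits derived
# properties (display_status, segment_count, ...), while from_dict() reads
# only column fields via .get(), so the extra keys are simply ignored when
# reconstructing. The function name is hypothetical.
def _example_document_clone(doc: Document) -> Document:
    # note: the clone is a detached instance, not persisted to the session
    return Document.from_dict(doc.to_dict())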


class DocumentSegment(Base):
    __tablename__ = "document_segments"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        sa.Index("document_segment_dataset_id_idx", "dataset_id"),
        sa.Index("document_segment_document_id_idx", "document_id"),
        sa.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        sa.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int]
    content = mapped_column(sa.Text, nullable=False)
    answer = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[int]
    tokens: Mapped[int]

    # indexing fields
    keywords = mapped_column(sa.JSON, nullable=True)
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)

    # basic fields
    hit_count: Mapped[int] = mapped_column(sa.Integer, nullable=False, default=0)
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    status: Mapped[str] = mapped_column(String(255), server_default=sa.text("'waiting'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)

    @property
    def dataset(self):
        return db.session.scalar(select(Dataset).where(Dataset.id == self.dataset_id))

    @property
    def document(self):
        return db.session.scalar(select(Document).where(Document.id == self.document_id))

    @property
    def previous_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1
            )
        )

    @property
    def next_segment(self):
        return db.session.scalar(
            select(DocumentSegment).where(
                DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1
            )
        )

    @property
    def child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .where(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    def get_child_chunks(self):
        process_rule = self.document.dataset_process_rule
        if process_rule.mode == "hierarchical":
            rules = Rule(**process_rule.rules_dict)
            if rules.parent_mode:
                child_chunks = (
                    db.session.query(ChildChunk)
                    .where(ChildChunk.segment_id == self.id)
                    .order_by(ChildChunk.position.asc())
                    .all()
                )
                return child_chunks or []
            else:
                return []
        else:
            return []

    @property
    def sign_content(self):
        return self.get_sign_content()

    def get_sign_content(self):
        signed_urls = []
        text = self.content

        # For data before v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            base_url = f"/files/{upload_file_id}/image-preview"
            signed_url = f"{base_url}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For data after v0.10.0
        pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            base_url = f"/files/{upload_file_id}/file-preview"
            signed_url = f"{base_url}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # For tools directory - direct file formats (e.g., .png, .jpg, etc.)
        pattern = r"/files/tools/([a-f0-9\-]+)\.([a-zA-Z0-9]+)(?:\?.*?)?"
        matches = re.finditer(pattern, text)
        for match in matches:
            upload_file_id = match.group(1)
            file_extension = match.group(2)
            nonce = os.urandom(16).hex()
            timestamp = str(int(time.time()))
            data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
            secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
            encoded_sign = base64.urlsafe_b64encode(sign).decode()
            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
            base_url = f"/files/tools/{upload_file_id}.{file_extension}"
            signed_url = f"{base_url}?{params}"
            signed_urls.append((match.start(), match.end(), signed_url))

        # Reconstruct the text with signed URLs
        offset = 0
        for start, end, signed_url in signed_urls:
            text = text[: start + offset] + signed_url + text[end + offset :]
            offset += len(signed_url) - (end - start)
        return text
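

# --- Illustrative sketch (not part of the upstream module) ---
# get_sign_content() signs preview URLs with
# HMAC-SHA256(SECRET_KEY, "<prefix>|<file_id>|<timestamp>|<nonce>"), encoded
# with URL-safe base64. A verifier recomputes the digest from the query
# parameters and compares in constant time. The helper below is hypothetical;
# it mirrors the signing logic above for "file-preview" URLs and assumes
# access to the same SECRET_KEY.
def _example_verify_file_preview_sign(upload_file_id: str, timestamp: str, nonce: str, sign: str) -> bool:
    data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
    secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
    digest = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
    expected = base64.urlsafe_b64encode(digest).decode()
    # constant-time comparison avoids leaking the signature via timing
    return hmac.compare_digest(expected, sign)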


class ChildChunk(Base):
    __tablename__ = "child_chunks"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        sa.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        sa.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        sa.Index("child_chunks_segment_idx", "segment_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    segment_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)

    # indexing fields
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)
    type = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    indexing_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[Optional[datetime]] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)

    @property
    def dataset(self):
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        return db.session.query(Document).where(Document.id == self.document_id).first()

    @property
    def segment(self):
        return db.session.query(DocumentSegment).where(DocumentSegment.id == self.segment_id).first()


class AppDatasetJoin(Base):
    __tablename__ = "app_dataset_joins"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
        sa.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    app_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())

    @property
    def app(self):
        return db.session.get(App, self.app_id)


class DatasetQuery(Base):
    __tablename__ = "dataset_queries"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
        sa.Index("dataset_query_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    source: Mapped[str] = mapped_column(String(255), nullable=False)
    source_app_id = mapped_column(StringUUID, nullable=True)
    created_by_role = mapped_column(String, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())


class DatasetKeywordTable(Base):
    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        sa.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False, unique=True)
    keyword_table = mapped_column(sa.Text, nullable=False)
    data_source_type = mapped_column(
        String(255), nullable=False, server_default=sa.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self):
        class SetDecoder(json.JSONDecoder):
            def __init__(self, *args, **kwargs):
                super().__init__(object_hook=self.object_hook, *args, **kwargs)

            def object_hook(self, dct):
                if isinstance(dct, dict):
                    for keyword, node_idxs in dct.items():
                        if isinstance(node_idxs, list):
                            dct[keyword] = set(node_idxs)
                return dct

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                logging.exception("Failed to load keyword table from file: %s", file_key)
                return None
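

# --- Illustrative sketch (not part of the upstream module) ---
# JSON has no set type, so keyword tables are serialized with list values and
# rehydrated into sets on load, mirroring the SetDecoder hook above. A minimal
# round-trip with made-up keyword data; the function name is hypothetical.
def _example_keyword_table_roundtrip() -> dict[str, set[str]]:
    serialized = json.dumps({"alpha": ["node-1", "node-2"]})
    table = json.loads(serialized)
    # lists are converted back to sets, as SetDecoder.object_hook does
    return {keyword: set(node_ids) for keyword, node_ids in table.items()}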


class Embedding(Base):
    __tablename__ = "embeddings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="embedding_pkey"),
        sa.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        sa.Index("created_at_idx", "created_at"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    model_name = mapped_column(
        String(255), nullable=False, server_default=sa.text("'text-embedding-ada-002'::character varying")
    )
    hash = mapped_column(String(64), nullable=False)
    embedding = mapped_column(sa.LargeBinary, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = mapped_column(String(255), nullable=False, server_default=sa.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
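

# --- Illustrative usage (a sketch, not part of the upstream module) ---
# Embedding vectors are stored as pickled binary blobs; set_embedding() and
# get_embedding() are a symmetric pair. A round-trip with a made-up vector
# (no database session required; the hash value is a placeholder):
def _example_embedding_roundtrip() -> list[float]:
    embedding = Embedding(model_name="text-embedding-ada-002", hash="0" * 64, provider_name="")
    embedding.set_embedding([0.1, 0.2, 0.3])
    return embedding.get_embedding()  # -> [0.1, 0.2, 0.3]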


class DatasetCollectionBinding(Base):
    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        sa.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    provider_name: Mapped[str] = mapped_column(String(255), nullable=False)
    model_name: Mapped[str] = mapped_column(String(255), nullable=False)
    type = mapped_column(String(40), server_default=sa.text("'dataset'::character varying"), nullable=False)
    collection_name = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class TidbAuthBinding(Base):
    __tablename__ = "tidb_auth_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
        sa.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
        sa.Index("tidb_auth_bindings_active_idx", "active"),
        sa.Index("tidb_auth_bindings_created_at_idx", "created_at"),
        sa.Index("tidb_auth_bindings_status_idx", "status"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    cluster_id: Mapped[str] = mapped_column(String(255), nullable=False)
    cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
    active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=db.text("false"))
    status = mapped_column(String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
    account: Mapped[str] = mapped_column(String(255), nullable=False)
    password: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class Whitelist(Base):
    __tablename__ = "whitelists"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        sa.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetPermission(Base):
    __tablename__ = "dataset_permissions"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        sa.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        sa.Index("idx_dataset_permissions_account_id", "account_id"),
        sa.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = mapped_column(StringUUID, nullable=False)
    account_id = mapped_column(StringUUID, nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    has_permission: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class ExternalKnowledgeApis(Base):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        sa.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    description: Mapped[str] = mapped_column(String(255), nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    settings = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .where(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).where(Dataset.id.in_(dataset_ids)).all()
        dataset_bindings = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
        return dataset_bindings


class ExternalKnowledgeBindings(Base):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        sa.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_api_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_id = mapped_column(sa.Text, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())


class DatasetAutoDisableLog(Base):
    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        sa.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        sa.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        sa.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    notified: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class RateLimitLog(Base):
    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        sa.Index("rate_limit_log_tenant_idx", "tenant_id"),
        sa.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    subscription_plan: Mapped[str] = mapped_column(String(255), nullable=False)
    operation: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )


class DatasetMetadata(Base):
    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        sa.Index("dataset_metadata_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    type: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    created_by = mapped_column(StringUUID, nullable=False)
    updated_by = mapped_column(StringUUID, nullable=True)


class DatasetMetadataBinding(Base):
    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        sa.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        sa.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        sa.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    metadata_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = mapped_column(StringUUID, nullable=False)


class PipelineBuiltInTemplate(Base):  # type: ignore[name-defined]
    __tablename__ = "pipeline_built_in_templates"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_built_in_template_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    yaml_content = db.Column(db.Text, nullable=False)
    copyright = db.Column(db.String(255), nullable=False)
    privacy_policy = db.Column(db.String(255), nullable=False)
    position = db.Column(db.Integer, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)

    @property
    def created_user_name(self):
        account = db.session.query(Account).filter(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""


class PipelineCustomizedTemplate(Base):  # type: ignore[name-defined]
    __tablename__ = "pipeline_customized_templates"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="pipeline_customized_template_pkey"),
        db.Index("pipeline_customized_template_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    position = db.Column(db.Integer, nullable=False)
    yaml_content = db.Column(db.Text, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def created_user_name(self):
        account = db.session.query(Account).filter(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""


class Pipeline(Base):  # type: ignore[name-defined]
    __tablename__ = "pipelines"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id: Mapped[str] = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False, server_default=db.text("''::character varying"))
    workflow_id = db.Column(StringUUID, nullable=True)
    is_public = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    is_published = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def dataset(self):
        return db.session.query(Dataset).filter(Dataset.pipeline_id == self.id).first()


class DocumentPipelineExecutionLog(Base):
    __tablename__ = "document_pipeline_execution_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pipeline_execution_log_pkey"),
        db.Index("document_pipeline_execution_logs_document_id_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    pipeline_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    datasource_type = db.Column(db.String(255), nullable=False)
    datasource_info = db.Column(db.Text, nullable=False)
    datasource_node_id = db.Column(db.String(255), nullable=False)
    input_data = db.Column(db.JSON, nullable=False)
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())