You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

dataset.py 56KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331
  1. import base64
  2. import enum
  3. import hashlib
  4. import hmac
  5. import json
  6. import logging
  7. import os
  8. import pickle
  9. import re
  10. import time
  11. from datetime import datetime
  12. from json import JSONDecodeError
  13. from typing import Any, cast
  14. import sqlalchemy as sa
  15. from sqlalchemy import DateTime, String, func, select
  16. from sqlalchemy.dialects.postgresql import JSONB
  17. from sqlalchemy.orm import Mapped, Session, mapped_column
  18. from configs import dify_config
  19. from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource
  20. from core.rag.retrieval.retrieval_methods import RetrievalMethod
  21. from extensions.ext_storage import storage
  22. from services.entities.knowledge_entities.knowledge_entities import ParentMode, Rule
  23. from .account import Account
  24. from .base import Base
  25. from .engine import db
  26. from .model import App, Tag, TagBinding, UploadFile
  27. from .types import StringUUID
# Module-level logger, namespaced to this module per the project convention.
logger = logging.getLogger(__name__)
class DatasetPermissionEnum(enum.StrEnum):
    """Visibility scopes for a dataset within a tenant.

    Values are persisted as plain strings (see Dataset.permission's
    server default of 'only_me').
    """

    ONLY_ME = "only_me"
    ALL_TEAM = "all_team_members"
    PARTIAL_TEAM = "partial_members"
  33. class Dataset(Base):
  34. __tablename__ = "datasets"
  35. __table_args__ = (
  36. sa.PrimaryKeyConstraint("id", name="dataset_pkey"),
  37. sa.Index("dataset_tenant_idx", "tenant_id"),
  38. sa.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
  39. )
  40. INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
  41. PROVIDER_LIST = ["vendor", "external", None]
  42. id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
  43. tenant_id: Mapped[str] = mapped_column(StringUUID)
  44. name: Mapped[str] = mapped_column(String(255))
  45. description = mapped_column(sa.Text, nullable=True)
  46. provider: Mapped[str] = mapped_column(String(255), server_default=sa.text("'vendor'::character varying"))
  47. permission: Mapped[str] = mapped_column(String(255), server_default=sa.text("'only_me'::character varying"))
  48. data_source_type = mapped_column(String(255))
  49. indexing_technique: Mapped[str | None] = mapped_column(String(255))
  50. index_struct = mapped_column(sa.Text, nullable=True)
  51. created_by = mapped_column(StringUUID, nullable=False)
  52. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  53. updated_by = mapped_column(StringUUID, nullable=True)
  54. updated_at = mapped_column(db.DateTime, nullable=False, server_default=func.current_timestamp())
  55. embedding_model = mapped_column(db.String(255), nullable=True)
  56. embedding_model_provider = mapped_column(db.String(255), nullable=True)
  57. keyword_number = db.Column(db.Integer, nullable=True, server_default=db.text("10"))
  58. collection_binding_id = mapped_column(StringUUID, nullable=True)
  59. retrieval_model = mapped_column(JSONB, nullable=True)
  60. built_in_field_enabled = mapped_column(db.Boolean, nullable=False, server_default=db.text("false"))
  61. icon_info = db.Column(JSONB, nullable=True)
  62. runtime_mode = db.Column(db.String(255), nullable=True, server_default=db.text("'general'::character varying"))
  63. pipeline_id = db.Column(StringUUID, nullable=True)
  64. chunk_structure = db.Column(db.String(255), nullable=True)
  65. enable_api = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
  66. @property
  67. def total_documents(self):
  68. return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()
  69. @property
  70. def total_available_documents(self):
  71. return (
  72. db.session.query(func.count(Document.id))
  73. .where(
  74. Document.dataset_id == self.id,
  75. Document.indexing_status == "completed",
  76. Document.enabled == True,
  77. Document.archived == False,
  78. )
  79. .scalar()
  80. )
  81. @property
  82. def dataset_keyword_table(self):
  83. dataset_keyword_table = (
  84. db.session.query(DatasetKeywordTable).where(DatasetKeywordTable.dataset_id == self.id).first()
  85. )
  86. if dataset_keyword_table:
  87. return dataset_keyword_table
  88. return None
  89. @property
  90. def index_struct_dict(self):
  91. return json.loads(self.index_struct) if self.index_struct else None
  92. @property
  93. def external_retrieval_model(self):
  94. default_retrieval_model = {
  95. "top_k": 2,
  96. "score_threshold": 0.0,
  97. }
  98. return self.retrieval_model or default_retrieval_model
  99. @property
  100. def created_by_account(self):
  101. return db.session.get(Account, self.created_by)
  102. @property
  103. def latest_process_rule(self):
  104. return (
  105. db.session.query(DatasetProcessRule)
  106. .where(DatasetProcessRule.dataset_id == self.id)
  107. .order_by(DatasetProcessRule.created_at.desc())
  108. .first()
  109. )
  110. @property
  111. def app_count(self):
  112. return (
  113. db.session.query(func.count(AppDatasetJoin.id))
  114. .where(AppDatasetJoin.dataset_id == self.id, App.id == AppDatasetJoin.app_id)
  115. .scalar()
  116. )
  117. @property
  118. def document_count(self):
  119. return db.session.query(func.count(Document.id)).where(Document.dataset_id == self.id).scalar()
  120. @property
  121. def available_document_count(self):
  122. return (
  123. db.session.query(func.count(Document.id))
  124. .where(
  125. Document.dataset_id == self.id,
  126. Document.indexing_status == "completed",
  127. Document.enabled == True,
  128. Document.archived == False,
  129. )
  130. .scalar()
  131. )
  132. @property
  133. def available_segment_count(self):
  134. return (
  135. db.session.query(func.count(DocumentSegment.id))
  136. .where(
  137. DocumentSegment.dataset_id == self.id,
  138. DocumentSegment.status == "completed",
  139. DocumentSegment.enabled == True,
  140. )
  141. .scalar()
  142. )
  143. @property
  144. def word_count(self):
  145. return (
  146. db.session.query(Document)
  147. .with_entities(func.coalesce(func.sum(Document.word_count), 0))
  148. .where(Document.dataset_id == self.id)
  149. .scalar()
  150. )
  151. @property
  152. def doc_form(self) -> str | None:
  153. if self.chunk_structure:
  154. return self.chunk_structure
  155. document = db.session.query(Document).where(Document.dataset_id == self.id).first()
  156. if document:
  157. return document.doc_form
  158. return None
  159. @property
  160. def retrieval_model_dict(self):
  161. default_retrieval_model = {
  162. "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
  163. "reranking_enable": False,
  164. "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
  165. "top_k": 2,
  166. "score_threshold_enabled": False,
  167. }
  168. return self.retrieval_model or default_retrieval_model
  169. @property
  170. def tags(self):
  171. tags = (
  172. db.session.query(Tag)
  173. .join(TagBinding, Tag.id == TagBinding.tag_id)
  174. .where(
  175. TagBinding.target_id == self.id,
  176. TagBinding.tenant_id == self.tenant_id,
  177. Tag.tenant_id == self.tenant_id,
  178. Tag.type == "knowledge",
  179. )
  180. .all()
  181. )
  182. return tags or []
  183. @property
  184. def external_knowledge_info(self):
  185. if self.provider != "external":
  186. return None
  187. external_knowledge_binding = (
  188. db.session.query(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.dataset_id == self.id).first()
  189. )
  190. if not external_knowledge_binding:
  191. return None
  192. external_knowledge_api = db.session.scalar(
  193. select(ExternalKnowledgeApis).where(
  194. ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id
  195. )
  196. )
  197. if not external_knowledge_api:
  198. return None
  199. return {
  200. "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
  201. "external_knowledge_api_id": external_knowledge_api.id,
  202. "external_knowledge_api_name": external_knowledge_api.name,
  203. "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
  204. }
  205. @property
  206. def is_published(self):
  207. if self.pipeline_id:
  208. pipeline = db.session.query(Pipeline).where(Pipeline.id == self.pipeline_id).first()
  209. if pipeline:
  210. return pipeline.is_published
  211. return False
  212. @property
  213. def doc_metadata(self):
  214. dataset_metadatas = db.session.scalars(
  215. select(DatasetMetadata).where(DatasetMetadata.dataset_id == self.id)
  216. ).all()
  217. doc_metadata = [
  218. {
  219. "id": dataset_metadata.id,
  220. "name": dataset_metadata.name,
  221. "type": dataset_metadata.type,
  222. }
  223. for dataset_metadata in dataset_metadatas
  224. ]
  225. if self.built_in_field_enabled:
  226. doc_metadata.append(
  227. {
  228. "id": "built-in",
  229. "name": BuiltInField.document_name,
  230. "type": "string",
  231. }
  232. )
  233. doc_metadata.append(
  234. {
  235. "id": "built-in",
  236. "name": BuiltInField.uploader,
  237. "type": "string",
  238. }
  239. )
  240. doc_metadata.append(
  241. {
  242. "id": "built-in",
  243. "name": BuiltInField.upload_date,
  244. "type": "time",
  245. }
  246. )
  247. doc_metadata.append(
  248. {
  249. "id": "built-in",
  250. "name": BuiltInField.last_update_date,
  251. "type": "time",
  252. }
  253. )
  254. doc_metadata.append(
  255. {
  256. "id": "built-in",
  257. "name": BuiltInField.source,
  258. "type": "string",
  259. }
  260. )
  261. return doc_metadata
  262. @staticmethod
  263. def gen_collection_name_by_id(dataset_id: str) -> str:
  264. normalized_dataset_id = dataset_id.replace("-", "_")
  265. return f"{dify_config.VECTOR_INDEX_NAME_PREFIX}_{normalized_dataset_id}_Node"
class DatasetProcessRule(Base):
    """Chunking/cleaning rule set applied when indexing a dataset's documents."""

    __tablename__ = "dataset_process_rules"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_process_rule_pkey"),
        sa.Index("dataset_process_rule_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    dataset_id = mapped_column(StringUUID, nullable=False)
    # One of MODES below; defaults to "automatic".
    mode = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    # JSON-encoded rule payload; parsed lazily via rules_dict.
    rules = mapped_column(sa.Text, nullable=True)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    MODES = ["automatic", "custom", "hierarchical"]
    PRE_PROCESSING_RULES = ["remove_stopwords", "remove_extra_spaces", "remove_urls_emails"]
    # Canonical rule payload used for "automatic" mode.
    AUTOMATIC_RULES: dict[str, Any] = {
        "pre_processing_rules": [
            {"id": "remove_extra_spaces", "enabled": True},
            {"id": "remove_urls_emails", "enabled": False},
        ],
        "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
    }

    def to_dict(self) -> dict[str, Any]:
        """Serialize the row, with the rules text parsed into a dict."""
        return {
            "id": self.id,
            "dataset_id": self.dataset_id,
            "mode": self.mode,
            "rules": self.rules_dict,
        }

    @property
    def rules_dict(self) -> dict[str, Any] | None:
        """Parsed rules JSON; None when empty or when the JSON is invalid."""
        try:
            return json.loads(self.rules) if self.rules else None
        except JSONDecodeError:
            return None
  300. class Document(Base):
  301. __tablename__ = "documents"
  302. __table_args__ = (
  303. sa.PrimaryKeyConstraint("id", name="document_pkey"),
  304. sa.Index("document_dataset_id_idx", "dataset_id"),
  305. sa.Index("document_is_paused_idx", "is_paused"),
  306. sa.Index("document_tenant_idx", "tenant_id"),
  307. sa.Index("document_metadata_idx", "doc_metadata", postgresql_using="gin"),
  308. )
  309. # initial fields
  310. id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
  311. tenant_id = mapped_column(StringUUID, nullable=False)
  312. dataset_id = mapped_column(StringUUID, nullable=False)
  313. position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
  314. data_source_type: Mapped[str] = mapped_column(String(255), nullable=False)
  315. data_source_info = mapped_column(sa.Text, nullable=True)
  316. dataset_process_rule_id = mapped_column(StringUUID, nullable=True)
  317. batch: Mapped[str] = mapped_column(String(255), nullable=False)
  318. name: Mapped[str] = mapped_column(String(255), nullable=False)
  319. created_from: Mapped[str] = mapped_column(String(255), nullable=False)
  320. created_by = mapped_column(StringUUID, nullable=False)
  321. created_api_request_id = mapped_column(StringUUID, nullable=True)
  322. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  323. # start processing
  324. processing_started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  325. # parsing
  326. file_id = mapped_column(sa.Text, nullable=True)
  327. word_count: Mapped[int | None] = mapped_column(sa.Integer, nullable=True) # TODO: make this not nullable
  328. parsing_completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  329. # cleaning
  330. cleaning_completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  331. # split
  332. splitting_completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  333. # indexing
  334. tokens: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
  335. indexing_latency: Mapped[float | None] = mapped_column(sa.Float, nullable=True)
  336. completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  337. # pause
  338. is_paused: Mapped[bool | None] = mapped_column(sa.Boolean, nullable=True, server_default=sa.text("false"))
  339. paused_by = mapped_column(StringUUID, nullable=True)
  340. paused_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  341. # error
  342. error = mapped_column(sa.Text, nullable=True)
  343. stopped_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  344. # basic fields
  345. indexing_status = mapped_column(String(255), nullable=False, server_default=sa.text("'waiting'::character varying"))
  346. enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
  347. disabled_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  348. disabled_by = mapped_column(StringUUID, nullable=True)
  349. archived: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
  350. archived_reason = mapped_column(String(255), nullable=True)
  351. archived_by = mapped_column(StringUUID, nullable=True)
  352. archived_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  353. updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  354. doc_type = mapped_column(String(40), nullable=True)
  355. doc_metadata = mapped_column(JSONB, nullable=True)
  356. doc_form = mapped_column(String(255), nullable=False, server_default=sa.text("'text_model'::character varying"))
  357. doc_language = mapped_column(String(255), nullable=True)
  358. DATA_SOURCES = ["upload_file", "notion_import", "website_crawl"]
  359. @property
  360. def display_status(self):
  361. status = None
  362. if self.indexing_status == "waiting":
  363. status = "queuing"
  364. elif self.indexing_status not in {"completed", "error", "waiting"} and self.is_paused:
  365. status = "paused"
  366. elif self.indexing_status in {"parsing", "cleaning", "splitting", "indexing"}:
  367. status = "indexing"
  368. elif self.indexing_status == "error":
  369. status = "error"
  370. elif self.indexing_status == "completed" and not self.archived and self.enabled:
  371. status = "available"
  372. elif self.indexing_status == "completed" and not self.archived and not self.enabled:
  373. status = "disabled"
  374. elif self.indexing_status == "completed" and self.archived:
  375. status = "archived"
  376. return status
  377. @property
  378. def data_source_info_dict(self) -> dict[str, Any]:
  379. if self.data_source_info:
  380. try:
  381. data_source_info_dict: dict[str, Any] = json.loads(self.data_source_info)
  382. except JSONDecodeError:
  383. data_source_info_dict = {}
  384. return data_source_info_dict
  385. return {}
  386. @property
  387. def data_source_detail_dict(self) -> dict[str, Any]:
  388. if self.data_source_info:
  389. if self.data_source_type == "upload_file":
  390. data_source_info_dict: dict[str, Any] = json.loads(self.data_source_info)
  391. file_detail = (
  392. db.session.query(UploadFile)
  393. .where(UploadFile.id == data_source_info_dict["upload_file_id"])
  394. .one_or_none()
  395. )
  396. if file_detail:
  397. return {
  398. "upload_file": {
  399. "id": file_detail.id,
  400. "name": file_detail.name,
  401. "size": file_detail.size,
  402. "extension": file_detail.extension,
  403. "mime_type": file_detail.mime_type,
  404. "created_by": file_detail.created_by,
  405. "created_at": file_detail.created_at.timestamp(),
  406. }
  407. }
  408. elif self.data_source_type in {"notion_import", "website_crawl"}:
  409. result: dict[str, Any] = json.loads(self.data_source_info)
  410. return result
  411. return {}
  412. @property
  413. def average_segment_length(self):
  414. if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
  415. return self.word_count // self.segment_count
  416. return 0
  417. @property
  418. def dataset_process_rule(self):
  419. if self.dataset_process_rule_id:
  420. return db.session.get(DatasetProcessRule, self.dataset_process_rule_id)
  421. return None
  422. @property
  423. def dataset(self):
  424. return db.session.query(Dataset).where(Dataset.id == self.dataset_id).one_or_none()
  425. @property
  426. def segment_count(self):
  427. return db.session.query(DocumentSegment).where(DocumentSegment.document_id == self.id).count()
  428. @property
  429. def hit_count(self):
  430. return (
  431. db.session.query(DocumentSegment)
  432. .with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0))
  433. .where(DocumentSegment.document_id == self.id)
  434. .scalar()
  435. )
  436. @property
  437. def uploader(self):
  438. user = db.session.query(Account).where(Account.id == self.created_by).first()
  439. return user.name if user else None
  440. @property
  441. def upload_date(self):
  442. return self.created_at
  443. @property
  444. def last_update_date(self):
  445. return self.updated_at
  446. @property
  447. def doc_metadata_details(self) -> list[dict[str, Any]] | None:
  448. if self.doc_metadata:
  449. document_metadatas = (
  450. db.session.query(DatasetMetadata)
  451. .join(DatasetMetadataBinding, DatasetMetadataBinding.metadata_id == DatasetMetadata.id)
  452. .where(
  453. DatasetMetadataBinding.dataset_id == self.dataset_id, DatasetMetadataBinding.document_id == self.id
  454. )
  455. .all()
  456. )
  457. metadata_list: list[dict[str, Any]] = []
  458. for metadata in document_metadatas:
  459. metadata_dict: dict[str, Any] = {
  460. "id": metadata.id,
  461. "name": metadata.name,
  462. "type": metadata.type,
  463. "value": self.doc_metadata.get(metadata.name),
  464. }
  465. metadata_list.append(metadata_dict)
  466. # deal built-in fields
  467. metadata_list.extend(self.get_built_in_fields())
  468. return metadata_list
  469. return None
  470. @property
  471. def process_rule_dict(self) -> dict[str, Any] | None:
  472. if self.dataset_process_rule_id and self.dataset_process_rule:
  473. return self.dataset_process_rule.to_dict()
  474. return None
  475. def get_built_in_fields(self) -> list[dict[str, Any]]:
  476. built_in_fields: list[dict[str, Any]] = []
  477. built_in_fields.append(
  478. {
  479. "id": "built-in",
  480. "name": BuiltInField.document_name,
  481. "type": "string",
  482. "value": self.name,
  483. }
  484. )
  485. built_in_fields.append(
  486. {
  487. "id": "built-in",
  488. "name": BuiltInField.uploader,
  489. "type": "string",
  490. "value": self.uploader,
  491. }
  492. )
  493. built_in_fields.append(
  494. {
  495. "id": "built-in",
  496. "name": BuiltInField.upload_date,
  497. "type": "time",
  498. "value": str(self.created_at.timestamp()),
  499. }
  500. )
  501. built_in_fields.append(
  502. {
  503. "id": "built-in",
  504. "name": BuiltInField.last_update_date,
  505. "type": "time",
  506. "value": str(self.updated_at.timestamp()),
  507. }
  508. )
  509. built_in_fields.append(
  510. {
  511. "id": "built-in",
  512. "name": BuiltInField.source,
  513. "type": "string",
  514. "value": MetadataDataSource[self.data_source_type],
  515. }
  516. )
  517. return built_in_fields
  518. def to_dict(self) -> dict[str, Any]:
  519. return {
  520. "id": self.id,
  521. "tenant_id": self.tenant_id,
  522. "dataset_id": self.dataset_id,
  523. "position": self.position,
  524. "data_source_type": self.data_source_type,
  525. "data_source_info": self.data_source_info,
  526. "dataset_process_rule_id": self.dataset_process_rule_id,
  527. "batch": self.batch,
  528. "name": self.name,
  529. "created_from": self.created_from,
  530. "created_by": self.created_by,
  531. "created_api_request_id": self.created_api_request_id,
  532. "created_at": self.created_at,
  533. "processing_started_at": self.processing_started_at,
  534. "file_id": self.file_id,
  535. "word_count": self.word_count,
  536. "parsing_completed_at": self.parsing_completed_at,
  537. "cleaning_completed_at": self.cleaning_completed_at,
  538. "splitting_completed_at": self.splitting_completed_at,
  539. "tokens": self.tokens,
  540. "indexing_latency": self.indexing_latency,
  541. "completed_at": self.completed_at,
  542. "is_paused": self.is_paused,
  543. "paused_by": self.paused_by,
  544. "paused_at": self.paused_at,
  545. "error": self.error,
  546. "stopped_at": self.stopped_at,
  547. "indexing_status": self.indexing_status,
  548. "enabled": self.enabled,
  549. "disabled_at": self.disabled_at,
  550. "disabled_by": self.disabled_by,
  551. "archived": self.archived,
  552. "archived_reason": self.archived_reason,
  553. "archived_by": self.archived_by,
  554. "archived_at": self.archived_at,
  555. "updated_at": self.updated_at,
  556. "doc_type": self.doc_type,
  557. "doc_metadata": self.doc_metadata,
  558. "doc_form": self.doc_form,
  559. "doc_language": self.doc_language,
  560. "display_status": self.display_status,
  561. "data_source_info_dict": self.data_source_info_dict,
  562. "average_segment_length": self.average_segment_length,
  563. "dataset_process_rule": self.dataset_process_rule.to_dict() if self.dataset_process_rule else None,
  564. "dataset": None, # Dataset class doesn't have a to_dict method
  565. "segment_count": self.segment_count,
  566. "hit_count": self.hit_count,
  567. }
  568. @classmethod
  569. def from_dict(cls, data: dict[str, Any]):
  570. return cls(
  571. id=data.get("id"),
  572. tenant_id=data.get("tenant_id"),
  573. dataset_id=data.get("dataset_id"),
  574. position=data.get("position"),
  575. data_source_type=data.get("data_source_type"),
  576. data_source_info=data.get("data_source_info"),
  577. dataset_process_rule_id=data.get("dataset_process_rule_id"),
  578. batch=data.get("batch"),
  579. name=data.get("name"),
  580. created_from=data.get("created_from"),
  581. created_by=data.get("created_by"),
  582. created_api_request_id=data.get("created_api_request_id"),
  583. created_at=data.get("created_at"),
  584. processing_started_at=data.get("processing_started_at"),
  585. file_id=data.get("file_id"),
  586. word_count=data.get("word_count"),
  587. parsing_completed_at=data.get("parsing_completed_at"),
  588. cleaning_completed_at=data.get("cleaning_completed_at"),
  589. splitting_completed_at=data.get("splitting_completed_at"),
  590. tokens=data.get("tokens"),
  591. indexing_latency=data.get("indexing_latency"),
  592. completed_at=data.get("completed_at"),
  593. is_paused=data.get("is_paused"),
  594. paused_by=data.get("paused_by"),
  595. paused_at=data.get("paused_at"),
  596. error=data.get("error"),
  597. stopped_at=data.get("stopped_at"),
  598. indexing_status=data.get("indexing_status"),
  599. enabled=data.get("enabled"),
  600. disabled_at=data.get("disabled_at"),
  601. disabled_by=data.get("disabled_by"),
  602. archived=data.get("archived"),
  603. archived_reason=data.get("archived_reason"),
  604. archived_by=data.get("archived_by"),
  605. archived_at=data.get("archived_at"),
  606. updated_at=data.get("updated_at"),
  607. doc_type=data.get("doc_type"),
  608. doc_metadata=data.get("doc_metadata"),
  609. doc_form=data.get("doc_form"),
  610. doc_language=data.get("doc_language"),
  611. )
class DocumentSegment(Base):
    """One chunk of a Document, with its content and indexing state."""

    __tablename__ = "document_segments"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="document_segment_pkey"),
        sa.Index("document_segment_dataset_id_idx", "dataset_id"),
        sa.Index("document_segment_document_id_idx", "document_id"),
        sa.Index("document_segment_tenant_dataset_idx", "dataset_id", "tenant_id"),
        sa.Index("document_segment_tenant_document_idx", "document_id", "tenant_id"),
        sa.Index("document_segment_node_dataset_idx", "index_node_id", "dataset_id"),
        sa.Index("document_segment_tenant_idx", "tenant_id"),
    )

    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    # Ordering index within the document; neighbors are looked up at ±1.
    position: Mapped[int]
    content = mapped_column(sa.Text, nullable=False)
    # Optional answer text — presumably for Q&A-form datasets; confirm.
    answer = mapped_column(sa.Text, nullable=True)
    word_count: Mapped[int]
    tokens: Mapped[int]

    # indexing fields
    keywords = mapped_column(sa.JSON, nullable=True)
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)

    # basic fields
    hit_count: Mapped[int] = mapped_column(sa.Integer, nullable=False, default=0)
    enabled: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    disabled_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    disabled_by = mapped_column(StringUUID, nullable=True)
    status: Mapped[str] = mapped_column(String(255), server_default=sa.text("'waiting'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    indexing_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)
    stopped_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
  651. @property
  652. def dataset(self):
  653. return db.session.scalar(select(Dataset).where(Dataset.id == self.dataset_id))
  654. @property
  655. def document(self):
  656. return db.session.scalar(select(Document).where(Document.id == self.document_id))
  657. @property
  658. def previous_segment(self):
  659. return db.session.scalar(
  660. select(DocumentSegment).where(
  661. DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position - 1
  662. )
  663. )
  664. @property
  665. def next_segment(self):
  666. return db.session.scalar(
  667. select(DocumentSegment).where(
  668. DocumentSegment.document_id == self.document_id, DocumentSegment.position == self.position + 1
  669. )
  670. )
  671. @property
  672. def child_chunks(self) -> list[Any]:
  673. if not self.document:
  674. return []
  675. process_rule = self.document.dataset_process_rule
  676. if process_rule and process_rule.mode == "hierarchical":
  677. rules_dict = process_rule.rules_dict
  678. if rules_dict:
  679. rules = Rule(**rules_dict)
  680. if rules.parent_mode and rules.parent_mode != ParentMode.FULL_DOC:
  681. child_chunks = (
  682. db.session.query(ChildChunk)
  683. .where(ChildChunk.segment_id == self.id)
  684. .order_by(ChildChunk.position.asc())
  685. .all()
  686. )
  687. return child_chunks or []
  688. return []
  689. def get_child_chunks(self) -> list[Any]:
  690. if not self.document:
  691. return []
  692. process_rule = self.document.dataset_process_rule
  693. if process_rule and process_rule.mode == "hierarchical":
  694. rules_dict = process_rule.rules_dict
  695. if rules_dict:
  696. rules = Rule(**rules_dict)
  697. if rules.parent_mode:
  698. child_chunks = (
  699. db.session.query(ChildChunk)
  700. .where(ChildChunk.segment_id == self.id)
  701. .order_by(ChildChunk.position.asc())
  702. .all()
  703. )
  704. return child_chunks or []
  705. return []
    @property
    def sign_content(self) -> str:
        """Segment content with internal file URLs replaced by signed URLs."""
        return self.get_sign_content()
  709. def get_sign_content(self) -> str:
  710. signed_urls: list[tuple[int, int, str]] = []
  711. text = self.content
  712. # For data before v0.10.0
  713. pattern = r"/files/([a-f0-9\-]+)/image-preview(?:\?.*?)?"
  714. matches = re.finditer(pattern, text)
  715. for match in matches:
  716. upload_file_id = match.group(1)
  717. nonce = os.urandom(16).hex()
  718. timestamp = str(int(time.time()))
  719. data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
  720. secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
  721. sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
  722. encoded_sign = base64.urlsafe_b64encode(sign).decode()
  723. params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
  724. base_url = f"/files/{upload_file_id}/image-preview"
  725. signed_url = f"{base_url}?{params}"
  726. signed_urls.append((match.start(), match.end(), signed_url))
  727. # For data after v0.10.0
  728. pattern = r"/files/([a-f0-9\-]+)/file-preview(?:\?.*?)?"
  729. matches = re.finditer(pattern, text)
  730. for match in matches:
  731. upload_file_id = match.group(1)
  732. nonce = os.urandom(16).hex()
  733. timestamp = str(int(time.time()))
  734. data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
  735. secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
  736. sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
  737. encoded_sign = base64.urlsafe_b64encode(sign).decode()
  738. params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
  739. base_url = f"/files/{upload_file_id}/file-preview"
  740. signed_url = f"{base_url}?{params}"
  741. signed_urls.append((match.start(), match.end(), signed_url))
  742. # For tools directory - direct file formats (e.g., .png, .jpg, etc.)
  743. # Match URL including any query parameters up to common URL boundaries (space, parenthesis, quotes)
  744. pattern = r"/files/tools/([a-f0-9\-]+)\.([a-zA-Z0-9]+)(?:\?[^\s\)\"\']*)?"
  745. matches = re.finditer(pattern, text)
  746. for match in matches:
  747. upload_file_id = match.group(1)
  748. file_extension = match.group(2)
  749. nonce = os.urandom(16).hex()
  750. timestamp = str(int(time.time()))
  751. data_to_sign = f"file-preview|{upload_file_id}|{timestamp}|{nonce}"
  752. secret_key = dify_config.SECRET_KEY.encode() if dify_config.SECRET_KEY else b""
  753. sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
  754. encoded_sign = base64.urlsafe_b64encode(sign).decode()
  755. params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
  756. base_url = f"/files/tools/{upload_file_id}.{file_extension}"
  757. signed_url = f"{base_url}?{params}"
  758. signed_urls.append((match.start(), match.end(), signed_url))
  759. # Reconstruct the text with signed URLs
  760. offset = 0
  761. for start, end, signed_url in signed_urls:
  762. text = text[: start + offset] + signed_url + text[end + offset :]
  763. offset += len(signed_url) - (end - start)
  764. return text
class ChildChunk(Base):
    """ORM model for a child chunk: a sub-piece of a DocumentSegment produced
    by hierarchical (parent/child) chunking and indexed on its own node id."""

    __tablename__ = "child_chunks"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="child_chunk_pkey"),
        sa.Index("child_chunk_dataset_id_idx", "tenant_id", "dataset_id", "document_id", "segment_id", "index_node_id"),
        sa.Index("child_chunks_node_idx", "index_node_id", "dataset_id"),
        sa.Index("child_chunks_segment_idx", "segment_id"),
    )
    # initial fields
    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    # parent DocumentSegment this chunk belongs to
    segment_id = mapped_column(StringUUID, nullable=False)
    position: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    content = mapped_column(sa.Text, nullable=False)
    word_count: Mapped[int] = mapped_column(sa.Integer, nullable=False)
    # indexing fields
    index_node_id = mapped_column(String(255), nullable=True)
    index_node_hash = mapped_column(String(255), nullable=True)
    # server default is 'automatic'
    type = mapped_column(String(255), nullable=False, server_default=sa.text("'automatic'::character varying"))
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    indexing_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    error = mapped_column(sa.Text, nullable=True)

    @property
    def dataset(self):
        """Owning Dataset row, or None if it no longer exists."""
        return db.session.query(Dataset).where(Dataset.id == self.dataset_id).first()

    @property
    def document(self):
        """Owning Document row, or None if it no longer exists."""
        return db.session.query(Document).where(Document.id == self.document_id).first()

    @property
    def segment(self):
        """Parent DocumentSegment row, or None if it no longer exists."""
        return db.session.query(DocumentSegment).where(DocumentSegment.id == self.segment_id).first()
  806. class AppDatasetJoin(Base):
  807. __tablename__ = "app_dataset_joins"
  808. __table_args__ = (
  809. sa.PrimaryKeyConstraint("id", name="app_dataset_join_pkey"),
  810. sa.Index("app_dataset_join_app_dataset_idx", "dataset_id", "app_id"),
  811. )
  812. id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
  813. app_id = mapped_column(StringUUID, nullable=False)
  814. dataset_id = mapped_column(StringUUID, nullable=False)
  815. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())
  816. @property
  817. def app(self):
  818. return db.session.get(App, self.app_id)
  819. class DatasetQuery(Base):
  820. __tablename__ = "dataset_queries"
  821. __table_args__ = (
  822. sa.PrimaryKeyConstraint("id", name="dataset_query_pkey"),
  823. sa.Index("dataset_query_dataset_id_idx", "dataset_id"),
  824. )
  825. id = mapped_column(StringUUID, primary_key=True, nullable=False, server_default=sa.text("uuid_generate_v4()"))
  826. dataset_id = mapped_column(StringUUID, nullable=False)
  827. content = mapped_column(sa.Text, nullable=False)
  828. source: Mapped[str] = mapped_column(String(255), nullable=False)
  829. source_app_id = mapped_column(StringUUID, nullable=True)
  830. created_by_role = mapped_column(String, nullable=False)
  831. created_by = mapped_column(StringUUID, nullable=False)
  832. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=db.func.current_timestamp())
class DatasetKeywordTable(Base):
    """Serialized keyword index for one dataset.

    The JSON-encoded table lives either inline in ``keyword_table`` (when
    ``data_source_type`` is 'database') or in object storage under
    ``keyword_files/<tenant_id>/<dataset_id>.txt``.
    """

    __tablename__ = "dataset_keyword_tables"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_keyword_table_pkey"),
        sa.Index("dataset_keyword_table_dataset_id_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    # one keyword table per dataset
    dataset_id = mapped_column(StringUUID, nullable=False, unique=True)
    keyword_table = mapped_column(sa.Text, nullable=False)
    data_source_type = mapped_column(
        String(255), nullable=False, server_default=sa.text("'database'::character varying")
    )

    @property
    def keyword_table_dict(self) -> dict[str, set[Any]] | None:
        """Deserialize the keyword table to ``{keyword: set of node ids}``.

        Returns None when the dataset is gone, the payload is empty, or the
        storage read fails (failure is logged, not raised).
        """

        class SetDecoder(json.JSONDecoder):
            # JSON decoder whose object hook converts each list value back
            # into a set (sets are serialized as JSON lists).
            def __init__(self, *args: Any, **kwargs: Any) -> None:
                def object_hook(dct: Any) -> Any:
                    if isinstance(dct, dict):
                        result: dict[str, Any] = {}
                        items = cast(dict[str, Any], dct).items()
                        for keyword, node_idxs in items:
                            if isinstance(node_idxs, list):
                                result[keyword] = set(cast(list[Any], node_idxs))
                            else:
                                result[keyword] = node_idxs
                        return result
                    return dct

                super().__init__(object_hook=object_hook, *args, **kwargs)

        # get dataset
        dataset = db.session.query(Dataset).filter_by(id=self.dataset_id).first()
        if not dataset:
            return None
        if self.data_source_type == "database":
            return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
        else:
            file_key = "keyword_files/" + dataset.tenant_id + "/" + self.dataset_id + ".txt"
            try:
                keyword_table_text = storage.load_once(file_key)
                if keyword_table_text:
                    return json.loads(keyword_table_text.decode("utf-8"), cls=SetDecoder)
                return None
            except Exception:
                # best-effort: a missing/corrupt file degrades to "no table"
                logger.exception("Failed to load keyword table from file: %s", file_key)
                return None
class Embedding(Base):
    """Cache of a computed text embedding, unique per
    (model_name, hash, provider_name); the vector is stored pickled."""

    __tablename__ = "embeddings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="embedding_pkey"),
        sa.UniqueConstraint("model_name", "hash", "provider_name", name="embedding_hash_idx"),
        sa.Index("created_at_idx", "created_at"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    model_name = mapped_column(
        String(255), nullable=False, server_default=sa.text("'text-embedding-ada-002'::character varying")
    )
    # hash of the embedded text, part of the uniqueness key
    hash = mapped_column(String(64), nullable=False)
    # pickled list[float]; written by set_embedding, read by get_embedding
    embedding = mapped_column(sa.LargeBinary, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    provider_name = mapped_column(String(255), nullable=False, server_default=sa.text("''::character varying"))

    def set_embedding(self, embedding_data: list[float]):
        """Pickle and store the embedding vector."""
        self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)

    def get_embedding(self) -> list[float]:
        """Unpickle and return the stored embedding vector.

        Only data written by set_embedding is ever loaded here, hence the
        deliberate pickle usage.
        """
        return cast(list[float], pickle.loads(self.embedding))  # noqa: S301
class DatasetCollectionBinding(Base):
    """Binding of an embedding provider/model pair to a vector-store
    collection name."""

    __tablename__ = "dataset_collection_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_collection_bindings_pkey"),
        sa.Index("provider_model_name_idx", "provider_name", "model_name"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    provider_name: Mapped[str] = mapped_column(String(255), nullable=False)
    model_name: Mapped[str] = mapped_column(String(255), nullable=False)
    # server default is 'dataset'
    type = mapped_column(String(40), server_default=sa.text("'dataset'::character varying"), nullable=False)
    collection_name = mapped_column(String(64), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  908. class TidbAuthBinding(Base):
  909. __tablename__ = "tidb_auth_bindings"
  910. __table_args__ = (
  911. sa.PrimaryKeyConstraint("id", name="tidb_auth_bindings_pkey"),
  912. sa.Index("tidb_auth_bindings_tenant_idx", "tenant_id"),
  913. sa.Index("tidb_auth_bindings_active_idx", "active"),
  914. sa.Index("tidb_auth_bindings_created_at_idx", "created_at"),
  915. sa.Index("tidb_auth_bindings_status_idx", "status"),
  916. )
  917. id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
  918. tenant_id = mapped_column(StringUUID, nullable=True)
  919. cluster_id: Mapped[str] = mapped_column(String(255), nullable=False)
  920. cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
  921. active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=db.text("false"))
  922. status = mapped_column(String(255), nullable=False, server_default=db.text("'CREATING'::character varying"))
  923. account: Mapped[str] = mapped_column(String(255), nullable=False)
  924. password: Mapped[str] = mapped_column(String(255), nullable=False)
  925. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
class Whitelist(Base):
    """Whitelist entry granting a tenant access to a feature category."""

    __tablename__ = "whitelists"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="whitelists_pkey"),
        sa.Index("whitelists_tenant_idx", "tenant_id"),
    )

    id = mapped_column(StringUUID, primary_key=True, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=True)
    category: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
class DatasetPermission(Base):
    """Per-account permission flag on a dataset within a tenant."""

    __tablename__ = "dataset_permissions"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_permission_pkey"),
        sa.Index("idx_dataset_permissions_dataset_id", "dataset_id"),
        sa.Index("idx_dataset_permissions_account_id", "account_id"),
        sa.Index("idx_dataset_permissions_tenant_id", "tenant_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"), primary_key=True)
    dataset_id = mapped_column(StringUUID, nullable=False)
    account_id = mapped_column(StringUUID, nullable=False)
    tenant_id = mapped_column(StringUUID, nullable=False)
    # defaults to granted (server default true)
    has_permission: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  950. class ExternalKnowledgeApis(Base):
  951. __tablename__ = "external_knowledge_apis"
  952. __table_args__ = (
  953. sa.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
  954. sa.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
  955. sa.Index("external_knowledge_apis_name_idx", "name"),
  956. )
  957. id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
  958. name: Mapped[str] = mapped_column(String(255), nullable=False)
  959. description: Mapped[str] = mapped_column(String(255), nullable=False)
  960. tenant_id = mapped_column(StringUUID, nullable=False)
  961. settings = mapped_column(sa.Text, nullable=True)
  962. created_by = mapped_column(StringUUID, nullable=False)
  963. created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  964. updated_by = mapped_column(StringUUID, nullable=True)
  965. updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
  966. def to_dict(self) -> dict[str, Any]:
  967. return {
  968. "id": self.id,
  969. "tenant_id": self.tenant_id,
  970. "name": self.name,
  971. "description": self.description,
  972. "settings": self.settings_dict,
  973. "dataset_bindings": self.dataset_bindings,
  974. "created_by": self.created_by,
  975. "created_at": self.created_at.isoformat(),
  976. }
  977. @property
  978. def settings_dict(self) -> dict[str, Any] | None:
  979. try:
  980. return json.loads(self.settings) if self.settings else None
  981. except JSONDecodeError:
  982. return None
  983. @property
  984. def dataset_bindings(self) -> list[dict[str, Any]]:
  985. external_knowledge_bindings = db.session.scalars(
  986. select(ExternalKnowledgeBindings).where(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
  987. ).all()
  988. dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
  989. datasets = db.session.scalars(select(Dataset).where(Dataset.id.in_(dataset_ids))).all()
  990. dataset_bindings: list[dict[str, Any]] = []
  991. for dataset in datasets:
  992. dataset_bindings.append({"id": dataset.id, "name": dataset.name})
  993. return dataset_bindings
class ExternalKnowledgeBindings(Base):
    """Binding between a dataset and an external knowledge source exposed
    through an ExternalKnowledgeApis record."""

    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        sa.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        sa.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        sa.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = mapped_column(StringUUID, nullable=False, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    external_knowledge_api_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    # identifier of the knowledge entity on the external system
    external_knowledge_id = mapped_column(sa.Text, nullable=False)
    created_by = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = mapped_column(StringUUID, nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
class DatasetAutoDisableLog(Base):
    """Log row recording that a document was automatically disabled, with a
    flag tracking whether the tenant was notified."""

    __tablename__ = "dataset_auto_disable_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_auto_disable_log_pkey"),
        sa.Index("dataset_auto_disable_log_tenant_idx", "tenant_id"),
        sa.Index("dataset_auto_disable_log_dataset_idx", "dataset_id"),
        sa.Index("dataset_auto_disable_log_created_atx", "created_at"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    # set once the notification for this event has been sent
    notified: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
class RateLimitLog(Base):
    """Log row for a rate-limited operation, recorded per tenant and
    subscription plan."""

    __tablename__ = "rate_limit_logs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="rate_limit_log_pkey"),
        sa.Index("rate_limit_log_tenant_idx", "tenant_id"),
        sa.Index("rate_limit_log_operation_idx", "operation"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    subscription_plan: Mapped[str] = mapped_column(String(255), nullable=False)
    operation: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
class DatasetMetadata(Base):
    """Definition of a metadata field (typed, named) attached to a dataset."""

    __tablename__ = "dataset_metadatas"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_pkey"),
        sa.Index("dataset_metadata_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_dataset_idx", "dataset_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    # data type of the metadata field
    type: Mapped[str] = mapped_column(String(255), nullable=False)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=sa.text("CURRENT_TIMESTAMP(0)")
    )
    created_by = mapped_column(StringUUID, nullable=False)
    updated_by = mapped_column(StringUUID, nullable=True)
class DatasetMetadataBinding(Base):
    """Association of a DatasetMetadata field with a specific document."""

    __tablename__ = "dataset_metadata_bindings"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="dataset_metadata_binding_pkey"),
        sa.Index("dataset_metadata_binding_tenant_idx", "tenant_id"),
        sa.Index("dataset_metadata_binding_dataset_idx", "dataset_id"),
        sa.Index("dataset_metadata_binding_metadata_idx", "metadata_id"),
        sa.Index("dataset_metadata_binding_document_idx", "document_id"),
    )

    id = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
    tenant_id = mapped_column(StringUUID, nullable=False)
    dataset_id = mapped_column(StringUUID, nullable=False)
    metadata_id = mapped_column(StringUUID, nullable=False)
    document_id = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = mapped_column(StringUUID, nullable=False)
class PipelineBuiltInTemplate(Base):  # type: ignore[name-defined]
    """Built-in pipeline template (name, icon, YAML definition, language,
    install counter). Uses the legacy ``db.Column`` declaration style."""

    __tablename__ = "pipeline_built_in_templates"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_built_in_template_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    # full pipeline definition as YAML
    yaml_content = db.Column(db.Text, nullable=False)
    copyright = db.Column(db.String(255), nullable=False)
    privacy_policy = db.Column(db.String(255), nullable=False)
    # display ordering
    position = db.Column(db.Integer, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)

    @property
    def created_user_name(self):
        """Name of the creating Account; empty string when it cannot be found."""
        account = db.session.query(Account).where(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""
class PipelineCustomizedTemplate(Base):  # type: ignore[name-defined]
    """Tenant-created pipeline template; same shape as the built-in templates
    plus a ``tenant_id`` owner. Uses the legacy ``db.Column`` style."""

    __tablename__ = "pipeline_customized_templates"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="pipeline_customized_template_pkey"),
        db.Index("pipeline_customized_template_tenant_idx", "tenant_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False)
    chunk_structure = db.Column(db.String(255), nullable=False)
    icon = db.Column(db.JSON, nullable=False)
    # display ordering
    position = db.Column(db.Integer, nullable=False)
    # full pipeline definition as YAML
    yaml_content = db.Column(db.Text, nullable=False)
    install_count = db.Column(db.Integer, nullable=False, default=0)
    language = db.Column(db.String(255), nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    updated_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def created_user_name(self):
        """Name of the creating Account; empty string when it cannot be found."""
        account = db.session.query(Account).where(Account.id == self.created_by).first()
        if account:
            return account.name
        return ""
class Pipeline(Base):  # type: ignore[name-defined]
    """A tenant's pipeline instance, optionally published and backed by a
    workflow; a Dataset may point at it via ``Dataset.pipeline_id``."""

    __tablename__ = "pipelines"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
    tenant_id: Mapped[str] = db.Column(StringUUID, nullable=False)
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.Text, nullable=False, server_default=db.text("''::character varying"))
    workflow_id = db.Column(StringUUID, nullable=True)
    is_public = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    is_published = db.Column(db.Boolean, nullable=False, server_default=db.text("false"))
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())

    def retrieve_dataset(self, session: Session):
        """Return the Dataset whose pipeline_id points at this pipeline, or None."""
        return session.query(Dataset).where(Dataset.pipeline_id == self.id).first()
class DocumentPipelineExecutionLog(Base):
    """Log of one pipeline execution for a document: which datasource node
    ran, with what datasource info and input payload."""

    __tablename__ = "document_pipeline_execution_logs"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="document_pipeline_execution_log_pkey"),
        db.Index("document_pipeline_execution_logs_document_id_idx", "document_id"),
    )

    id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
    pipeline_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    datasource_type = db.Column(db.String(255), nullable=False)
    datasource_info = db.Column(db.Text, nullable=False)
    datasource_node_id = db.Column(db.String(255), nullable=False)
    input_data = db.Column(db.JSON, nullable=False)
    created_by = db.Column(StringUUID, nullable=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
class PipelineRecommendedPlugin(Base):
    """A plugin recommended for pipelines, with ordering and an active flag."""

    __tablename__ = "pipeline_recommended_plugins"
    __table_args__ = (db.PrimaryKeyConstraint("id", name="pipeline_recommended_plugin_pkey"),)

    id = db.Column(StringUUID, server_default=db.text("uuidv7()"))
    plugin_id = db.Column(db.Text, nullable=False)
    provider_name = db.Column(db.Text, nullable=False)
    # display ordering among recommendations
    position = db.Column(db.Integer, nullable=False, default=0)
    active = db.Column(db.Boolean, nullable=False, default=True)
    created_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at = db.Column(db.DateTime, nullable=False, server_default=func.current_timestamp())