
datasets_document.py 45KB

import logging
from argparse import ArgumentTypeError
from collections.abc import Sequence
from typing import Literal, cast

from flask import request
from flask_login import current_user
from flask_restx import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc, select
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api, console_ns
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.wraps import (
    account_initialization_required,
    cloud_edition_billing_rate_limit_check,
    cloud_edition_billing_resource_check,
    setup_required,
)
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.plugin.impl.exc import PluginDaemonClientSideError
from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.datetime_utils import naive_utc_now
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig

logger = logging.getLogger(__name__)
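

# Shared base for the document endpoints in this module: each accessor resolves the
# dataset, enforces tenant and dataset permissions, and raises NotFound/Forbidden early.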
class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> Sequence[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")

        return documents


@console_ns.route("/datasets/process-rule")
class GetProcessRuleApi(Resource):
    @api.doc("get_process_rule")
    @api.doc(description="Get dataset document processing rules")
    @api.doc(params={"document_id": "Document ID (optional)"})
    @api.response(200, "Process rules retrieved successfully")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args
        document_id = req_data.get("document_id")

        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        limits = DocumentService.DEFAULT_RULES["limits"]
        if document_id:
            # get the latest process rule
            document = db.get_or_404(Document, document_id)

            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .where(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {"mode": mode, "rules": rules, "limits": limits}


@console_ns.route("/datasets/<uuid:dataset_id>/documents")
class DatasetDocumentListApi(Resource):
    @api.doc("get_dataset_documents")
    @api.doc(description="Get documents in a dataset")
    @api.doc(
        params={
            "dataset_id": "Dataset ID",
            "page": "Page number (default: 1)",
            "limit": "Number of items per page (default: 20)",
            "keyword": "Search keyword",
            "sort": "Sort order (default: -created_at)",
            "fetch": "Fetch full details (default: false)",
        }
    )
    @api.response(200, "Documents retrieved successfully")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
  146. # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
  147. try:
  148. fetch_val = request.args.get("fetch", default="false")
  149. if isinstance(fetch_val, bool):
  150. fetch = fetch_val
  151. else:
  152. if fetch_val.lower() in ("yes", "true", "t", "y", "1"):
  153. fetch = True
  154. elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
  155. fetch = False
  156. else:
  157. raise ArgumentTypeError(
  158. f"Truthy value expected: got {fetch_val} but expected one of yes/no, true/false, t/f, y/n, 1/0 "
  159. f"(case insensitive)."
  160. )
  161. except (ArgumentTypeError, ValueError, Exception):
  162. fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = select(Document).filter_by(dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f"%{search}%"
            query = query.where(Document.name.like(search))

        if sort.startswith("-"):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc
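
        # Sorting by hit_count aggregates per-segment hit counts in a subquery; the
        # outer join plus coalesce lets documents with no segments sort as zero.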
        if sort == "hit_count":
            sub_query = (
                db.select(DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count"))
                .group_by(DocumentSegment.document_id)
                .subquery()
            )

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                sort_logic(Document.position),
            )
        elif sort == "created_at":
            query = query.order_by(
                sort_logic(Document.created_at),
                sort_logic(Document.position),
            )
        else:
            query = query.order_by(
                desc(Document.created_at),
                desc(Document.position),
            )

        paginated_documents = db.paginate(select=query, page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
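
        # With fetch=true, annotate each document with segment progress so clients
        # can render indexing completion without extra requests.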
        if fetch:
            for document in documents:
                completed_segments = (
                    db.session.query(DocumentSegment)
                    .where(
                        DocumentSegment.completed_at.isnot(None),
                        DocumentSegment.document_id == str(document.id),
                        DocumentSegment.status != "re_segment",
                    )
                    .count()
                )
                total_segments = (
                    db.session.query(DocumentSegment)
                    .where(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                    .count()
                )
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)

        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }
        return response

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        args = parser.parse_args()
        knowledge_config = KnowledgeConfig(**args)

        if not dataset.indexing_technique and not knowledge_config.indexing_technique:
            raise ValueError("indexing_technique is required.")

        # validate args
        DocumentService.document_create_args_validate(knowledge_config)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, current_user)
            dataset = DatasetService.get_dataset(dataset_id)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {"dataset": dataset, "documents": documents, "batch": batch}

    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        try:
            document_ids = request.args.getlist("document_id")
            DocumentService.delete_documents(dataset, document_ids)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204


@console_ns.route("/datasets/init")
class DatasetInitApi(Resource):
    @api.doc("init_dataset")
    @api.doc(description="Initialize dataset with documents")
    @api.expect(
        api.model(
            "DatasetInitRequest",
            {
                "upload_file_id": fields.String(required=True, description="Upload file ID"),
                "indexing_technique": fields.String(description="Indexing technique"),
                "process_rule": fields.Raw(description="Processing rules"),
                "data_source": fields.Raw(description="Data source configuration"),
            },
        )
    )
    @api.response(201, "Dataset initialized successfully", dataset_and_document_fields)
    @api.response(400, "Invalid request parameters")
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()
        knowledge_config = KnowledgeConfig(**args)

        if knowledge_config.indexing_technique == "high_quality":
            if knowledge_config.embedding_model is None or knowledge_config.embedding_model_provider is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
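
            # High-quality indexing requires a usable embedding model, so verify that
            # the tenant actually has this provider/model configured before proceeding.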
            try:
                model_manager = ModelManager()
                model_manager.get_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    provider=args["embedding_model_provider"],
                    model_type=ModelType.TEXT_EMBEDDING,
                    model=args["embedding_model"],
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(knowledge_config)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, knowledge_config=knowledge_config, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {"dataset": dataset, "documents": documents, "batch": batch}
        return response


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate")
class DocumentIndexingEstimateApi(DocumentResource):
    @api.doc("estimate_document_indexing")
    @api.doc(description="Estimate document indexing cost")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.response(200, "Indexing estimate calculated successfully")
    @api.response(404, "Document not found")
    @api.response(400, "Document already finished")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}

        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]

                file = (
                    db.session.query(UploadFile)
                    .where(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )

                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.FILE.value, upload_file=file, document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    estimate_response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                    return estimate_response.model_dump(), 200
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except PluginDaemonClientSideError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))

        return response, 200
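

# Batch variant of the indexing estimate: builds one ExtractSetting per document in
# the batch (upload_file / notion_import / website_crawl) and runs a single estimate.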
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        if not documents:
            return {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}, 200

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()

            data_source_info = document.data_source_info_dict
            if document.data_source_type == "upload_file":
                if not data_source_info:
                    continue
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.FILE.value, upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                if not data_source_info:
                    continue
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.NOTION.value,
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                if not data_source_info:
                    continue
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.WEBSITE.value,
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type not supported.")

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
            return response.model_dump(), 200
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except PluginDaemonClientSideError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))
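

# Reports per-document indexing progress for a batch: segment counts are computed
# directly from DocumentSegment rows, excluding segments queued for re-segmentation.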
class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = (
                db.session.query(DocumentSegment)
                .where(
                    DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document.id),
                    DocumentSegment.status != "re_segment",
                )
                .count()
            )
            total_segments = (
                db.session.query(DocumentSegment)
                .where(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                .count()
            )
            # Create a dictionary with document attributes and additional fields
            document_dict = {
                "id": document.id,
                "indexing_status": "paused" if document.is_paused else document.indexing_status,
                "processing_started_at": document.processing_started_at,
                "parsing_completed_at": document.parsing_completed_at,
                "cleaning_completed_at": document.cleaning_completed_at,
                "splitting_completed_at": document.splitting_completed_at,
                "completed_at": document.completed_at,
                "paused_at": document.paused_at,
                "error": document.error,
                "stopped_at": document.stopped_at,
                "completed_segments": completed_segments,
                "total_segments": total_segments,
            }
            documents_status.append(marshal(document_dict, document_status_fields))
        data = {"data": documents_status}
        return data


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
class DocumentIndexingStatusApi(DocumentResource):
    @api.doc("get_document_indexing_status")
    @api.doc(description="Get document indexing status")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.response(200, "Indexing status retrieved successfully")
    @api.response(404, "Document not found")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = (
            db.session.query(DocumentSegment)
            .where(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document_id),
                DocumentSegment.status != "re_segment",
            )
            .count()
        )
        total_segments = (
            db.session.query(DocumentSegment)
            .where(DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment")
            .count()
        )

        # Create a dictionary with document attributes and additional fields
        document_dict = {
            "id": document.id,
            "indexing_status": "paused" if document.is_paused else document.indexing_status,
            "processing_started_at": document.processing_started_at,
            "parsing_completed_at": document.parsing_completed_at,
            "cleaning_completed_at": document.cleaning_completed_at,
            "splitting_completed_at": document.splitting_completed_at,
            "completed_at": document.completed_at,
            "paused_at": document.paused_at,
            "error": document.error,
            "stopped_at": document.stopped_at,
            "completed_segments": completed_segments,
            "total_segments": total_segments,
        }
        return marshal(document_dict, document_status_fields)


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
class DocumentApi(DocumentResource):
    METADATA_CHOICES = {"all", "only", "without"}

    @api.doc("get_document")
    @api.doc(description="Get document details")
    @api.doc(
        params={
            "dataset_id": "Dataset ID",
            "document_id": "Document ID",
            "metadata": "Metadata inclusion (all/only/without)",
        }
    )
    @api.response(200, "Document retrieved successfully")
    @api.response(404, "Document not found")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}
        elif metadata == "without":
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")

        return {"result": "success"}, 204


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>")
class DocumentProcessingApi(DocumentResource):
    @api.doc("update_document_processing")
    @api.doc(description="Update document processing status (pause/resume)")
    @api.doc(
        params={"dataset_id": "Dataset ID", "document_id": "Document ID", "action": "Action to perform (pause/resume)"}
    )
    @api.response(200, "Processing status updated successfully")
    @api.response(404, "Document not found")
    @api.response(400, "Invalid action")
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id, action: Literal["pause", "resume"]):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")

            document.paused_by = current_user.id
            document.paused_at = naive_utc_now()
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()

        return {"result": "success"}, 200


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
class DocumentMetadataApi(DocumentResource):
    @api.doc("update_document_metadata")
    @api.doc(description="Update document metadata")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.expect(
        api.model(
            "UpdateDocumentMetadataRequest",
            {
                "doc_type": fields.String(description="Document type"),
                "doc_metadata": fields.Raw(description="Document metadata"),
            },
        )
    )
    @api.response(200, "Document metadata updated successfully")
    @api.response(404, "Document not found")
    @api.response(403, "Permission denied")
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()
        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")

        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")

        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")

        metadata_schema: dict = cast(dict, DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type])

        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = naive_utc_now()
        db.session.commit()

        return {"result": "success", "message": "Document metadata updated."}, 200
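

# Batch status transitions (enable / disable / archive / un_archive) for documents;
# document IDs come from the query string and are validated by the service layer.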
class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, action: Literal["enable", "disable", "archive", "un_archive"]):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)

        document_ids = request.args.getlist("document_id")
        try:
            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
        except services.errors.document.DocumentIndexingError as e:
            raise InvalidActionError(str(e))
        except ValueError as e:
            raise InvalidActionError(str(e))
        except NotFound as e:
            raise NotFound(str(e))

        return {"result": "success"}, 200
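

# Pauses an in-progress indexing run for a single document; archived documents are
# immutable, so pausing them is rejected with a 403.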
class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """Pause a document that is currently being indexed."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")

        return {"result": "success"}, 204
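

# Resumes indexing for a previously paused document.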
class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """Recover a paused document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")

        return {"result": "success"}, 204
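

# Retries indexing for a list of documents; individual failures are logged and
# skipped so one bad document does not abort the whole batch.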
class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        """Retry indexing for the given documents."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")

        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)
                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()

                retry_documents.append(document)
            except Exception:
                logger.exception("Failed to retry document, document id: %s", document_id)
                continue

        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)

        return {"result": "success"}, 204
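

# Renames a document; restricted to dataset editors, with an additional
# dataset-operator permission check on the target dataset.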
class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)

        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")

        return document
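

# Syncs a website-crawled document with its source site; only documents whose data
# source is website_crawl are eligible, and archived documents are rejected.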
class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")

        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")

        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {"result": "success"}, 200