datasets_document.py

import logging
from argparse import ArgumentTypeError
from collections.abc import Sequence
from typing import Literal, cast

from flask import request
from flask_login import current_user
from flask_restx import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc, select
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api, console_ns
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    IndexingEstimateError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.wraps import (
    account_initialization_required,
    cloud_edition_billing_rate_limit_check,
    cloud_edition_billing_resource_check,
    setup_required,
)
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.plugin.impl.exc import PluginDaemonClientSideError
from core.rag.extractor.entity.datasource_type import DatasourceType
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.datetime_utils import naive_utc_now
from libs.login import login_required
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig

logger = logging.getLogger(__name__)


class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
        document = DocumentService.get_document(dataset_id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> Sequence[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
        documents = DocumentService.get_batch_documents(dataset_id, batch)
        if not documents:
            raise NotFound("Documents not found.")
        return documents


@console_ns.route("/datasets/process-rule")
class GetProcessRuleApi(Resource):
    @api.doc("get_process_rule")
    @api.doc(description="Get dataset document processing rules")
    @api.doc(params={"document_id": "Document ID (optional)"})
    @api.response(200, "Process rules retrieved successfully")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args
        document_id = req_data.get("document_id")
        # get default rules
        mode = DocumentService.DEFAULT_RULES["mode"]
        rules = DocumentService.DEFAULT_RULES["rules"]
        limits = DocumentService.DEFAULT_RULES["limits"]
        if document_id:
            # get the latest process rule
            document = db.get_or_404(Document, document_id)
            dataset = DatasetService.get_dataset(document.dataset_id)
            if not dataset:
                raise NotFound("Dataset not found.")
            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))
            # get the latest process rule
            dataset_process_rule = (
                db.session.query(DatasetProcessRule)
                .where(DatasetProcessRule.dataset_id == document.dataset_id)
                .order_by(DatasetProcessRule.created_at.desc())
                .limit(1)
                .one_or_none()
            )
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict
        return {"mode": mode, "rules": rules, "limits": limits}
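

# --- Illustrative client sketch (not part of the original module) ---
# A minimal example of how a console client might call the endpoint above.
# Assumes the `requests` package; `console_api_base` (e.g. "https://host/console/api")
# and the bearer-token auth scheme are assumptions, not guarantees of this codebase.
def _example_fetch_process_rule(console_api_base: str, token: str, document_id: str | None = None) -> dict:
    import requests  # assumed third-party dependency, not used by this module

    params = {"document_id": document_id} if document_id else {}
    resp = requests.get(
        f"{console_api_base}/datasets/process-rule",
        params=params,
        headers={"Authorization": f"Bearer {token}"},
    )
    resp.raise_for_status()
    # Shape mirrors GetProcessRuleApi.get: {"mode": ..., "rules": ..., "limits": ...}
    return resp.json()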


@console_ns.route("/datasets/<uuid:dataset_id>/documents")
class DatasetDocumentListApi(Resource):
    @api.doc("get_dataset_documents")
    @api.doc(description="Get documents in a dataset")
    @api.doc(
        params={
            "dataset_id": "Dataset ID",
            "page": "Page number (default: 1)",
            "limit": "Number of items per page (default: 20)",
            "keyword": "Search keyword",
            "sort": "Sort order (default: -created_at)",
            "fetch": "Fetch full details (default: false)",
        }
    )
    @api.response(200, "Documents retrieved successfully")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)
        sort = request.args.get("sort", default="-created_at", type=str)
        # "yes", "true", "t", "y", "1" convert to True, while others convert to False.
        try:
            fetch_val = request.args.get("fetch", default="false")
            if isinstance(fetch_val, bool):
                fetch = fetch_val
            else:
                if fetch_val.lower() in ("yes", "true", "t", "y", "1"):
                    fetch = True
                elif fetch_val.lower() in ("no", "false", "f", "n", "0"):
                    fetch = False
                else:
                    raise ArgumentTypeError(
                        f"Truthy value expected: got {fetch_val} but expected one of yes/no, true/false, t/f, y/n, 1/0 "
                        f"(case insensitive)."
                    )
        except Exception:
            # any parsing error falls back to the default
            fetch = False
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
        query = select(Document).filter_by(dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)
        if search:
            search = f"%{search}%"
            query = query.where(Document.name.like(search))
        if sort.startswith("-"):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc
        if sort == "hit_count":
            sub_query = (
                db.select(DocumentSegment.document_id, db.func.sum(DocumentSegment.hit_count).label("total_hit_count"))
                .group_by(DocumentSegment.document_id)
                .subquery()
            )
            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id).order_by(
                sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)),
                sort_logic(Document.position),
            )
        elif sort == "created_at":
            query = query.order_by(
                sort_logic(Document.created_at),
                sort_logic(Document.position),
            )
        else:
            query = query.order_by(
                desc(Document.created_at),
                desc(Document.position),
            )
        paginated_documents = db.paginate(select=query, page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = (
                    db.session.query(DocumentSegment)
                    .where(
                        DocumentSegment.completed_at.isnot(None),
                        DocumentSegment.document_id == str(document.id),
                        DocumentSegment.status != "re_segment",
                    )
                    .count()
                )
                total_segments = (
                    db.session.query(DocumentSegment)
                    .where(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                    .count()
                )
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)
        response = {
            "data": data,
            "has_more": len(documents) == limit,
            "limit": limit,
            "total": paginated_documents.total,
            "page": page,
        }
        return response

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))
        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
        )
        parser.add_argument("data_source", type=dict, required=False, location="json")
        parser.add_argument("process_rule", type=dict, required=False, location="json")
        parser.add_argument("duplicate", type=bool, default=True, nullable=False, location="json")
        parser.add_argument("original_document_id", type=str, required=False, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        args = parser.parse_args()
        knowledge_config = KnowledgeConfig(**args)
        if not dataset.indexing_technique and not knowledge_config.indexing_technique:
            raise ValueError("indexing_technique is required.")
        # validate args
        DocumentService.document_create_args_validate(knowledge_config)
        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, current_user)
            dataset = DatasetService.get_dataset(dataset_id)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        return {"dataset": dataset, "documents": documents, "batch": batch}

    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)
        try:
            document_ids = request.args.getlist("document_id")
            DocumentService.delete_documents(dataset, document_ids)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")
        return {"result": "success"}, 204
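

# --- Illustrative helper sketch (not part of the original module) ---
# The "fetch" query parameter above accepts several truthy/falsy spellings; this
# standalone function mirrors that parsing, except it returns a default instead
# of raising on unrecognized input. Purely for reference.
def _example_parse_bool_flag(value: str, default: bool = False) -> bool:
    lowered = value.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    return default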


@console_ns.route("/datasets/init")
class DatasetInitApi(Resource):
    @api.doc("init_dataset")
    @api.doc(description="Initialize dataset with documents")
    @api.expect(
        api.model(
            "DatasetInitRequest",
            {
                "upload_file_id": fields.String(required=True, description="Upload file ID"),
                "indexing_technique": fields.String(description="Indexing technique"),
                "process_rule": fields.Raw(description="Processing rules"),
                "data_source": fields.Raw(description="Data source configuration"),
            },
        )
    )
    @api.response(201, "Dataset initialized successfully", dataset_and_document_fields)
    @api.response(400, "Invalid request parameters")
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self):
        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()
        parser = reqparse.RequestParser()
        parser.add_argument(
            "indexing_technique",
            type=str,
            choices=Dataset.INDEXING_TECHNIQUE_LIST,
            required=True,
            nullable=False,
            location="json",
        )
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_rule", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
        parser.add_argument(
            "doc_language", type=str, default="English", required=False, nullable=False, location="json"
        )
        parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
        parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
        parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
        args = parser.parse_args()
        knowledge_config = KnowledgeConfig(**args)
        if knowledge_config.indexing_technique == "high_quality":
            if knowledge_config.embedding_model is None or knowledge_config.embedding_model_provider is None:
                raise ValueError("embedding model and embedding model provider are required for high quality indexing.")
            try:
                model_manager = ModelManager()
                model_manager.get_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    provider=args["embedding_model_provider"],
                    model_type=ModelType.TEXT_EMBEDDING,
                    model=args["embedding_model"],
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
                )
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)
        # validate args
        DocumentService.document_create_args_validate(knowledge_config)
        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id, knowledge_config=knowledge_config, account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        response = {"dataset": dataset, "documents": documents, "batch": batch}
        return response
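

# --- Illustrative request payload (not part of the original module) ---
# A sketch of a JSON body a client might POST to /datasets/init, assembled from
# the parser arguments above. The nested data_source/process_rule shapes and the
# placeholder values are assumptions; KnowledgeConfig defines the authoritative schema.
EXAMPLE_DATASET_INIT_PAYLOAD = {
    "indexing_technique": "high_quality",  # must be one of Dataset.INDEXING_TECHNIQUE_LIST
    "data_source": {"type": "upload_file", "info_list": {"file_ids": ["<upload-file-id>"]}},  # hypothetical shape
    "process_rule": {"mode": "automatic", "rules": {}},
    "doc_form": "text_model",
    "doc_language": "English",
    "embedding_model": "<embedding-model-name>",  # required when indexing_technique is "high_quality"
    "embedding_model_provider": "<provider-name>",
}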


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate")
class DocumentIndexingEstimateApi(DocumentResource):
    @api.doc("estimate_document_indexing")
    @api.doc(description="Estimate document indexing cost")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.response(200, "Indexing estimate calculated successfully")
    @api.response(404, "Document not found")
    @api.response(400, "Document already finished")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)
        if document.indexing_status in {"completed", "error"}:
            raise DocumentAlreadyFinishedError()
        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        response = {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}
        if document.data_source_type == "upload_file":
            data_source_info = document.data_source_info_dict
            if data_source_info and "upload_file_id" in data_source_info:
                file_id = data_source_info["upload_file_id"]
                file = (
                    db.session.query(UploadFile)
                    .where(UploadFile.tenant_id == document.tenant_id, UploadFile.id == file_id)
                    .first()
                )
                # raise error if file not found
                if not file:
                    raise NotFound("File not found.")
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.FILE.value, upload_file=file, document_model=document.doc_form
                )
                indexing_runner = IndexingRunner()
                try:
                    estimate_response = indexing_runner.indexing_estimate(
                        current_user.current_tenant_id,
                        [extract_setting],
                        data_process_rule_dict,
                        document.doc_form,
                        "English",
                        dataset_id,
                    )
                    return estimate_response.model_dump(), 200
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except PluginDaemonClientSideError as ex:
                    raise ProviderNotInitializeError(ex.description)
                except Exception as e:
                    raise IndexingEstimateError(str(e))
        return response, 200


@console_ns.route("/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate")
class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        if not documents:
            return {"tokens": 0, "total_price": 0, "currency": "USD", "total_segments": 0, "preview": []}, 200
        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        extract_settings = []
        for document in documents:
            if document.indexing_status in {"completed", "error"}:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            if document.data_source_type == "upload_file":
                if not data_source_info:
                    continue
                file_id = data_source_info["upload_file_id"]
                file_detail = (
                    db.session.query(UploadFile)
                    .where(UploadFile.tenant_id == current_user.current_tenant_id, UploadFile.id == file_id)
                    .first()
                )
                if file_detail is None:
                    raise NotFound("File not found.")
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.FILE.value, upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                if not data_source_info:
                    continue
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.NOTION.value,
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                        "notion_page_type": data_source_info["type"],
                        "tenant_id": current_user.current_tenant_id,
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                if not data_source_info:
                    continue
                extract_setting = ExtractSetting(
                    datasource_type=DatasourceType.WEBSITE.value,
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
                        "url": data_source_info["url"],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info["mode"],
                        "only_main_content": data_source_info["only_main_content"],
                    },
                    document_model=document.doc_form,
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError("Data source type not supported.")
        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(
                current_user.current_tenant_id,
                extract_settings,
                data_process_rule_dict,
                document.doc_form,
                "English",
                dataset_id,
            )
            return response.model_dump(), 200
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except PluginDaemonClientSideError as ex:
            raise ProviderNotInitializeError(ex.description)
        except Exception as e:
            raise IndexingEstimateError(str(e))


@console_ns.route("/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status")
class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = (
                db.session.query(DocumentSegment)
                .where(
                    DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document.id),
                    DocumentSegment.status != "re_segment",
                )
                .count()
            )
            total_segments = (
                db.session.query(DocumentSegment)
                .where(DocumentSegment.document_id == str(document.id), DocumentSegment.status != "re_segment")
                .count()
            )
            # Create a dictionary with document attributes and additional fields
            document_dict = {
                "id": document.id,
                "indexing_status": "paused" if document.is_paused else document.indexing_status,
                "processing_started_at": document.processing_started_at,
                "parsing_completed_at": document.parsing_completed_at,
                "cleaning_completed_at": document.cleaning_completed_at,
                "splitting_completed_at": document.splitting_completed_at,
                "completed_at": document.completed_at,
                "paused_at": document.paused_at,
                "error": document.error,
                "stopped_at": document.stopped_at,
                "completed_segments": completed_segments,
                "total_segments": total_segments,
            }
            documents_status.append(marshal(document_dict, document_status_fields))
        data = {"data": documents_status}
        return data
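

# --- Illustrative progress calculation (not part of the original module) ---
# Each entry returned by the batch indexing-status endpoint above carries
# completed_segments/total_segments; a client-side sketch of turning one entry
# into a progress ratio (the helper name is hypothetical):
def _example_indexing_progress(status_entry: dict) -> float:
    total = status_entry.get("total_segments") or 0
    if total == 0:
        return 0.0
    return status_entry["completed_segments"] / total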


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status")
class DocumentIndexingStatusApi(DocumentResource):
    @api.doc("get_document_indexing_status")
    @api.doc(description="Get document indexing status")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.response(200, "Indexing status retrieved successfully")
    @api.response(404, "Document not found")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)
        completed_segments = (
            db.session.query(DocumentSegment)
            .where(
                DocumentSegment.completed_at.isnot(None),
                DocumentSegment.document_id == str(document_id),
                DocumentSegment.status != "re_segment",
            )
            .count()
        )
        total_segments = (
            db.session.query(DocumentSegment)
            .where(DocumentSegment.document_id == str(document_id), DocumentSegment.status != "re_segment")
            .count()
        )
        # Create a dictionary with document attributes and additional fields
        document_dict = {
            "id": document.id,
            "indexing_status": "paused" if document.is_paused else document.indexing_status,
            "processing_started_at": document.processing_started_at,
            "parsing_completed_at": document.parsing_completed_at,
            "cleaning_completed_at": document.cleaning_completed_at,
            "splitting_completed_at": document.splitting_completed_at,
            "completed_at": document.completed_at,
            "paused_at": document.paused_at,
            "error": document.error,
            "stopped_at": document.stopped_at,
            "completed_segments": completed_segments,
            "total_segments": total_segments,
        }
        return marshal(document_dict, document_status_fields)


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
class DocumentApi(DocumentResource):
    METADATA_CHOICES = {"all", "only", "without"}

    @api.doc("get_document")
    @api.doc(description="Get document details")
    @api.doc(
        params={
            "dataset_id": "Dataset ID",
            "document_id": "Document ID",
            "metadata": "Metadata inclusion (all/only/without)",
        }
    )
    @api.response(200, "Document retrieved successfully")
    @api.response(404, "Document not found")
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)
        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")
        if metadata == "only":
            response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}
        elif metadata == "without":
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        else:
            dataset_process_rules = DatasetService.get_process_rules(dataset_id)
            document_process_rules = document.dataset_process_rule.to_dict()
            data_source_info = document.data_source_detail_dict
            response = {
                "id": document.id,
                "position": document.position,
                "data_source_type": document.data_source_type,
                "data_source_info": data_source_info,
                "dataset_process_rule_id": document.dataset_process_rule_id,
                "dataset_process_rule": dataset_process_rules,
                "document_process_rule": document_process_rules,
                "name": document.name,
                "created_from": document.created_from,
                "created_by": document.created_by,
                "created_at": document.created_at.timestamp(),
                "tokens": document.tokens,
                "indexing_status": document.indexing_status,
                "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
                "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
                "indexing_latency": document.indexing_latency,
                "error": document.error,
                "enabled": document.enabled,
                "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
                "disabled_by": document.disabled_by,
                "archived": document.archived,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)
        document = self.get_document(dataset_id, document_id)
        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot delete document during indexing.")
        return {"result": "success"}, 204
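

# --- Illustrative client sketch (not part of the original module) ---
# DocumentApi.get varies its response by the "metadata" query parameter
# (all/only/without). A hedged sketch of calling it; `requests`, the base URL
# layout, and the auth scheme are assumptions:
def _example_get_document(console_api_base: str, token: str, dataset_id: str, document_id: str, metadata: str = "all") -> dict:
    import requests  # assumed third-party dependency

    resp = requests.get(
        f"{console_api_base}/datasets/{dataset_id}/documents/{document_id}",
        params={"metadata": metadata},  # must be one of DocumentApi.METADATA_CHOICES
        headers={"Authorization": f"Bearer {token}"},
    )
    resp.raise_for_status()
    return resp.json()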


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>")
class DocumentProcessingApi(DocumentResource):
    @api.doc("update_document_processing")
    @api.doc(description="Update document processing status (pause/resume)")
    @api.doc(
        params={"dataset_id": "Dataset ID", "document_id": "Document ID", "action": "Action to perform (pause/resume)"}
    )
    @api.response(200, "Processing status updated successfully")
    @api.response(404, "Document not found")
    @api.response(400, "Invalid action")
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id, action: Literal["pause", "resume"]):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)
        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()
        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError("Document not in indexing state.")
            document.paused_by = current_user.id
            document.paused_at = naive_utc_now()
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in {"paused", "error"}:
                raise InvalidActionError("Document not in paused or error state.")
            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        return {"result": "success"}, 200
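

# --- Illustrative client sketch (not part of the original module) ---
# Pause/resume is a PATCH with the action in the path; per the handler above,
# pause is only valid while indexing and resume only from paused/error states.
# `requests`, the base URL layout, and the auth scheme are assumptions:
def _example_set_processing(console_api_base: str, token: str, dataset_id: str, document_id: str, action: str) -> None:
    import requests  # assumed third-party dependency

    if action not in ("pause", "resume"):
        raise ValueError("action must be 'pause' or 'resume'")
    resp = requests.patch(
        f"{console_api_base}/datasets/{dataset_id}/documents/{document_id}/processing/{action}",
        headers={"Authorization": f"Bearer {token}"},
    )
    resp.raise_for_status()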


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata")
class DocumentMetadataApi(DocumentResource):
    @api.doc("update_document_metadata")
    @api.doc(description="Update document metadata")
    @api.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
    @api.expect(
        api.model(
            "UpdateDocumentMetadataRequest",
            {
                "doc_type": fields.String(description="Document type"),
                "doc_metadata": fields.Raw(description="Document metadata"),
            },
        )
    )
    @api.response(200, "Document metadata updated successfully")
    @api.response(404, "Document not found")
    @api.response(403, "Permission denied")
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)
        req_data = request.get_json()
        doc_type = req_data.get("doc_type")
        doc_metadata = req_data.get("doc_metadata")
        # The role of the current user in the ta table must be admin, owner, dataset_operator, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()
        if doc_type is None or doc_metadata is None:
            raise ValueError("Both doc_type and doc_metadata must be provided.")
        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError("Invalid doc_type.")
        if not isinstance(doc_metadata, dict):
            raise ValueError("doc_metadata must be a dictionary.")
        metadata_schema: dict = cast(dict, DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type])
        document.doc_metadata = {}
        if doc_type == "others":
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value
        document.doc_type = doc_type
        document.updated_at = naive_utc_now()
        db.session.commit()
        return {"result": "success", "message": "Document metadata updated."}, 200


@console_ns.route("/datasets/<uuid:dataset_id>/documents/status/<string:action>/batch")
class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check("vector_space")
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, action: Literal["enable", "disable", "archive", "un_archive"]):
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_dataset_editor:
            raise Forbidden()
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)
        # check user's permission
        DatasetService.check_dataset_permission(dataset, current_user)
        document_ids = request.args.getlist("document_id")
        try:
            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
        except services.errors.document.DocumentIndexingError as e:
            raise InvalidActionError(str(e))
        except ValueError as e:
            raise InvalidActionError(str(e))
        except NotFound as e:
            raise NotFound(str(e))
        return {"result": "success"}, 200
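

# --- Illustrative client sketch (not part of the original module) ---
# DocumentStatusApi reads ids via request.args.getlist("document_id"), i.e. the
# parameter may repeat in the query string. With `requests` (assumed), passing a
# list as a param value produces exactly that encoding:
def _example_batch_update_status(console_api_base: str, token: str, dataset_id: str, action: str, ids: list[str]) -> None:
    import requests  # assumed third-party dependency

    resp = requests.patch(
        f"{console_api_base}/datasets/{dataset_id}/documents/status/{action}/batch",
        params={"document_id": ids},  # encodes as ?document_id=a&document_id=b
        headers={"Authorization": f"Bearer {token}"},
    )
    resp.raise_for_status()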


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause")
class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document = DocumentService.get_document(dataset.id, document_id)
        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot pause completed document.")
        return {"result": "success"}, 204


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume")
class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document = DocumentService.get_document(dataset.id, document_id)
        # 404 if document not found
        if document is None:
            raise NotFound("Document does not exist.")
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        try:
            # resume document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Document is not in paused status.")
        return {"result": "success"}, 204


@console_ns.route("/datasets/<uuid:dataset_id>/retry")
class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_rate_limit_check("knowledge")
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument("document_ids", type=list, required=True, nullable=False, location="json")
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound("Dataset not found.")
        for document_id in args["document_ids"]:
            try:
                document_id = str(document_id)
                document = DocumentService.get_document(dataset.id, document_id)
                # 404 if document not found
                if document is None:
                    raise NotFound("Document does not exist.")
                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()
                # 400 if document is completed
                if document.indexing_status == "completed":
                    raise DocumentAlreadyFinishedError()
                retry_documents.append(document)
            except Exception:
                logger.exception("Failed to retry document, document id: %s", document_id)
                continue
        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)
        return {"result": "success"}, 204


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename")
class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()
        dataset = DatasetService.get_dataset(dataset_id)
        DatasetService.check_dataset_operator_permission(current_user, dataset)
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=False, location="json")
        args = parser.parse_args()
        try:
            document = DocumentService.rename_document(dataset_id, document_id, args["name"])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError("Cannot rename document during indexing.")
        return document


@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync")
class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound("Dataset not found.")
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden("No permission.")
        if document.data_source_type != "website_crawl":
            raise ValueError("Document is not a website document.")
        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()
        # sync document
        DocumentService.sync_website_document(dataset_id, document)
        return {"result": "success"}, 200