### What problem does this PR solve?

Refactor Chunk API #2846

### Type of change

- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
| @@ -119,13 +119,11 @@ def update_doc(tenant_id, dataset_id, document_id): | |||
| if informs: | |||
| e, file = FileService.get_by_id(informs[0].file_id) | |||
| FileService.update_by_id(file.id, {"name": req["name"]}) | |||
| if "parser_config" in req: | |||
| DocumentService.update_parser_config(doc.id, req["parser_config"]) | |||
| if "parser_method" in req: | |||
| if doc.parser_id.lower() == req["parser_method"].lower(): | |||
| if "parser_config" in req: | |||
| if req["parser_config"] == doc.parser_config: | |||
| return get_result(retcode=RetCode.SUCCESS) | |||
| else: | |||
| return get_result(retcode=RetCode.SUCCESS) | |||
| return get_result() | |||
| if doc.type == FileType.VISUAL or re.search( | |||
| r"\.(ppt|pptx|pages)$", doc.name): | |||
| @@ -146,8 +144,6 @@ def update_doc(tenant_id, dataset_id, document_id): | |||
| return get_error_data_result(retmsg="Tenant not found!") | |||
| ELASTICSEARCH.deleteByQuery( | |||
| Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) | |||
| if "parser_config" in req: | |||
| DocumentService.update_parser_config(doc.id, req["parser_config"]) | |||
| return get_result() | |||
| @@ -258,6 +254,8 @@ def parse(tenant_id,dataset_id): | |||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | |||
| return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") | |||
| req = request.json | |||
| if not req.get("document_ids"): | |||
| return get_error_data_result("`document_ids` is required") | |||
| for id in req["document_ids"]: | |||
| if not DocumentService.query(id=id,kb_id=dataset_id): | |||
| return get_error_data_result(retmsg=f"You don't own the document {id}.") | |||
| @@ -283,9 +281,14 @@ def stop_parsing(tenant_id,dataset_id): | |||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | |||
| return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") | |||
| req = request.json | |||
| if not req.get("document_ids"): | |||
| return get_error_data_result("`document_ids` is required") | |||
| for id in req["document_ids"]: | |||
| if not DocumentService.query(id=id,kb_id=dataset_id): | |||
| doc = DocumentService.query(id=id, kb_id=dataset_id) | |||
| if not doc: | |||
| return get_error_data_result(retmsg=f"You don't own the document {id}.") | |||
| if doc[0].progress == 100.0 or doc[0].progress == 0.0: | |||
| return get_error_data_result("Can't stop parsing document with progress at 0 or 100") | |||
| info = {"run": "2", "progress": 0} | |||
| DocumentService.update_by_id(id, info) | |||
| # if str(req["run"]) == TaskStatus.CANCEL.value: | |||
| @@ -297,7 +300,7 @@ def stop_parsing(tenant_id,dataset_id): | |||
| @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET']) | |||
| @token_required | |||
| def list_chunk(tenant_id,dataset_id,document_id): | |||
| def list_chunks(tenant_id,dataset_id,document_id): | |||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | |||
| return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") | |||
| doc=DocumentService.query(id=document_id, kb_id=dataset_id) | |||
| @@ -309,57 +312,58 @@ def list_chunk(tenant_id,dataset_id,document_id): | |||
| page = int(req.get("offset", 1)) | |||
| size = int(req.get("limit", 30)) | |||
| question = req.get("keywords", "") | |||
| try: | |||
| query = { | |||
| "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True | |||
| query = { | |||
| "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True | |||
| } | |||
| sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) | |||
| res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} | |||
| origin_chunks = [] | |||
| sign = 0 | |||
| for id in sres.ids: | |||
| d = { | |||
| "chunk_id": id, | |||
| "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ | |||
| id].get( | |||
| "content_with_weight", ""), | |||
| "doc_id": sres.field[id]["doc_id"], | |||
| "docnm_kwd": sres.field[id]["docnm_kwd"], | |||
| "important_kwd": sres.field[id].get("important_kwd", []), | |||
| "img_id": sres.field[id].get("img_id", ""), | |||
| "available_int": sres.field[id].get("available_int", 1), | |||
| "positions": sres.field[id].get("position_int", "").split("\t") | |||
| } | |||
| if "available_int" in req: | |||
| query["available_int"] = int(req["available_int"]) | |||
| sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) | |||
| res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()} | |||
| origin_chunks = [] | |||
| for id in sres.ids: | |||
| d = { | |||
| "chunk_id": id, | |||
| "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[ | |||
| id].get( | |||
| "content_with_weight", ""), | |||
| "doc_id": sres.field[id]["doc_id"], | |||
| "docnm_kwd": sres.field[id]["docnm_kwd"], | |||
| "important_kwd": sres.field[id].get("important_kwd", []), | |||
| "img_id": sres.field[id].get("img_id", ""), | |||
| "available_int": sres.field[id].get("available_int", 1), | |||
| "positions": sres.field[id].get("position_int", "").split("\t") | |||
| } | |||
| if len(d["positions"]) % 5 == 0: | |||
| poss = [] | |||
| for i in range(0, len(d["positions"]), 5): | |||
| poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), | |||
| float(d["positions"][i + 3]), float(d["positions"][i + 4])]) | |||
| d["positions"] = poss | |||
| origin_chunks.append(d) | |||
| ##rename keys | |||
| for chunk in origin_chunks: | |||
| key_mapping = { | |||
| "chunk_id": "id", | |||
| "content_with_weight": "content", | |||
| "doc_id": "document_id", | |||
| "important_kwd": "important_keywords", | |||
| "img_id": "image_id", | |||
| } | |||
| renamed_chunk = {} | |||
| for key, value in chunk.items(): | |||
| new_key = key_mapping.get(key, key) | |||
| renamed_chunk[new_key] = value | |||
| res["chunks"].append(renamed_chunk) | |||
| return get_result(data=res) | |||
| except Exception as e: | |||
| if str(e).find("not_found") > 0: | |||
| return get_result(retmsg=f'No chunk found!', | |||
| retcode=RetCode.DATA_ERROR) | |||
| return server_error_response(e) | |||
| if len(d["positions"]) % 5 == 0: | |||
| poss = [] | |||
| for i in range(0, len(d["positions"]), 5): | |||
| poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]), | |||
| float(d["positions"][i + 3]), float(d["positions"][i + 4])]) | |||
| d["positions"] = poss | |||
| origin_chunks.append(d) | |||
| if req.get("id"): | |||
| if req.get("id") == id: | |||
| origin_chunks.clear() | |||
| origin_chunks.append(d) | |||
| sign = 1 | |||
| break | |||
| if req.get("id"): | |||
| if sign == 0: | |||
| return get_error_data_result(f"Can't find this chunk {req.get('id')}") | |||
| for chunk in origin_chunks: | |||
| key_mapping = { | |||
| "chunk_id": "id", | |||
| "content_with_weight": "content", | |||
| "doc_id": "document_id", | |||
| "important_kwd": "important_keywords", | |||
| "img_id": "image_id", | |||
| } | |||
| renamed_chunk = {} | |||
| for key, value in chunk.items(): | |||
| new_key = key_mapping.get(key, key) | |||
| renamed_chunk[new_key] = value | |||
| res["chunks"].append(renamed_chunk) | |||
| return get_result(data=res) | |||
| @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST']) | |||
| @@ -374,6 +378,9 @@ def create(tenant_id,dataset_id,document_id): | |||
| req = request.json | |||
| if not req.get("content"): | |||
| return get_error_data_result(retmsg="`content` is required") | |||
| if "important_keywords" in req: | |||
| if type(req["important_keywords"]) != list: | |||
| return get_error_data_result("`important_keywords` is required to be a list") | |||
| md5 = hashlib.md5() | |||
| md5.update((req["content"] + document_id).encode("utf-8")) | |||
| @@ -381,8 +388,8 @@ def create(tenant_id,dataset_id,document_id): | |||
| d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), | |||
| "content_with_weight": req["content"]} | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req.get("important_kwd", []) | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) | |||
| d["important_kwd"] = req.get("important_keywords", []) | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", []))) | |||
| d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | |||
| d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | |||
| d["kb_id"] = [doc.kb_id] | |||
| @@ -432,12 +439,12 @@ def rm_chunk(tenant_id,dataset_id,document_id): | |||
| req = request.json | |||
| if not req.get("chunk_ids"): | |||
| return get_error_data_result("`chunk_ids` is required") | |||
| query = { | |||
| "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True} | |||
| sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) | |||
| for chunk_id in req.get("chunk_ids"): | |||
| res = ELASTICSEARCH.get( | |||
| chunk_id, search.index_name( | |||
| tenant_id)) | |||
| if not res.get("found"): | |||
| return server_error_response(f"Chunk {chunk_id} not found") | |||
| if chunk_id not in sres.ids: | |||
| return get_error_data_result(f"Chunk {chunk_id} not found") | |||
| if not ELASTICSEARCH.deleteByQuery( | |||
| Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): | |||
| return get_error_data_result(retmsg="Index updating failure") | |||
| @@ -451,24 +458,36 @@ def rm_chunk(tenant_id,dataset_id,document_id): | |||
| @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT']) | |||
| @token_required | |||
| def set(tenant_id,dataset_id,document_id,chunk_id): | |||
| res = ELASTICSEARCH.get( | |||
| try: | |||
| res = ELASTICSEARCH.get( | |||
| chunk_id, search.index_name( | |||
| tenant_id)) | |||
| if not res.get("found"): | |||
| return get_error_data_result(f"Chunk {chunk_id} not found") | |||
| except Exception as e: | |||
| return get_error_data_result(f"Can't find this chunk {chunk_id}") | |||
| if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): | |||
| return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") | |||
| doc = DocumentService.query(id=document_id, kb_id=dataset_id) | |||
| if not doc: | |||
| return get_error_data_result(retmsg=f"You don't own the document {document_id}.") | |||
| doc = doc[0] | |||
| query = { | |||
| "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True | |||
| } | |||
| sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True) | |||
| if chunk_id not in sres.ids: | |||
| return get_error_data_result(f"You don't own the chunk {chunk_id}") | |||
| req = request.json | |||
| content=res["_source"].get("content_with_weight") | |||
| d = { | |||
| "id": chunk_id, | |||
| "content_with_weight": req.get("content",res.get["content_with_weight"])} | |||
| d["content_ltks"] = rag_tokenizer.tokenize(req["content"]) | |||
| "content_with_weight": req.get("content",content)} | |||
| d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"]) | |||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||
| d["important_kwd"] = req.get("important_keywords",[]) | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) | |||
| if "important_keywords" in req: | |||
| if type(req["important_keywords"]) != list: | |||
| return get_error_data_result("`important_keywords` is required to be a list") | |||
| d["important_kwd"] = req.get("important_keywords") | |||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) | |||
| if "available" in req: | |||
| d["available_int"] = req["available"] | |||
| embd_id = DocumentService.get_embd_id(document_id) | |||
| @@ -478,7 +497,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id): | |||
| arr = [ | |||
| t for t in re.split( | |||
| r"[\n\t]", | |||
| req["content"]) if len(t) > 1] | |||
| d["content_with_weight"]) if len(t) > 1] | |||
| if len(arr) != 2: | |||
| return get_error_data_result( | |||
| retmsg="Q&A must be separated by TAB/ENTER key.") | |||
| @@ -486,7 +505,7 @@ def set(tenant_id,dataset_id,document_id,chunk_id): | |||
| d = beAdoc(d, arr[0], arr[1], not any( | |||
| [rag_tokenizer.is_chinese(t) for t in q + a])) | |||
| v, c = embd_mdl.encode([doc.name, req["content"]]) | |||
| v, c = embd_mdl.encode([doc.name, d["content_with_weight"]]) | |||
| v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] | |||
| d["q_%d_vec" % len(v)] = v.tolist() | |||
| ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) | |||
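For documents parsed with the Q&A method, the hunk above requires the updated content to split into exactly two non-trivial parts on tabs or newlines before it is re-embedded. A small standalone check mirroring that rule (the sample string is hypothetical):

```python
import re

content = "What is RAGFlow?\tAn open-source RAG engine."  # hypothetical Q&A chunk
arr = [t for t in re.split(r"[\n\t]", content) if len(t) > 1]
if len(arr) != 2:
    raise ValueError("Q&A must be separated by TAB/ENTER key.")
question, answer = arr
print(question, "->", answer)
```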
| @@ -505,7 +524,7 @@ def retrieval_test(tenant_id): | |||
| for id in kb_id: | |||
| if not KnowledgebaseService.query(id=id,tenant_id=tenant_id): | |||
| return get_error_data_result(f"You don't own the dataset {id}.") | |||
| if "question" not in req_json: | |||
| if "question" not in req: | |||
| return get_error_data_result("`question` is required.") | |||
| page = int(req.get("offset", 1)) | |||
| size = int(req.get("limit", 30)) | |||
| @@ -24,10 +24,9 @@ from api.utils import get_uuid | |||
| from api.utils.api_utils import get_error_data_result | |||
| from api.utils.api_utils import get_result, token_required | |||
| @manager.route('/chat/<chat_id>/session', methods=['POST']) | |||
| @token_required | |||
| def create(tenant_id, chat_id): | |||
| def create(tenant_id,chat_id): | |||
| req = request.json | |||
| req["dialog_id"] = chat_id | |||
| dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value) | |||
| @@ -51,14 +50,13 @@ def create(tenant_id, chat_id): | |||
| del conv["reference"] | |||
| return get_result(data=conv) | |||
| @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT']) | |||
| @token_required | |||
| def update(tenant_id, chat_id, session_id): | |||
| def update(tenant_id,chat_id,session_id): | |||
| req = request.json | |||
| req["dialog_id"] = chat_id | |||
| conv_id = session_id | |||
| conv = ConversationService.query(id=conv_id, dialog_id=chat_id) | |||
| conv = ConversationService.query(id=conv_id,dialog_id=chat_id) | |||
| if not conv: | |||
| return get_error_data_result(retmsg="Session does not exist") | |||
| if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): | |||
| @@ -74,16 +72,30 @@ def update(tenant_id, chat_id, session_id): | |||
| return get_result() | |||
| @manager.route('/chat/<chat_id>/session/<session_id>/completion', methods=['POST']) | |||
| @manager.route('/chat/<chat_id>/completion', methods=['POST']) | |||
| @token_required | |||
| def completion(tenant_id, chat_id, session_id): | |||
| def completion(tenant_id,chat_id): | |||
| req = request.json | |||
| # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [ | |||
| # {"role": "user", "content": "上海有吗?"} | |||
| # ]} | |||
| if not req.get("session_id"): | |||
| conv = { | |||
| "id": get_uuid(), | |||
| "dialog_id": chat_id, | |||
| "name": req.get("name", "New session"), | |||
| "message": [{"role": "assistant", "content": "Hi! I am your assistant,can I help you?"}] | |||
| } | |||
| if not conv.get("name"): | |||
| return get_error_data_result(retmsg="Name can not be empty.") | |||
| ConversationService.save(**conv) | |||
| e, conv = ConversationService.get_by_id(conv["id"]) | |||
| session_id=conv.id | |||
| else: | |||
| session_id = req.get("session_id") | |||
| if not req.get("question"): | |||
| return get_error_data_result(retmsg="Please input your question.") | |||
| conv = ConversationService.query(id=session_id, dialog_id=chat_id) | |||
| conv = ConversationService.query(id=session_id,dialog_id=chat_id) | |||
| if not conv: | |||
| return get_error_data_result(retmsg="Session does not exist") | |||
| conv = conv[0] | |||
| @@ -117,17 +129,18 @@ def completion(tenant_id, chat_id, session_id): | |||
| conv.message[-1] = {"role": "assistant", "content": ans["answer"], | |||
| "id": message_id, "prompt": ans.get("prompt", "")} | |||
| ans["id"] = message_id | |||
| ans["session_id"]=session_id | |||
| def stream(): | |||
| nonlocal dia, msg, req, conv | |||
| try: | |||
| for ans in chat(dia, msg, **req): | |||
| fillin_conv(ans) | |||
| yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n" | |||
| yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n" | |||
| ConversationService.update_by_id(conv.id, conv.to_dict()) | |||
| except Exception as e: | |||
| yield "data:" + json.dumps({"code": 500, "message": str(e), | |||
| "data": {"answer": "**ERROR**: " + str(e), "reference": []}}, | |||
| "data": {"answer": "**ERROR**: " + str(e),"reference": []}}, | |||
| ensure_ascii=False) + "\n\n" | |||
| yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n" | |||
| @@ -148,15 +161,14 @@ def completion(tenant_id, chat_id, session_id): | |||
| break | |||
| return get_result(data=answer) | |||
| @manager.route('/chat/<chat_id>/session', methods=['GET']) | |||
| @token_required | |||
| def list(chat_id, tenant_id): | |||
| def list(chat_id,tenant_id): | |||
| if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value): | |||
| return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.") | |||
| id = request.args.get("id") | |||
| name = request.args.get("name") | |||
| session = ConversationService.query(id=id, name=name, dialog_id=chat_id) | |||
| session = ConversationService.query(id=id,name=name,dialog_id=chat_id) | |||
| if not session: | |||
| return get_error_data_result(retmsg="The session doesn't exist") | |||
| page_number = int(request.args.get("page", 1)) | |||
| @@ -166,7 +178,7 @@ def list(chat_id, tenant_id): | |||
| desc = False | |||
| else: | |||
| desc = True | |||
| convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name) | |||
| convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name) | |||
| if not convs: | |||
| return get_result(data=[]) | |||
| for conv in convs: | |||
| @@ -201,17 +213,16 @@ def list(chat_id, tenant_id): | |||
| del conv["reference"] | |||
| return get_result(data=convs) | |||
| @manager.route('/chat/<chat_id>/session', methods=["DELETE"]) | |||
| @token_required | |||
| def delete(tenant_id, chat_id): | |||
| def delete(tenant_id,chat_id): | |||
| if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): | |||
| return get_error_data_result(retmsg="You don't own the chat") | |||
| ids = request.json.get("ids") | |||
| if not ids: | |||
| return get_error_data_result(retmsg="`ids` is required in deleting operation") | |||
| for id in ids: | |||
| conv = ConversationService.query(id=id, dialog_id=chat_id) | |||
| conv = ConversationService.query(id=id,dialog_id=chat_id) | |||
| if not conv: | |||
| return get_error_data_result(retmsg="The chat doesn't own the session") | |||
| ConversationService.delete_by_id(id) | |||
| @@ -61,14 +61,13 @@ class DocumentService(CommonService): | |||
| docs = docs.where( | |||
| fn.LOWER(cls.model.name).contains(keywords.lower()) | |||
| ) | |||
| count = docs.count() | |||
| if desc: | |||
| docs = docs.order_by(cls.model.getter_by(orderby).desc()) | |||
| else: | |||
| docs = docs.order_by(cls.model.getter_by(orderby).asc()) | |||
| docs = docs.paginate(page_number, items_per_page) | |||
| count = docs.count() | |||
| return list(docs.dicts()), count | |||
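The only change in this hunk is computing `count()` before `paginate()`. Counting after pagination reports at most one page of rows rather than the full match total, which is presumably why the call was moved. A framework-free toy illustration of the difference:

```python
# Toy illustration: counting after pagination only sees one page.
rows = list(range(95))                     # pretend these are matching documents
page_number, items_per_page = 2, 30

def paginate(seq, page, per_page):
    return seq[(page - 1) * per_page: page * per_page]

total_before = len(rows)                                        # 95: the real total
total_after = len(paginate(rows, page_number, items_per_page))  # 30: just the page
print(total_before, total_after)
```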
| @@ -432,18 +432,71 @@ The error response includes a JSON object like the following: | |||
| } | |||
| ``` | |||
| ## Delete files from a dataset | |||
| **DELETE** `/api/v1/dataset/{dataset_id}/document ` | |||
| Delete files from a dataset | |||
| ### Request | |||
| - Method: DELETE | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document` | |||
| - Headers: | |||
| - 'Content-Type: application/json' | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `ids`:List[str] | |||
| #### Request example | |||
| ```bash | |||
| curl --request DELETE \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{ | |||
| "ids": ["id_1","id_2"] | |||
| }' | |||
| ``` | |||
| #### Request parameters | |||
| - `"ids"`: (*Body parameter*) | |||
| The IDs of the documents to be deleted | |||
| ### Response | |||
| The successful response includes a JSON object like the following: | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| } | |||
| ``` | |||
| - `"error_code"`: `integer` | |||
| `0`: The operation succeeds. | |||
| The error response includes a JSON object like the following: | |||
| ```json | |||
| { | |||
| "code": 102, | |||
| "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005." | |||
| } | |||
| ``` | |||
| ## Download a file from a dataset | |||
| **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}` | |||
| Downloads files from a dataset. | |||
| Downloads a file from a dataset. | |||
| ### Request | |||
| - Method: GET | |||
| - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Output: | |||
| - '{FILE_NAME}' | |||
| @@ -451,10 +504,9 @@ Downloads files from a dataset. | |||
| ```bash | |||
| curl --request GET \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --output '{FILE_NAME}' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --output ./ragflow.txt | |||
| ``` | |||
| #### Request parameters | |||
| @@ -466,7 +518,7 @@ curl --request GET \ | |||
| ### Response | |||
| The successful response includes a JSON object like the following: | |||
| The successful response includes a text object like the following: | |||
| ```text | |||
| test_2. | |||
| @@ -596,92 +648,39 @@ Update a file in a dataset | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `name`:`string` | |||
| - `parser_method`:`string` | |||
| - `parser_config`:`dict` | |||
| #### Request example | |||
| ```bash | |||
| curl --request PUT \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --header 'Content-Type: application/json' \ | |||
| --data '{ | |||
| "name": "manual.txt", | |||
| "thumbnail": null, | |||
| "knowledgebase_id": "779333c0758611ef910f0242ac120004", | |||
| "parser_method": "manual", | |||
| "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}, | |||
| "source_type": "local", "type": "doc", | |||
| "created_by": "134408906b6811efbcd20242ac120005", | |||
| "size": 0, "token_count": 0, "chunk_count": 0, | |||
| "progress": 0.0, | |||
| "progress_msg": "", | |||
| "process_begin_at": null, | |||
| "process_duration": 0.0 | |||
| "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12} | |||
| }' | |||
| ``` | |||
| #### Request parameters | |||
| - `"thumbnail"`: (*Body parameter*) | |||
| Thumbnail image of the document. | |||
| - `""` | |||
| - `"knowledgebase_id"`: (*Body parameter*) | |||
| Knowledge base ID related to the document. | |||
| - `""` | |||
| - `"parser_method"`: (*Body parameter*) | |||
| Method used to parse the document. | |||
| - `""` | |||
| - `"parser_config"`: (*Body parameter*) | |||
| Configuration object for the parser. | |||
| - If the value is `None`, a dictionary with default values will be generated. | |||
| - `"source_type"`: (*Body parameter*) | |||
| Source type of the document. | |||
| - `""` | |||
| - `"type"`: (*Body parameter*) | |||
| Type or category of the document. | |||
| - `""` | |||
| - `"created_by"`: (*Body parameter*) | |||
| Creator of the document. | |||
| - `""` | |||
| - `"name"`: (*Body parameter*) | |||
| Name or title of the document. | |||
| - `""` | |||
| - `"size"`: (*Body parameter*) | |||
| Size of the document in bytes or some other unit. | |||
| - `0` | |||
| - `"token_count"`: (*Body parameter*) | |||
| Number of tokens in the document. | |||
| - `0` | |||
| - `"chunk_count"`: (*Body parameter*) | |||
| Number of chunks the document is split into. | |||
| - `0` | |||
| - `"progress"`: (*Body parameter*) | |||
| Current processing progress as a percentage. | |||
| - `0.0` | |||
| - `"progress_msg"`: (*Body parameter*) | |||
| Message indicating current progress status. | |||
| - `""` | |||
| - `"process_begin_at"`: (*Body parameter*) | |||
| Start time of the document processing. | |||
| - `None` | |||
| - `"process_duration"`: (*Body parameter*) | |||
| Duration of the processing in seconds or minutes. | |||
| - `0.0` | |||
| ### Response | |||
| @@ -712,34 +711,34 @@ Parse files into chunks in a dataset | |||
| ### Request | |||
| - Method: POST | |||
| - URL: `/api/v1/dataset/{dataset_id}/chunk` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk ` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `document_ids`:List[str] | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request POST \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] | |||
| }' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}' | |||
| ``` | |||
| #### Request parameters | |||
| - `"dataset_id"`: (*Path parameter*) | |||
| - `"documents"`: (*Body parameter*) | |||
| - Documents to parse | |||
| - `"document_ids"`:(*Body parameter*) | |||
| The ids of the documents to be parsed | |||
| ### Response | |||
| The successful response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| } | |||
| @@ -747,10 +746,10 @@ The successful response includes a JSON object like the following: | |||
| The error response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 3016, | |||
| "message": "Can't connect database" | |||
| "code": 102, | |||
| "message": "`document_ids` is required" | |||
| } | |||
| ``` | |||
| @@ -762,35 +761,35 @@ Stop file parsing | |||
| ### Request | |||
| - Method: POST | |||
| - URL: `/api/v1/dataset/{dataset_id}/chunk` | |||
| - Method: DELETE | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `document_ids`:List[str] | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request DELETE \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] | |||
| }' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}' | |||
| ``` | |||
| #### Request parameters | |||
| - `"dataset_id"`: (*Path parameter*) | |||
| - `"documents"`: (*Body parameter*) | |||
| - Documents to stop parsing | |||
| - `"document_ids"`:(*Body parameter*) | |||
| The ids of the documents to be parsed | |||
| ### Response | |||
| The successful response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| } | |||
| @@ -798,104 +797,98 @@ The successful response includes a JSON object like the following: | |||
| The error response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 3016, | |||
| "message": "Can't connect database" | |||
| "code": 102, | |||
| "message": "`document_ids` is required" | |||
| } | |||
| ``` | |||
| ## Get document chunk list | |||
| **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}` | |||
| Get document chunk list | |||
| ### Request | |||
| - Method: GET | |||
| - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request GET \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id} \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| ``` | |||
| #### Request parameters | |||
| - `"dataset_id"`: (*Path parameter*) | |||
| - `"document_id"`: (*Path parameter*) | |||
| - `"offset"`(*Filter parameter*) | |||
| The beginning number of records for paging. | |||
| - `"keywords"`(*Filter parameter*) | |||
| List chunks whose name has the given keywords | |||
| - `"limit"`(*Filter parameter*) | |||
| Records number to return | |||
| - `"id"`(*Filter parameter*) | |||
| The id of chunk to be got | |||
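All four filters travel as query-string parameters, so the same request can be issued from Python as well; a minimal sketch assuming the `requests` library, with placeholder address, IDs, and token (the `offset`/`limit` values shown are the documented defaults):

```python
import requests  # assumed dependency

resp = requests.get(
    "http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk",
    headers={"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"},
    params={"keywords": "ragflow", "offset": 1, "limit": 30},
)
data = resp.json()["data"]
print(data["total"], [c["id"] for c in data["chunks"]])
```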
| ### Response | |||
| The successful response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| "code": 0, | |||
| "data": { | |||
| "chunks": [ | |||
| { | |||
| "available_int": 1, | |||
| "content": "<em>advantag</em>of ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur", | |||
| "document_keyword": "ragflow_test.txt", | |||
| "document_id": "77df9ef4759a11ef8bdd0242ac120004", | |||
| "id": "4ab8c77cfac1a829c8d5ed022a0808c0", | |||
| "image_id": "", | |||
| "important_keywords": [], | |||
| "positions": [ | |||
| "" | |||
| ] | |||
| } | |||
| ], | |||
| "chunks": [], | |||
| "doc": { | |||
| "chunk_count": 5, | |||
| "create_date": "Wed, 18 Sep 2024 08:46:16 GMT", | |||
| "create_time": 1726649176833, | |||
| "created_by": "134408906b6811efbcd20242ac120005", | |||
| "id": "77df9ef4759a11ef8bdd0242ac120004", | |||
| "knowledgebase_id": "77d9d24e759a11ef880c0242ac120004", | |||
| "location": "ragflow_test.txt", | |||
| "name": "ragflow_test.txt", | |||
| "chunk_num": 0, | |||
| "create_date": "Sun, 29 Sep 2024 03:47:29 GMT", | |||
| "create_time": 1727581649216, | |||
| "created_by": "69736c5e723611efb51b0242ac120007", | |||
| "id": "8cb781ec7e1511ef98ac0242ac120006", | |||
| "kb_id": "c7ee74067a2c11efb21c0242ac120006", | |||
| "location": "明天的天气是晴天.txt", | |||
| "name": "明天的天气是晴天.txt", | |||
| "parser_config": { | |||
| "chunk_token_count": 128, | |||
| "delimiter": "\n!?。;!?", | |||
| "layout_recognize": true, | |||
| "task_page_size": 12 | |||
| "pages": [ | |||
| [ | |||
| 1, | |||
| 1000000 | |||
| ] | |||
| ] | |||
| }, | |||
| "parser_method": "naive", | |||
| "process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT", | |||
| "process_duation": 7.3213, | |||
| "progress": 1.0, | |||
| "progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!", | |||
| "run": "3", | |||
| "size": 4209, | |||
| "parser_id": "naive", | |||
| "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT", | |||
| "process_duation": 1435.37, | |||
| "progress": 0.0370833, | |||
| "progress_msg": "\nTask has been received.", | |||
| "run": "1", | |||
| "size": 24, | |||
| "source_type": "local", | |||
| "status": "1", | |||
| "thumbnail": null, | |||
| "token_count": 746, | |||
| "token_num": 0, | |||
| "type": "doc", | |||
| "update_date": "Wed, 18 Sep 2024 08:46:23 GMT", | |||
| "update_time": 1726649183321 | |||
| "update_date": "Tue, 15 Oct 2024 10:47:46 GMT", | |||
| "update_time": 1728989266371 | |||
| }, | |||
| "total": 1 | |||
| }, | |||
| "total": 0 | |||
| } | |||
| } | |||
| ``` | |||
| The error response includes a JSON object like the following: | |||
| ```shell | |||
| ```json | |||
| { | |||
| "code": 3016, | |||
| "message": "Can't connect database" | |||
| "code": 102, | |||
| "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5." | |||
| } | |||
| ``` | |||
| @@ -908,55 +901,96 @@ Delete document chunks | |||
| ### Request | |||
| - Method: DELETE | |||
| - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `chunk_ids`:List[str] | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request DELETE \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"] | |||
| }' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{ | |||
| "chunk_ids": ["test_1", "test_2"] | |||
| }' | |||
| ``` | |||
| #### Request parameters | |||
| - `"chunk_ids"`:(*Body parameter*) | |||
| The chunks of the document to be deleted | |||
| ### Response | |||
| Success | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| } | |||
| ``` | |||
| Error | |||
| ```json | |||
| { | |||
| "code": 102, | |||
| "message": "`chunk_ids` is required" | |||
| } | |||
| ``` | |||
| ## Update document chunk | |||
| **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| **PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}` | |||
| Update document chunk | |||
| ### Request | |||
| - Method: PUT | |||
| - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `content`:str | |||
| - `important_keywords`:str | |||
| - `available`:int | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request PUT \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "chunk_id": "d87fb0b7212c15c18d0831677552d7de", | |||
| "knowledgebase_id": null, | |||
| "name": "", | |||
| "content": "ragflow123", | |||
| "important_keywords": [], | |||
| "document_id": "e6bbba92759511efaa900242ac120004", | |||
| "status": "1" | |||
| }' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{ | |||
| "content": "ragflow123", | |||
| "important_keywords": [] | |||
| }' | |||
| ``` | |||
| #### Request parameters | |||
| - `"content"`:(*Body parameter*) | |||
| Contains the main text or information of the chunk. | |||
| - `"important_keywords"`:(*Body parameter*) | |||
| list the key terms or phrases that are significant or central to the chunk's content. | |||
| - `"available"`:(*Body parameter*) | |||
| Indicating the availability status, 0 means unavailable and 1 means available. | |||
| ### Response | |||
| Success | |||
| ```json | |||
| { | |||
| "code": 0 | |||
| } | |||
| ``` | |||
| Error | |||
| ```json | |||
| { | |||
| "code": 102, | |||
| "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2" | |||
| } | |||
| ``` | |||
| ## Insert document chunks | |||
| **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| @@ -966,50 +1000,187 @@ Insert document chunks | |||
| ### Request | |||
| - Method: POST | |||
| - URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `content`: str | |||
| - `important_keywords`:List[str] | |||
| #### Request example | |||
| ```shell | |||
| ```bash | |||
| curl --request POST \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "document_id": "97ad64b6759811ef9fc30242ac120004", | |||
| "content": ["ragflow content", "ragflow content"] | |||
| }' | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{ | |||
| "content": "ragflow content" | |||
| }' | |||
| ``` | |||
| #### Request parameters | |||
| - `content`: (*Body parameter*) | |||
| Contains the main text or information of the chunk. | |||
| - `important_keywords`: (*Body parameter*) | |||
| Lists the key terms or phrases that are significant or central to the chunk's content. | |||
| ### Response | |||
| Success | |||
| ```json | |||
| { | |||
| "code": 0, | |||
| "data": { | |||
| "chunk": { | |||
| "content": "ragflow content", | |||
| "create_time": "2024-10-16 08:05:04", | |||
| "create_timestamp": 1729065904.581025, | |||
| "dataset_id": [ | |||
| "c7ee74067a2c11efb21c0242ac120006" | |||
| ], | |||
| "document_id": "5c5999ec7be811ef9cab0242ac120005", | |||
| "id": "d78435d142bd5cf6704da62c778795c5", | |||
| "important_keywords": [] | |||
| } | |||
| } | |||
| } | |||
| ``` | |||
| Error | |||
| ```json | |||
| { | |||
| "code": 102, | |||
| "message": "`content` is required" | |||
| } | |||
| ``` | |||
| ## Dataset retrieval test | |||
| **GET** `/api/v1/dataset/{dataset_id}/retrieval` | |||
| **GET** `/api/v1/retrieval` | |||
| Retrieval test of a dataset | |||
| ### Request | |||
| - Method: GET | |||
| - URL: `/api/v1/dataset/{dataset_id}/retrieval` | |||
| - Method: POST | |||
| - URL: `http://{address}/api/v1/retrieval` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `question`: str | |||
| - `datasets`: List[str] | |||
| - `documents`: List[str] | |||
| - `offset`: int | |||
| - `limit`: int | |||
| - `similarity_threshold`: float | |||
| - `vector_similarity_weight`: float | |||
| - `top_k`: int | |||
| - `rerank_id`: string | |||
| - `keyword`: bool | |||
| - `highlight`: bool | |||
| #### Request example | |||
| ```shell | |||
| curl --request GET \ | |||
| --url http://{address}/api/v1/dataset/{dataset_id}/retrieval \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| --raw '{ | |||
| "query_text": "This is a cat." | |||
| }' | |||
| ```bash | |||
| curl --request POST \ | |||
| --url http://{address}/api/v1/retrieval \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data '{ | |||
| "question": "What is advantage of ragflow?", | |||
| "datasets": [ | |||
| "b2a62730759d11ef987d0242ac120004" | |||
| ], | |||
| "documents": [ | |||
| "77df9ef4759a11ef8bdd0242ac120004" | |||
| ] | |||
| }' | |||
| ``` | |||
| #### Request parameter | |||
| - `"question"`: (*Body parameter*) | |||
| User's question, search keywords | |||
| `""` | |||
| - `"datasets"`: (*Body parameter*) | |||
| The scope of datasets | |||
| `None` | |||
| - `"documents"`: (*Body parameter*) | |||
| The scope of documents. `None` means no limitation | |||
| `None` | |||
| - `"offset"`: (*Body parameter*) | |||
| The beginning point of retrieved records | |||
| `1` | |||
| - `"limit"`: (*Body parameter*) | |||
| The maximum number of records needed to return | |||
| `30` | |||
| - `"similarity_threshold"`: (*Body parameter*) | |||
| The minimum similarity score | |||
| `0.2` | |||
| - `"vector_similarity_weight"`: (*Body parameter*) | |||
| The weight of vector cosine similarity, `1 - x` is the term similarity weight | |||
| `0.3` | |||
| - `"top_k"`: (*Body parameter*) | |||
| Number of records engaged in vector cosine computation | |||
| `1024` | |||
| - `"rerank_id"`: (*Body parameter*) | |||
| ID of the rerank model | |||
| `None` | |||
| - `"keyword"`: (*Body parameter*) | |||
| Whether keyword-based matching is enabled | |||
| `False` | |||
| - `"highlight"`: (*Body parameter*) | |||
| Whether to enable highlighting of matched terms in the results | |||
| `False` | |||
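An equivalent call from Python that spells out a few of the optional parameters (a sketch assuming the `requests` library; the numeric values are the documented defaults and the IDs are placeholders):

```python
import requests  # assumed dependency

payload = {
    "question": "What is advantage of ragflow?",
    "datasets": ["b2a62730759d11ef987d0242ac120004"],
    "similarity_threshold": 0.2,       # documented default
    "vector_similarity_weight": 0.3,   # documented default
    "top_k": 1024,                     # documented default
    "highlight": True,
}
resp = requests.post(
    "http://{address}/api/v1/retrieval",
    json=payload,
    headers={"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"},
)
for chunk in resp.json()["data"]["chunks"]:
    print(chunk["similarity"], chunk["content"])
```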
| ### Response | |||
| Success | |||
| ```json | |||
| { | |||
| "code": 0, | |||
| "data": { | |||
| "chunks": [ | |||
| { | |||
| "content": "ragflow content", | |||
| "content_ltks": "ragflow content", | |||
| "document_id": "5c5999ec7be811ef9cab0242ac120005", | |||
| "document_keyword": "1.txt", | |||
| "highlight": "<em>ragflow</em> content", | |||
| "id": "d78435d142bd5cf6704da62c778795c5", | |||
| "img_id": "", | |||
| "important_keywords": [ | |||
| "" | |||
| ], | |||
| "kb_id": "c7ee74067a2c11efb21c0242ac120006", | |||
| "positions": [ | |||
| "" | |||
| ], | |||
| "similarity": 0.9669436601210759, | |||
| "term_similarity": 1.0, | |||
| "vector_similarity": 0.8898122004035864 | |||
| } | |||
| ], | |||
| "doc_aggs": [ | |||
| { | |||
| "count": 1, | |||
| "doc_id": "5c5999ec7be811ef9cab0242ac120005", | |||
| "doc_name": "1.txt" | |||
| } | |||
| ], | |||
| "total": 1 | |||
| } | |||
| } | |||
| ``` | |||
| Error | |||
| ```json | |||
| { | |||
| "code": 102, | |||
| "message": "`datasets` is required." | |||
| } | |||
| ``` | |||
| ## Create chat | |||
| **POST** `/api/v1/chat` | |||
| @@ -1708,26 +1879,27 @@ Error | |||
| ## Chat with a chat session | |||
| **POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion` | |||
| **POST** `/api/v1/chat/{chat_id}/completion` | |||
| Chat with a chat session | |||
| ### Request | |||
| - Method: POST | |||
| - URL: `http://{address}/api/v1/chat/{chat_id}/session/{session_id}/completion` | |||
| - URL: `http://{address}/api/v1/chat/{chat_id}/completion` | |||
| - Headers: | |||
| - `content-Type: application/json` | |||
| - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' | |||
| - Body: | |||
| - `question`: string | |||
| - `stream`: bool | |||
| - `session_id`: str | |||
| #### Request example | |||
| ```bash | |||
| curl --request POST \ | |||
| --url http://{address}/api/v1/chat/{chat_id}/session/{session_id}/completion \ | |||
| --url http://{address}/api/v1/chat/{chat_id}/completion \ | |||
| --header 'Content-Type: application/json' \ | |||
| --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ | |||
| --data-binary '{ | |||
| @@ -1743,6 +1915,8 @@ curl --request POST \ | |||
| - `stream`: (*Body Parameter*) | |||
| The approach of streaming text generation. | |||
| `False` | |||
| - `session_id`: (*Body Parameter*) | |||
| The ID of the session. If not provided, a new session will be generated. | |||
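Because `session_id` is optional, a first call without it creates a session and echoes its ID back in `data.session_id`; later turns pass that ID to stay in the same conversation. A non-streaming sketch assuming the `requests` library and placeholder address/IDs/token:

```python
import requests  # assumed dependency

url = "http://{address}/api/v1/chat/{chat_id}/completion"
headers = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}

# First turn: no session_id, so the server creates a new session.
first = requests.post(url, headers=headers,
                      json={"question": "Hi, who are you?", "stream": False}).json()
session_id = first["data"]["session_id"]

# Follow-up turn: reuse the returned session_id.
follow = requests.post(url, headers=headers,
                       json={"question": "What can you do?", "stream": False,
                             "session_id": session_id}).json()
print(follow["data"]["answer"])
```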
| ### Response | |||
| Success | |||
| ```json | |||
| @@ -244,42 +244,117 @@ File management inside knowledge base | |||
| ## Upload document | |||
| ```python | |||
| RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool | |||
| DataSet.upload_documents(document_list: List[dict]) | |||
| ``` | |||
| ### Parameters | |||
| #### name | |||
| #### document_list:`List[dict]` | |||
| A list composed of dicts containing `name` and `blob`. | |||
| #### blob | |||
| ### Returns | |||
| no return | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds = rag.create_dataset(name="kb_1") | |||
| ds.upload_documents([{"name": "1.txt", "blob": "123"}, ...]) | |||
| ``` | |||
| --- | |||
| ## Update document | |||
| ```python | |||
| Document.update(update_message:dict) | |||
| ``` | |||
| ### Parameters | |||
| #### update_message:`dict` | |||
| Only `name`, `parser_config`, and `parser_method` can be changed | |||
| ### Returns | |||
| no return | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds=rag.list_datasets(id='id') | |||
| ds=ds[0] | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| doc.update({"parser_method": "manual", ...}) | |||
| ``` | |||
| --- | |||
| ## Download document | |||
| ```python | |||
| Document.download() -> bytes | |||
| ``` | |||
| ### Returns | |||
| bytes of the document. | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds=rag.list_datasets(id="id") | |||
| ds=ds[0] | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| open("~/ragflow.txt", "wb+").write(doc.download()) | |||
| print(doc) | |||
| ``` | |||
| --- | |||
| ## Retrieve document | |||
| ## List documents | |||
| ```python | |||
| RAGFlow.get_document(id:str=None,name:str=None) -> Document | |||
| Dataset.list_documents(id:str =None, keywords: str=None, offset: int=0, limit:int = 1024,order_by:str = "create_time", desc: bool = True) -> List[Document] | |||
| ``` | |||
| ### Parameters | |||
| #### id: `str`, *Required* | |||
| #### id: `str` | |||
| ID of the document to retrieve. | |||
| The ID of the document to retrieve. Defaults to `None`. | |||
| #### name: `str` | |||
| #### keywords: `str` | |||
| List documents whose name has the given keywords. Defaults to `None`. | |||
| #### offset: `int` | |||
| The beginning number of records for paging. Defaults to `0`. | |||
| Name or title of the document. | |||
| #### limit: `int` | |||
| The number of records to return; `-1` means all of them. Defaults to `1024`. | |||
| #### orderby: `str` | |||
| The field by which the records should be sorted. This specifies the attribute or column used to order the results. | |||
| #### desc:`bool` | |||
| A boolean flag indicating whether the sorting should be in descending order. | |||
| ### Returns | |||
| List[Document] | |||
| A document object containing the following attributes: | |||
| #### id: `str` | |||
| @@ -352,98 +427,14 @@ Duration of the processing in seconds or minutes. Defaults to `0.0`. | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt') | |||
| print(doc) | |||
| ``` | |||
| --- | |||
| ## Save document settings | |||
| ```python | |||
| Document.save() -> bool | |||
| ``` | |||
| ### Returns | |||
| bool | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d") | |||
| doc.parser_method= "manual" | |||
| doc.save() | |||
| ``` | |||
| --- | |||
| ## Download document | |||
| ```python | |||
| Document.download() -> bytes | |||
| ``` | |||
| ### Returns | |||
| bytes of the document. | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d") | |||
| open("~/ragflow.txt", "w+").write(doc.download()) | |||
| print(doc) | |||
| ``` | |||
| --- | |||
| ## List documents | |||
| ```python | |||
| Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document] | |||
| ``` | |||
| ### Parameters | |||
| #### keywords: `str` | |||
| List documents whose name has the given keywords. Defaults to `None`. | |||
| #### offset: `int` | |||
| The beginning number of records for paging. Defaults to `0`. | |||
| #### limit: `int` | |||
| Records number to return, -1 means all of them. Records number to return, -1 means all of them. | |||
| ### Returns | |||
| List[Document] | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds = rag.create_dataset(name="kb_1") | |||
| filename1 = "~/ragflow.txt" | |||
| rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) | |||
| filename2 = "~/infinity.txt" | |||
| rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) | |||
| for d in ds.list_docs(keywords="rag", offset=0, limit=12): | |||
| blob=open(filename1 , "rb").read() | |||
| list_files=[{"name":filename1,"blob":blob}] | |||
| ds.upload_documents(list_files) | |||
| for d in ds.list_documents(keywords="rag", offset=0, limit=12): | |||
| print(d) | |||
| ``` | |||
| @@ -452,12 +443,11 @@ for d in ds.list_docs(keywords="rag", offset=0, limit=12): | |||
| ## Delete documents | |||
| ```python | |||
| Document.delete() -> bool | |||
| DataSet.delete_documents(ids: List[str] = None) | |||
| ``` | |||
| ### Returns | |||
| bool | |||
| description: delete success or not | |||
| no return | |||
| ### Examples | |||
| @@ -465,119 +455,87 @@ description: delete success or not | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds = rag.create_dataset(name="kb_1") | |||
| filename1 = "~/ragflow.txt" | |||
| rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) | |||
| filename2 = "~/infinity.txt" | |||
| rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) | |||
| for d in ds.list_docs(keywords="rag", offset=0, limit=12): | |||
| d.delete() | |||
| ds = rag.list_datasets(name="kb_1") | |||
| ds = ds[0] | |||
| ds.delete_documents(ids=["id_1","id_2"]) | |||
| ``` | |||
| --- | |||
| ## Parse document | |||
| ## Parse and stop parsing document | |||
| ```python | |||
| Document.async_parse() -> None | |||
| RAGFLOW.async_parse_documents() -> None | |||
| DataSet.async_parse_documents(document_ids:List[str]) -> None | |||
| DataSet.async_cancel_parse_documents(document_ids:List[str])-> None | |||
| ``` | |||
| ### Parameters | |||
| #### document_ids:`List[str]` | |||
| The ids of the documents to be parsed | |||
| ???????????????????????????????????????????????????? | |||
| ### Returns | |||
| no return | |||
| ???????????????????????????????????????????????????? | |||
| ### Examples | |||
| ```python | |||
| #document parse and cancel | |||
| rag = RAGFlow(API_KEY, HOST_ADDRESS) | |||
| ds = rag.create_dataset(name="dataset_name") | |||
| name3 = 'ai.pdf' | |||
| path = 'test_data/ai.pdf' | |||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||
| doc = rag.get_document(name="ai.pdf") | |||
| doc.async_parse() | |||
| print("Async parsing initiated") | |||
| ``` | |||
| --- | |||
| ## Cancel document parsing | |||
| ```python | |||
| rag.async_cancel_parse_documents(ids) | |||
| RAGFLOW.async_cancel_parse_documents()-> None | |||
| ``` | |||
| ### Parameters | |||
| #### ids, `list[]` | |||
| ### Returns | |||
| ????????????????????????????????????????????????? | |||
| ### Examples | |||
| ```python | |||
| #documents parse and cancel | |||
| rag = RAGFlow(API_KEY, HOST_ADDRESS) | |||
| ds = rag.create_dataset(name="God5") | |||
| documents = [ | |||
| {'name': 'test1.txt', 'path': 'test_data/test1.txt'}, | |||
| {'name': 'test2.txt', 'path': 'test_data/test2.txt'}, | |||
| {'name': 'test3.txt', 'path': 'test_data/test3.txt'} | |||
| {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()}, | |||
| {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()}, | |||
| {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()} | |||
| ] | |||
| # Create documents in bulk | |||
| for doc_info in documents: | |||
| with open(doc_info['path'], "rb") as file: | |||
| created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read()) | |||
| docs = [rag.get_document(name=doc_info['name']) for doc_info in documents] | |||
| ids = [doc.id for doc in docs] | |||
| rag.async_parse_documents(ids) | |||
| ds.upload_documents(documents) | |||
| documents=ds.list_documents(keywords="test") | |||
| ids=[] | |||
| for document in documents: | |||
| ids.append(document.id) | |||
| ds.async_parse_documents(ids) | |||
| print("Async bulk parsing initiated") | |||
| for doc in docs: | |||
| for progress, msg in doc.join(interval=5, timeout=10): | |||
| print(f"{doc.name}: Progress: {progress}, Message: {msg}") | |||
| cancel_result = rag.async_cancel_parse_documents(ids) | |||
| ds.async_cancel_parse_documents(ids) | |||
| print("Async bulk parsing cancelled") | |||
| ``` | |||
| --- | |||
| ## Join document | |||
| ?????????????????? | |||
| ## List chunks | |||
| ```python | |||
| Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]] | |||
| Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id : str = None) -> List[Chunk] | |||
| ``` | |||
| ### Parameters | |||
| #### interval: `int` | |||
| - `keywords`: `str` | |||
| List chunks whose name has the given keywords | |||
| default: `None` | |||
| Time interval in seconds for progress report. Defaults to `15`. | |||
| - `offset`: `int` | |||
| The beginning number of records for paging | |||
| default: `1` | |||
| #### timeout: `int` | |||
| Timeout in seconds. Defaults to `3600`. | |||
| - `limit`: `int` | |||
| Records number to return | |||
| default: `30` | |||
| - `id`: `str` | |||
| The ID of the chunk to be retrieved | |||
| default: `None` | |||
| ### Returns | |||
| List[Chunk] | |||
| iteral[Tuple[float, str]] | |||
| ### Examples | |||
| ```python | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds = rag.list_datasets("123") | |||
| ds = ds[0] | |||
| ds.async_parse_documents(["wdfxb5t547d"]) | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| for c in doc.list_chunks(keywords="rag", offset=0, limit=12): | |||
| print(c) | |||
| ``` | |||
| ## Add chunk | |||
| ```python | |||
| @@ -587,6 +545,9 @@ Document.add_chunk(content:str) -> Chunk | |||
| ### Parameters | |||
| #### content: `str`, *Required* | |||
| Contains the main text or information of the chunk. | |||
| #### important_keywords :`List[str]` | |||
| Lists the key terms or phrases that are significant or central to the chunk's content. | |||
| ### Returns | |||
| @@ -598,7 +559,10 @@ chunk | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d") | |||
| ds = rag.list_datasets(id="123") | |||
| ds = ds[0] | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| chunk = doc.add_chunk(content="xxxxxxx") | |||
| ``` | |||
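| Because `add_chunk` returns the created `Chunk`, it can be tagged immediately. A minimal sketch, assuming the `important_keywords` field accepted by `Chunk.update()` (documented below) also applies to freshly created chunks: | |||
| ```python | |||
| # Add a chunk, then attach key terms to it via Chunk.update() | |||
| chunk = doc.add_chunk(content="RAGFlow exposes a chunk-level API for fine-grained control.") | |||
| chunk.update({"important_keywords": ["RAGFlow", "chunk API"]}) | |||
| print(chunk.id) | |||
| ``` | |||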
| @@ -607,12 +571,15 @@ chunk = doc.add_chunk(content="xxxxxxx") | |||
| ## Delete chunk | |||
| ```python | |||
| Chunk.delete() -> bool | |||
| Document.delete_chunks(chunk_ids: List[str]) | |||
| ``` | |||
| ### Parameters | |||
| #### chunk_ids:`List[str]` | |||
| The list of IDs of the chunks to delete | |||
| ### Returns | |||
| bool | |||
| no return | |||
| ### Examples | |||
| @@ -620,22 +587,34 @@ bool | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d") | |||
| ds = rag.list_datasets(id="123") | |||
| ds = ds[0] | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| chunk = doc.add_chunk(content="xxxxxxx") | |||
| chunk.delete() | |||
| doc.delete_chunks(["id_1","id_2"]) | |||
| ``` | |||
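| To prune chunks in bulk, `list_chunks` and `delete_chunks` can be combined. A minimal sketch, assuming `doc` has already been parsed and "obsolete" is just an example keyword: | |||
| ```python | |||
| # Delete every chunk of `doc` whose content matches the keyword "obsolete" | |||
| stale_ids = [c.id for c in doc.list_chunks(keywords="obsolete", offset=0, limit=30)] | |||
| if stale_ids: | |||
|     doc.delete_chunks(stale_ids) | |||
|     print(f"Deleted {len(stale_ids)} chunks") | |||
| ``` | |||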
| --- | |||
| ## Save chunk contents | |||
| ## Update chunk | |||
| ```python | |||
| Chunk.save() -> bool | |||
| Chunk.update(update_message: dict) | |||
| ``` | |||
| ### Parameters | |||
| - `content`: `str` | |||
| Contains the main text or information of the chunk | |||
| - `important_keywords`: `List[str]` | |||
| List the key terms or phrases that are significant or central to the chunk's content | |||
| - `available`: `int` | |||
| Indicating the availability status, `0` means unavailable and `1` means available | |||
| ### Returns | |||
| bool | |||
| no return | |||
| ### Examples | |||
| @@ -643,10 +622,12 @@ bool | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| doc = rag.get_document(id="wdfxb5t547d") | |||
| ds = rag.list_datasets(id="123") | |||
| ds = ds[0] | |||
| doc = ds.list_documents(id="wdfxb5t547d") | |||
| doc = doc[0] | |||
| chunk = doc.add_chunk(content="xxxxxxx") | |||
| chunk.content = "sdfx" | |||
| chunk.save() | |||
| chunk.update({"content":"sdfx...}) | |||
| ``` | |||
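| Besides rewriting `content`, the documented `available` flag can be toggled to temporarily exclude a chunk from retrieval. A minimal sketch, reusing the `chunk` from the example above: | |||
| ```python | |||
| chunk.update({"available": 0})  # 0: hide the chunk from retrieval | |||
| chunk.update({"available": 1, "important_keywords": ["ragflow"]})  # 1: make it available again | |||
| ``` | |||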
| --- | |||
| @@ -654,7 +635,7 @@ chunk.save() | |||
| ## Retrieval | |||
| ```python | |||
| RAGFlow.retrieval(question:str, datasets:List[Dataset], document=List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk] | |||
| RAGFlow.retrieve(question: str = "", datasets: List[str] = None, documents: List[str] = None, offset: int = 1, limit: int = 30, similarity_threshold: float = 0.2, vector_similarity_weight: float = 0.3, top_k: int = 1024, rerank_id: str = None, keyword: bool = False, highlight: bool = False) -> List[Chunk] | |||
| ``` | |||
| ### Parameters | |||
| @@ -691,6 +672,15 @@ The weight of vector cosine similarity, 1 - x is the term similarity weight. Def | |||
| Number of records engaged in vector cosine computation. Defaults to `1024`. | |||
| #### rerank_id:`str` | |||
| ID of the rerank model. Defaults to `None`. | |||
| #### keyword:`bool` | |||
| Indicates whether keyword-based matching is enabled (`True`) or disabled (`False`). Defaults to `False`. | |||
| #### highlight:`bool` | |||
| Specifies whether matched terms are highlighted in the results (`True`) or not (`False`). Defaults to `False`. | |||
| ### Returns | |||
| List[Chunk] | |||
| @@ -701,18 +691,17 @@ List[Chunk] | |||
| from ragflow import RAGFlow | |||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||
| ds = rag.get_dataset(name="ragflow") | |||
| ds = rag.list_datasets(name="ragflow") | |||
| ds = ds[0] | |||
| name = 'ragflow_test.txt' | |||
| path = 'test_data/ragflow_test.txt' | |||
| path = './test_data/ragflow_test.txt' | |||
| rag.create_document(ds, name=name, blob=open(path, "rb").read()) | |||
| doc = rag.get_document(name=name) | |||
| doc.async_parse() | |||
| # Wait for parsing to complete | |||
| for progress, msg in doc.join(interval=5, timeout=30): | |||
| print(progress, msg) | |||
| for c in rag.retrieval(question="What's ragflow?", | |||
| datasets=[ds], documents=[doc], | |||
| offset=0, limit=6, similarity_threshold=0.1, | |||
| doc = ds.list_documents(name=name) | |||
| doc = doc[0] | |||
| ds.async_parse_documents([doc.id]) | |||
| for c in rag.retrieve(question="What's ragflow?", | |||
| datasets=[ds.id], documents=[doc.id], | |||
| offset=1, limit=30, similarity_threshold=0.2, | |||
| vector_similarity_weight=0.3, | |||
| top_k=1024 | |||
| ): | |||
| @@ -17,32 +17,11 @@ class Chunk(Base): | |||
| res_dict.pop(k) | |||
| super().__init__(rag, res_dict) | |||
| def delete(self) -> bool: | |||
| """ | |||
| Delete the chunk in the document. | |||
| """ | |||
| res = self.post('/doc/chunk/rm', | |||
| {"document_id": self.document_id, 'chunk_ids': [self.id]}) | |||
| res = res.json() | |||
| if res.get("retmsg") == "success": | |||
| return True | |||
| raise Exception(res["retmsg"]) | |||
| def save(self) -> bool: | |||
| """ | |||
| Save the document details to the server. | |||
| """ | |||
| res = self.post('/doc/chunk/set', | |||
| {"chunk_id": self.id, | |||
| "knowledgebase_id": self.knowledgebase_id, | |||
| "name": self.document_name, | |||
| "content": self.content, | |||
| "important_keywords": self.important_keywords, | |||
| "document_id": self.document_id, | |||
| "available": self.available, | |||
| }) | |||
| def update(self,update_message:dict): | |||
| res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}",update_message) | |||
| res = res.json() | |||
| if res.get("retmsg") == "success": | |||
| return True | |||
| raise Exception(res["retmsg"]) | |||
| if res.get("code") != 0 : | |||
| raise Exception(res["message"]) | |||
| @@ -65,3 +65,14 @@ class DataSet(Base): | |||
| if res.get("code") != 0: | |||
| raise Exception(res["message"]) | |||
| def async_parse_documents(self,document_ids): | |||
| res = self.post(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) | |||
| res = res.json() | |||
| if res.get("code") != 0: | |||
| raise Exception(res.get("message")) | |||
| def async_cancel_parse_documents(self,document_ids): | |||
| res = self.rm(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) | |||
| res = res.json() | |||
| if res.get("code") != 0: | |||
| raise Exception(res.get("message")) | |||
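| A short usage sketch for the two new `DataSet` methods above, assuming `ds` is an existing `DataSet` and `"doc_id_1"` stands in for a real document ID; both methods raise `Exception` carrying the server's `message` when the returned `code` is non-zero: | |||
| ```python | |||
| try: | |||
|     ds.async_parse_documents(["doc_id_1"]) | |||
|     ds.async_cancel_parse_documents(["doc_id_1"]) | |||
| except Exception as e: | |||
|     print(f"Chunk API call failed: {e}") | |||
| ``` | |||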
| @@ -1,7 +1,10 @@ | |||
| import time | |||
| from PIL.ImageFile import raise_oserror | |||
| from .base import Base | |||
| from .chunk import Chunk | |||
| from typing import List | |||
| class Document(Base): | |||
| @@ -29,160 +32,28 @@ class Document(Base): | |||
| res_dict.pop(k) | |||
| super().__init__(rag, res_dict) | |||
| def update(self,update_message:dict) -> bool: | |||
| """ | |||
| Save the document details to the server. | |||
| """ | |||
| res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message) | |||
| res = res.json() | |||
| if res.get("code") != 0: | |||
| raise Exception(res["message"]) | |||
| def delete(self) -> bool: | |||
| """ | |||
| Delete the document from the server. | |||
| """ | |||
| res = self.rm('/doc/delete', | |||
| {"document_id": self.id}) | |||
| def list_chunks(self,offset=0, limit=30, keywords="", id:str=None): | |||
| data={"document_id": self.id,"keywords": keywords,"offset":offset,"limit":limit,"id":id} | |||
| res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data) | |||
| res = res.json() | |||
| if res.get("retmsg") == "success": | |||
| return True | |||
| raise Exception(res["retmsg"]) | |||
| def download(self) -> bytes: | |||
| """ | |||
| Download the document content from the server using the Flask API. | |||
| :return: The downloaded document content in bytes. | |||
| """ | |||
| # Construct the URL for the API request using the document ID and knowledge base ID | |||
| res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}") | |||
| # Check the response status code to ensure the request was successful | |||
| if res.status_code == 200: | |||
| # Return the document content as bytes | |||
| return res.content | |||
| else: | |||
| # Handle the error and raise an exception | |||
| raise Exception( | |||
| f"Failed to download document. Server responded with: {res.status_code}, {res.text}" | |||
| ) | |||
| def async_parse(self): | |||
| """ | |||
| Initiate document parsing asynchronously without waiting for completion. | |||
| """ | |||
| try: | |||
| # Construct request data including document ID and run status (assuming 1 means to run) | |||
| data = {"document_ids": [self.id], "run": 1} | |||
| # Send a POST request to the specified parsing status endpoint to start parsing | |||
| res = self.post(f'/doc/run', data) | |||
| # Check the server response status code | |||
| if res.status_code != 200: | |||
| raise Exception(f"Failed to start async parsing: {res.text}") | |||
| print("Async parsing started successfully.") | |||
| except Exception as e: | |||
| # Catch and handle exceptions | |||
| print(f"Error occurred during async parsing: {str(e)}") | |||
| raise | |||
| import time | |||
| def join(self, interval=5, timeout=3600): | |||
| """ | |||
| Wait for the asynchronous parsing to complete and yield parsing progress periodically. | |||
| :param interval: The time interval (in seconds) for progress reports. | |||
| :param timeout: The timeout (in seconds) for the parsing operation. | |||
| :return: An iterator yielding parsing progress and messages. | |||
| """ | |||
| start_time = time.time() | |||
| while time.time() - start_time < timeout: | |||
| # Check the parsing status | |||
| res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]}) | |||
| res_data = res.json() | |||
| data = res_data.get("data", []) | |||
| # Retrieve progress and status message | |||
| progress = data.get("progress", 0) | |||
| progress_msg = data.get("status", "") | |||
| if res.get("code") == 0: | |||
| chunks=[] | |||
| for data in res["data"].get("chunks"): | |||
| chunk = Chunk(self.rag,data) | |||
| chunks.append(chunk) | |||
| return chunks | |||
| raise Exception(res.get("message")) | |||
| yield progress, progress_msg # Yield progress and message | |||
| if progress == 100: # Parsing completed | |||
| break | |||
| time.sleep(interval) | |||
| def cancel(self): | |||
| """ | |||
| Cancel the parsing task for the document. | |||
| """ | |||
| try: | |||
| # Construct request data, including document ID and action to cancel (assuming 2 means cancel) | |||
| data = {"document_ids": [self.id], "run": 2} | |||
| # Send a POST request to the specified parsing status endpoint to cancel parsing | |||
| res = self.post(f'/doc/run', data) | |||
| # Check the server response status code | |||
| if res.status_code != 200: | |||
| print("Failed to cancel parsing. Server response:", res.text) | |||
| else: | |||
| print("Parsing cancelled successfully.") | |||
| except Exception as e: | |||
| print(f"Error occurred during async parsing cancellation: {str(e)}") | |||
| raise | |||
| def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None): | |||
| """ | |||
| List all chunks associated with this document by calling the external API. | |||
| Args: | |||
| page (int): The page number to retrieve (default 1). | |||
| size (int): The number of chunks per page (default 30). | |||
| keywords (str): Keywords for searching specific chunks (default ""). | |||
| available_int (int): Filter for available chunks (optional). | |||
| Returns: | |||
| list: A list of chunks returned from the API. | |||
| """ | |||
| data = { | |||
| "document_id": self.id, | |||
| "page": page, | |||
| "size": size, | |||
| "keywords": keywords, | |||
| "offset":offset, | |||
| "limit":limit | |||
| } | |||
| if available_int is not None: | |||
| data["available_int"] = available_int | |||
| res = self.post(f'/doc/chunk/list', data) | |||
| if res.status_code == 200: | |||
| res_data = res.json() | |||
| if res_data.get("retmsg") == "success": | |||
| chunks=[] | |||
| for chunk_data in res_data["data"].get("chunks", []): | |||
| chunk=Chunk(self.rag,chunk_data) | |||
| chunks.append(chunk) | |||
| return chunks | |||
| else: | |||
| raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}") | |||
| else: | |||
| raise Exception(f"API request failed with status code {res.status_code}") | |||
| def add_chunk(self, content: str): | |||
| res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content}) | |||
| if res.status_code == 200: | |||
| res_data = res.json().get("data") | |||
| chunk_data = res_data.get("chunk") | |||
| return Chunk(self.rag,chunk_data) | |||
| else: | |||
| raise Exception(f"Failed to add chunk: {res.status_code} {res.text}") | |||
| res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content":content}) | |||
| res = res.json() | |||
| if res.get("code") == 0: | |||
| return Chunk(self.rag,res["data"].get("chunk")) | |||
| raise Exception(res.get("message")) | |||
| def delete_chunks(self,ids:List[str]): | |||
| res = self.rm(f"dataset/{self.knowledgebase_id}/document/{self.id}/chunk",{"ids":ids}) | |||
| res = res.json() | |||
| if res.get("code")!=0: | |||
| raise Exception(res.get("message")) | |||
| @@ -15,8 +15,8 @@ class Session(Base): | |||
| for message in self.messages: | |||
| if "reference" in message: | |||
| message.pop("reference") | |||
| res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion", | |||
| {"question": question, "stream": True}, stream=stream) | |||
| res = self.post(f"/chat/{self.chat_id}/completion", | |||
| {"question": question, "stream": True,"session_id":self.id}, stream=stream) | |||
| for line in res.iter_lines(): | |||
| line = line.decode("utf-8") | |||
| if line.startswith("{"): | |||
| @@ -82,3 +82,4 @@ class Chunk(Base): | |||
| self.term_similarity = None | |||
| self.positions = None | |||
| super().__init__(rag, res_dict) | |||
| @@ -158,105 +158,30 @@ class RAGFlow: | |||
| raise Exception(res["message"]) | |||
| def async_parse_documents(self, doc_ids): | |||
| """ | |||
| Asynchronously start parsing multiple documents without waiting for completion. | |||
| :param doc_ids: A list containing multiple document IDs. | |||
| """ | |||
| try: | |||
| if not doc_ids or not isinstance(doc_ids, list): | |||
| raise ValueError("doc_ids must be a non-empty list of document IDs") | |||
| data = {"document_ids": doc_ids, "run": 1} | |||
| res = self.post(f'/doc/run', data) | |||
| if res.status_code != 200: | |||
| raise Exception(f"Failed to start async parsing for documents: {res.text}") | |||
| print(f"Async parsing started successfully for documents: {doc_ids}") | |||
| except Exception as e: | |||
| print(f"Error occurred during async parsing for documents: {str(e)}") | |||
| raise | |||
| def async_cancel_parse_documents(self, doc_ids): | |||
| """ | |||
| Cancel the asynchronous parsing of multiple documents. | |||
| :param doc_ids: A list containing multiple document IDs. | |||
| """ | |||
| try: | |||
| if not doc_ids or not isinstance(doc_ids, list): | |||
| raise ValueError("doc_ids must be a non-empty list of document IDs") | |||
| data = {"document_ids": doc_ids, "run": 2} | |||
| res = self.post(f'/doc/run', data) | |||
| if res.status_code != 200: | |||
| raise Exception(f"Failed to cancel async parsing for documents: {res.text}") | |||
| print(f"Async parsing canceled successfully for documents: {doc_ids}") | |||
| except Exception as e: | |||
| print(f"Error occurred during canceling parsing for documents: {str(e)}") | |||
| raise | |||
| def retrieval(self, | |||
| question, | |||
| datasets=None, | |||
| documents=None, | |||
| offset=0, | |||
| limit=6, | |||
| similarity_threshold=0.1, | |||
| vector_similarity_weight=0.3, | |||
| top_k=1024): | |||
| """ | |||
| Perform document retrieval based on the given parameters. | |||
| :param question: The query question. | |||
| :param datasets: A list of datasets (optional, as documents may be provided directly). | |||
| :param documents: A list of documents (if specific documents are provided). | |||
| :param offset: Offset for the retrieval results. | |||
| :param limit: Maximum number of retrieval results. | |||
| :param similarity_threshold: Similarity threshold. | |||
| :param vector_similarity_weight: Weight of vector similarity. | |||
| :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking). | |||
| Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API. | |||
| """ | |||
| try: | |||
| data = { | |||
| "question": question, | |||
| "datasets": datasets if datasets is not None else [], | |||
| "documents": [doc.id if hasattr(doc, 'id') else doc for doc in | |||
| documents] if documents is not None else [], | |||
| def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,): | |||
| data_params = { | |||
| "offset": offset, | |||
| "limit": limit, | |||
| "similarity_threshold": similarity_threshold, | |||
| "vector_similarity_weight": vector_similarity_weight, | |||
| "top_k": top_k, | |||
| "knowledgebase_id": datasets, | |||
| "rerank_id":rerank_id, | |||
| "keyword":keyword | |||
| } | |||
| data_json ={ | |||
| "question": question, | |||
| "datasets": datasets, | |||
| "documents": documents | |||
| } | |||
| # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) | |||
| res = self.post(f'/doc/retrieval_test', data) | |||
| # Check the response status code | |||
| if res.status_code == 200: | |||
| res_data = res.json() | |||
| if res_data.get("retmsg") == "success": | |||
| chunks = [] | |||
| for chunk_data in res_data["data"].get("chunks", []): | |||
| chunk = Chunk(self, chunk_data) | |||
| chunks.append(chunk) | |||
| return chunks | |||
| else: | |||
| raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}") | |||
| else: | |||
| raise Exception(f"API request failed with status code {res.status_code}") | |||
| except Exception as e: | |||
| print(f"An error occurred during retrieval: {e}") | |||
| raise | |||
| res = self.get(f'/retrieval', data_params,data_json) | |||
| res = res.json() | |||
| if res.get("code") ==0: | |||
| chunks=[] | |||
| for chunk_data in res["data"].get("chunks"): | |||
| chunk=Chunk(self,chunk_data) | |||
| chunks.append(chunk) | |||
| return chunks | |||
| raise Exception(res.get("message")) | |||
| @@ -63,17 +63,13 @@ class TestDocument(TestSdk): | |||
| # Check if the retrieved document is of type Document | |||
| if isinstance(doc, Document): | |||
| # Download the document content and save it to a file | |||
| try: | |||
| with open("ragflow.txt", "wb+") as file: | |||
| file.write(doc.download()) | |||
| # Print the document object for debugging | |||
| print(doc) | |||
| # Assert that the download was successful | |||
| assert True, "Document downloaded successfully." | |||
| except Exception as e: | |||
| # If an error occurs, raise an assertion error | |||
| assert False, f"Failed to download document, error: {str(e)}" | |||
| with open("./ragflow.txt", "wb+") as file: | |||
| file.write(doc.download()) | |||
| # Print the document object for debugging | |||
| print(doc) | |||
| # Assert that the download was successful | |||
| assert True, f"Failed to download document, error: {doc}" | |||
| else: | |||
| # If the document retrieval fails, assert failure | |||
| assert False, f"Failed to get document, error: {doc}" | |||
| @@ -100,7 +96,7 @@ class TestDocument(TestSdk): | |||
| blob2 = b"Sample document content for ingestion test222." | |||
| list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}] | |||
| ds.upload_documents(list_1) | |||
| for d in ds.list_docs(keywords="test", offset=0, limit=12): | |||
| for d in ds.list_documents(keywords="test", offset=0, limit=12): | |||
| assert isinstance(d, Document), "Failed to upload documents" | |||
| def test_delete_documents_in_dataset_with_success(self): | |||
| @@ -123,16 +119,11 @@ class TestDocument(TestSdk): | |||
| blob1 = b"Sample document content for ingestion test333." | |||
| name2 = "Test Document444.txt" | |||
| blob2 = b"Sample document content for ingestion test444." | |||
| name3 = 'test.txt' | |||
| path = 'test_data/test.txt' | |||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||
| rag.create_document(ds, name=name1, blob=blob1) | |||
| rag.create_document(ds, name=name2, blob=blob2) | |||
| for d in ds.list_docs(keywords="document", offset=0, limit=12): | |||
| ds.upload_documents([{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]) | |||
| for d in ds.list_documents(keywords="document", offset=0, limit=12): | |||
| assert isinstance(d, Document) | |||
| d.delete() | |||
| print(d) | |||
| remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12) | |||
| ds.delete_documents([d.id]) | |||
| remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12) | |||
| assert len(remaining_docs) == 0, "Documents were not properly deleted." | |||
| def test_parse_and_cancel_document(self): | |||
| @@ -144,16 +135,15 @@ class TestDocument(TestSdk): | |||
| # Define the document name and path | |||
| name3 = 'westworld.pdf' | |||
| path = 'test_data/westworld.pdf' | |||
| path = './test_data/westworld.pdf' | |||
| # Create a document in the dataset using the file path | |||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||
| ds.upload_documents({"name":name3, "blob":open(path, "rb").read()}) | |||
| # Retrieve the document by name | |||
| doc = rag.get_document(name="westworld.pdf") | |||
| # Initiate asynchronous parsing | |||
| doc.async_parse() | |||
| doc = rag.list_documents(name="westworld.pdf") | |||
| doc = doc[0] | |||
| ds.async_parse_documents(document_ids=[doc.id]) | |||
| # Print message to confirm asynchronous parsing has been initiated | |||
| print("Async parsing initiated") | |||