### What problem does this PR solve?

Refactor Chunk API #2846

### Type of change

- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
```python
    if informs:
        e, file = FileService.get_by_id(informs[0].file_id)
        FileService.update_by_id(file.id, {"name": req["name"]})
    if "parser_method" in req:
        if doc.parser_id.lower() == req["parser_method"].lower():
            if "parser_config" in req:
                if req["parser_config"] == doc.parser_config:
                    return get_result(retcode=RetCode.SUCCESS)
            else:
                return get_result(retcode=RetCode.SUCCESS)
        if doc.type == FileType.VISUAL or re.search(
                r"\.(ppt|pptx|pages)$", doc.name):
            return get_error_data_result(retmsg="Tenant not found!")
        ELASTICSEARCH.deleteByQuery(
            Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
    if "parser_config" in req:
        DocumentService.update_parser_config(doc.id, req["parser_config"])
    return get_result()


    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        if not DocumentService.query(id=id, kb_id=dataset_id):
            return get_error_data_result(retmsg=f"You don't own the document {id}.")


    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(retmsg=f"You don't own the document {id}.")
        if doc[0].progress == 100.0 or doc[0].progress == 0.0:
            return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
        info = {"run": "2", "progress": 0}
        DocumentService.update_by_id(id, info)
        # if str(req["run"]) == TaskStatus.CANCEL.value:
```
```python
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
@token_required
def list_chunks(tenant_id, dataset_id, document_id):
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    page = int(req.get("offset", 1))
    size = int(req.get("limit", 30))
    question = req.get("keywords", "")
    query = {
        "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
    }
    if "available_int" in req:
        query["available_int"] = int(req["available_int"])
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
    origin_chunks = []
    sign = 0
    for id in sres.ids:
        d = {
            "chunk_id": id,
            "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
                id].get(
                "content_with_weight", ""),
            "doc_id": sres.field[id]["doc_id"],
            "docnm_kwd": sres.field[id]["docnm_kwd"],
            "important_kwd": sres.field[id].get("important_kwd", []),
            "img_id": sres.field[id].get("img_id", ""),
            "available_int": sres.field[id].get("available_int", 1),
            "positions": sres.field[id].get("position_int", "").split("\t")
        }
        if len(d["positions"]) % 5 == 0:
            poss = []
            for i in range(0, len(d["positions"]), 5):
                poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
                             float(d["positions"][i + 3]), float(d["positions"][i + 4])])
            d["positions"] = poss
        origin_chunks.append(d)
        if req.get("id"):
            if req.get("id") == id:
                origin_chunks.clear()
                origin_chunks.append(d)
                sign = 1
                break
    if req.get("id"):
        if sign == 0:
            return get_error_data_result(f"Can't find this chunk {req.get('id')}")
    # rename keys
    for chunk in origin_chunks:
        key_mapping = {
            "chunk_id": "id",
            "content_with_weight": "content",
            "doc_id": "document_id",
            "important_kwd": "important_keywords",
            "img_id": "image_id",
        }
        renamed_chunk = {}
        for key, value in chunk.items():
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
        res["chunks"].append(renamed_chunk)
    return get_result(data=res)
```
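A client-side sketch of the listing endpoint above (illustrative only, assuming the `requests` package and placeholder address, token, and IDs); the query string carries the `keywords`, `offset`, `limit`, and optional `id` values the handler reads:

```python
import requests

# Placeholders; replace with your own values.
BASE = "http://127.0.0.1:9380/api/v1"
HEADERS = {"Authorization": "Bearer <YOUR_ACCESS_TOKEN>"}

url = f"{BASE}/dataset/<dataset_id>/document/<document_id>/chunk"
params = {"keywords": "ragflow", "offset": 1, "limit": 30}  # optionally also "id": "<chunk_id>"
data = requests.get(url, headers=HEADERS, params=params).json()["data"]
print(data["total"])
for chunk in data["chunks"]:
    print(chunk["id"], chunk["content"][:40])
```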
```python
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
    req = request.json
    if not req.get("content"):
        return get_error_data_result(retmsg="`content` is required")
    if "important_keywords" in req:
        if type(req["important_keywords"]) != list:
            return get_error_data_result("`important_keywords` is required to be a list")
    md5 = hashlib.md5()
    md5.update((req["content"] + document_id).encode("utf-8"))
    d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
         "content_with_weight": req["content"]}
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["important_kwd"] = req.get("important_keywords", [])
    d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
    d["kb_id"] = [doc.kb_id]


    req = request.json
    if not req.get("chunk_ids"):
        return get_error_data_result("`chunk_ids` is required")
    query = {
        "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    for chunk_id in req.get("chunk_ids"):
        if chunk_id not in sres.ids:
            return get_error_data_result(f"Chunk {chunk_id} not found")
    if not ELASTICSEARCH.deleteByQuery(
            Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
        return get_error_data_result(retmsg="Index updating failure")
```
```python
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
@token_required
def set(tenant_id, dataset_id, document_id, chunk_id):
    try:
        res = ELASTICSEARCH.get(
            chunk_id, search.index_name(
                tenant_id))
        if not res.get("found"):
            return get_error_data_result(f"Chunk {chunk_id} not found")
    except Exception as e:
        return get_error_data_result(f"Can't find this chunk {chunk_id}")
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
    doc = doc[0]
    query = {
        "doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
    }
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    if chunk_id not in sres.ids:
        return get_error_data_result(f"You don't own the chunk {chunk_id}")
    req = request.json
    content = res["_source"].get("content_with_weight")
    d = {
        "id": chunk_id,
        "content_with_weight": req.get("content", content)}
    d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if "important_keywords" in req:
        if type(req["important_keywords"]) != list:
            return get_error_data_result("`important_keywords` is required to be a list")
        d["important_kwd"] = req.get("important_keywords")
        d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
    if "available" in req:
        d["available_int"] = req["available"]
    embd_id = DocumentService.get_embd_id(document_id)
    arr = [
        t for t in re.split(
            r"[\n\t]",
            d["content_with_weight"]) if len(t) > 1]
    if len(arr) != 2:
        return get_error_data_result(
            retmsg="Q&A must be separated by TAB/ENTER key.")
    d = beAdoc(d, arr[0], arr[1], not any(
        [rag_tokenizer.is_chinese(t) for t in q + a]))
    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    ELASTICSEARCH.upsert([d], search.index_name(tenant_id))


    for id in kb_id:
        if not KnowledgebaseService.query(id=id, tenant_id=tenant_id):
            return get_error_data_result(f"You don't own the dataset {id}.")
    if "question" not in req:
        return get_error_data_result("`question` is required.")
    page = int(req.get("offset", 1))
    size = int(req.get("limit", 30))
```
```python
from api.utils.api_utils import get_error_data_result
from api.utils.api_utils import get_result, token_required


@manager.route('/chat/<chat_id>/session', methods=['POST'])
@token_required
def create(tenant_id, chat_id):
    req = request.json
    req["dialog_id"] = chat_id
    dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)

    del conv["reference"]
    return get_result(data=conv)


@manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
@token_required
def update(tenant_id, chat_id, session_id):
    req = request.json
    req["dialog_id"] = chat_id
    conv_id = session_id
    conv = ConversationService.query(id=conv_id, dialog_id=chat_id)
    if not conv:
        return get_error_data_result(retmsg="Session does not exist")
    if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
    return get_result()


@manager.route('/chat/<chat_id>/completion', methods=['POST'])
@token_required
def completion(tenant_id, chat_id):
    req = request.json
    # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
    #     {"role": "user", "content": "上海有吗?"}
    # ]}
    if not req.get("session_id"):
        conv = {
            "id": get_uuid(),
            "dialog_id": chat_id,
            "name": req.get("name", "New session"),
            "message": [{"role": "assistant", "content": "Hi! I am your assistant,can I help you?"}]
        }
        if not conv.get("name"):
            return get_error_data_result(retmsg="Name can not be empty.")
        ConversationService.save(**conv)
        e, conv = ConversationService.get_by_id(conv["id"])
        session_id = conv.id
    else:
        session_id = req.get("session_id")
    if not req.get("question"):
        return get_error_data_result(retmsg="Please input your question.")
    conv = ConversationService.query(id=session_id, dialog_id=chat_id)
    if not conv:
        return get_error_data_result(retmsg="Session does not exist")
    conv = conv[0]

    conv.message[-1] = {"role": "assistant", "content": ans["answer"],
                        "id": message_id, "prompt": ans.get("prompt", "")}
    ans["id"] = message_id
    ans["session_id"] = session_id

    def stream():
        nonlocal dia, msg, req, conv
        try:
            for ans in chat(dia, msg, **req):
                fillin_conv(ans)
                yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
            ConversationService.update_by_id(conv.id, conv.to_dict())
        except Exception as e:
            yield "data:" + json.dumps({"code": 500, "message": str(e),
                                        "data": {"answer": "**ERROR**: " + str(e), "reference": []}},
                                       ensure_ascii=False) + "\n\n"
        yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"

            break
    return get_result(data=answer)


@manager.route('/chat/<chat_id>/session', methods=['GET'])
@token_required
def list(chat_id, tenant_id):
    if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
        return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
    id = request.args.get("id")
    name = request.args.get("name")
    session = ConversationService.query(id=id, name=name, dialog_id=chat_id)
    if not session:
        return get_error_data_result(retmsg="The session doesn't exist")
    page_number = int(request.args.get("page", 1))
        desc = False
    else:
        desc = True
    convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name)
    if not convs:
        return get_result(data=[])
    for conv in convs:
        del conv["reference"]
    return get_result(data=convs)


@manager.route('/chat/<chat_id>/session', methods=["DELETE"])
@token_required
def delete(tenant_id, chat_id):
    if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
        return get_error_data_result(retmsg="You don't own the chat")
    ids = request.json.get("ids")
    if not ids:
        return get_error_data_result(retmsg="`ids` is required in deleting operation")
    for id in ids:
        conv = ConversationService.query(id=id, dialog_id=chat_id)
        if not conv:
            return get_error_data_result(retmsg="The chat doesn't own the session")
        ConversationService.delete_by_id(id)
```
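Since `completion` streams Server-Sent-Events-style frames, each line prefixed with `data:` and carrying a JSON payload (see the `stream()` generator above), a client needs to parse those lines. A minimal sketch with the `requests` package and placeholder address, token, and chat ID:

```python
import json
import requests

# Placeholders; replace with your own values.
BASE = "http://127.0.0.1:9380/api/v1"
HEADERS = {"Content-Type": "application/json",
           "Authorization": "Bearer <YOUR_ACCESS_TOKEN>"}

# Omitting "session_id" makes the handler create a new session.
resp = requests.post(f"{BASE}/chat/<chat_id>/completion", headers=HEADERS,
                     json={"question": "What is RAGFlow?", "stream": True},
                     stream=True)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data:"):
        continue
    payload = json.loads(line[len(b"data:"):])
    if payload.get("data") is True:      # final sentinel frame
        break
    print(payload["data"]["answer"])     # incremental answer; also carries "session_id"
```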
```python
            docs = docs.where(
                fn.LOWER(cls.model.name).contains(keywords.lower())
            )
        count = docs.count()
        if desc:
            docs = docs.order_by(cls.model.getter_by(orderby).desc())
        else:
            docs = docs.order_by(cls.model.getter_by(orderby).asc())
        docs = docs.paginate(page_number, items_per_page)
        return list(docs.dicts()), count
```
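Moving `count = docs.count()` ahead of `paginate()` matters because peewee's `count()` wraps the query it is called on, so counting after pagination reflects at most one page. A minimal, self-contained sketch of the intended ordering, using a hypothetical `Doc` model on an in-memory SQLite database:

```python
from peewee import CharField, Model, SqliteDatabase, fn

db = SqliteDatabase(":memory:")

class Doc(Model):
    name = CharField()

    class Meta:
        database = db

db.create_tables([Doc])
Doc.insert_many([{"name": f"doc_{i}"} for i in range(100)]).execute()

docs = Doc.select().where(fn.LOWER(Doc.name).contains("doc"))
count = docs.count()                              # total matches, taken before paging
page = docs.order_by(Doc.name.asc()).paginate(1, 30)
print(count, len(list(page.dicts())))             # 100 30
```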
## Delete files from a dataset

**DELETE** `/api/v1/dataset/{dataset_id}/document`

Deletes files from a dataset.

### Request

- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document`
- Headers:
  - 'Content-Type: application/json'
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `ids`: List[str]

#### Request example

```bash
curl --request DELETE \
     --url http://{address}/api/v1/dataset/{dataset_id}/document \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "ids": ["id_1", "id_2"]
     }'
```

#### Request parameters

- `"ids"`: (*Body parameter*)
  The IDs of the documents to be deleted.

### Response

The successful response includes a JSON object like the following:

```json
{
    "code": 0
}
```

- `"error_code"`: `integer`
  `0`: The operation succeeds.

The error response includes a JSON object like the following:

```json
{
    "code": 102,
    "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
}
```
## Download a file from a dataset

**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`

Downloads a file from a dataset.

### Request

- Method: GET
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
- Headers:
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Output:
  - '{FILE_NAME}'

#### Request example

```bash
curl --request GET \
     --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --output ./ragflow.txt
```

#### Request parameters

### Response

The successful response includes a text object like the following:

```text
test_2.
```
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `name`: `string`
  - `parser_method`: `string`
  - `parser_config`: `dict`

#### Request example

```bash
curl --request PUT \
     --url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --header 'Content-Type: application/json' \
     --data '{
     "name": "manual.txt",
     "parser_method": "manual",
     "parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}
     }'
```

#### Request parameters

- `"parser_method"`: (*Body parameter*)
  Method used to parse the document.
- `"parser_config"`: (*Body parameter*)
  Configuration object for the parser.
  - If the value is `None`, a dictionary with default values will be generated.
- `"name"`: (*Body parameter*)
  Name or title of the document.

### Response
### Request

- Method: POST
- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `document_ids`: List[str]

#### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
```

#### Request parameters

- `"dataset_id"`: (*Path parameter*)
- `"document_ids"`: (*Body parameter*)
  The IDs of the documents to be parsed.

### Response

The successful response includes a JSON object like the following:

```json
{
    "code": 0
}
```

The error response includes a JSON object like the following:

```json
{
    "code": 102,
    "message": "`document_ids` is required"
}
```
### Request

- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `document_ids`: List[str]

#### Request example

```bash
curl --request DELETE \
     --url http://{address}/api/v1/dataset/{dataset_id}/chunk \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
```

#### Request parameters

- `"dataset_id"`: (*Path parameter*)
- `"document_ids"`: (*Body parameter*)
  The IDs of the documents for which parsing is to be stopped.

### Response

The successful response includes a JSON object like the following:

```json
{
    "code": 0
}
```

The error response includes a JSON object like the following:

```json
{
    "code": 102,
    "message": "`document_ids` is required"
}
```
## Get document chunk list

**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`

Gets the chunk list of a document.

### Request

- Method: GET
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
- Headers:
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

#### Request example

```bash
curl --request GET \
     --url 'http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
```

#### Request parameters

- `"dataset_id"`: (*Path parameter*)
- `"document_id"`: (*Path parameter*)
- `"offset"`: (*Filter parameter*)
  The beginning number of records for paging.
- `"keywords"`: (*Filter parameter*)
  List chunks whose content contains the given keywords.
- `"limit"`: (*Filter parameter*)
  The number of records to return.
- `"id"`: (*Filter parameter*)
  The ID of the chunk to retrieve.

### Response

The successful response includes a JSON object like the following:

```json
{
    "code": 0,
    "data": {
        "chunks": [],
        "doc": {
            "chunk_num": 0,
            "create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
            "create_time": 1727581649216,
            "created_by": "69736c5e723611efb51b0242ac120007",
            "id": "8cb781ec7e1511ef98ac0242ac120006",
            "kb_id": "c7ee74067a2c11efb21c0242ac120006",
            "location": "明天的天气是晴天.txt",
            "name": "明天的天气是晴天.txt",
            "parser_config": {
                "pages": [
                    [
                        1,
                        1000000
                    ]
                ]
            },
            "parser_id": "naive",
            "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
            "process_duation": 1435.37,
            "progress": 0.0370833,
            "progress_msg": "\nTask has been received.",
            "run": "1",
            "size": 24,
            "source_type": "local",
            "status": "1",
            "thumbnail": null,
            "token_num": 0,
            "type": "doc",
            "update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
            "update_time": 1728989266371
        },
        "total": 0
    }
}
```

The error response includes a JSON object like the following:

```json
{
    "code": 102,
    "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
}
```
### Request

- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `chunk_ids`: List[str]

#### Request example

```bash
curl --request DELETE \
     --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "chunk_ids": ["test_1", "test_2"]
     }'
```

#### Request parameters

- `"chunk_ids"`: (*Body parameter*)
  The IDs of the chunks to be deleted.

### Response

Success:

```json
{
    "code": 0
}
```

Error:

```json
{
    "code": 102,
    "message": "`chunk_ids` is required"
}
```
## Update document chunk

**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`

Updates a document chunk.

### Request

- Method: PUT
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `content`: str
  - `important_keywords`: List[str]
  - `available`: int

#### Request example

```bash
curl --request PUT \
     --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "content": "ragflow123",
     "important_keywords": []
     }'
```

#### Request parameters

- `"content"`: (*Body parameter*)
  The main text or information of the chunk.
- `"important_keywords"`: (*Body parameter*)
  A list of key terms or phrases that are significant or central to the chunk's content.
- `"available"`: (*Body parameter*)
  The availability status: `0` means unavailable and `1` means available.

### Response

Success:

```json
{
    "code": 0
}
```

Error:

```json
{
    "code": 102,
    "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
}
```
## Insert document chunks

**POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`

### Request

- Method: POST
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `content`: str
  - `important_keywords`: List[str]

#### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "content": "ragflow content"
     }'
```

#### Request parameters

- `"content"`: (*Body parameter*)
  The main text or information of the chunk.
- `"important_keywords"`: (*Body parameter*)
  Key terms or phrases that are significant or central to the chunk's content.

### Response

Success:

```json
{
    "code": 0,
    "data": {
        "chunk": {
            "content": "ragflow content",
            "create_time": "2024-10-16 08:05:04",
            "create_timestamp": 1729065904.581025,
            "dataset_id": [
                "c7ee74067a2c11efb21c0242ac120006"
            ],
            "document_id": "5c5999ec7be811ef9cab0242ac120005",
            "id": "d78435d142bd5cf6704da62c778795c5",
            "important_keywords": []
        }
    }
}
```

Error:

```json
{
    "code": 102,
    "message": "`content` is required"
}
```
## Dataset retrieval test

**POST** `/api/v1/retrieval`

Retrieval test of a dataset.

### Request

- Method: POST
- URL: `http://{address}/api/v1/retrieval`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `question`: str
  - `datasets`: List[str]
  - `documents`: List[str]
  - `offset`: int
  - `limit`: int
  - `similarity_threshold`: float
  - `vector_similarity_weight`: float
  - `top_k`: int
  - `rerank_id`: string
  - `keyword`: bool
  - `highlight`: bool

#### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/retrieval \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data '{
     "question": "What is advantage of ragflow?",
     "datasets": [
         "b2a62730759d11ef987d0242ac120004"
     ],
     "documents": [
         "77df9ef4759a11ef8bdd0242ac120004"
     ]
     }'
```

#### Request parameters

- `"question"`: (*Body parameter*)
  The user's question or search keywords. Defaults to `""`.
- `"datasets"`: (*Body parameter*)
  The scope of datasets. Defaults to `None`.
- `"documents"`: (*Body parameter*)
  The scope of documents; `None` means no limitation. Defaults to `None`.
- `"offset"`: (*Body parameter*)
  The beginning point of retrieved records. Defaults to `1`.
- `"limit"`: (*Body parameter*)
  The maximum number of records to return. Defaults to `30`.
- `"similarity_threshold"`: (*Body parameter*)
  The minimum similarity score. Defaults to `0.2`.
- `"vector_similarity_weight"`: (*Body parameter*)
  The weight of vector cosine similarity; `1 - x` is the term similarity weight. Defaults to `0.3`.
- `"top_k"`: (*Body parameter*)
  The number of records engaged in vector cosine computation. Defaults to `1024`.
- `"rerank_id"`: (*Body parameter*)
  The ID of the rerank model. Defaults to `None`.
- `"keyword"`: (*Body parameter*)
  Whether keyword-based matching is enabled. Defaults to `False`.
- `"highlight"`: (*Body parameter*)
  Whether to highlight matched terms in the results. Defaults to `False`.

### Response

Success:

```json
{
    "code": 0,
    "data": {
        "chunks": [
            {
                "content": "ragflow content",
                "content_ltks": "ragflow content",
                "document_id": "5c5999ec7be811ef9cab0242ac120005",
                "document_keyword": "1.txt",
                "highlight": "<em>ragflow</em> content",
                "id": "d78435d142bd5cf6704da62c778795c5",
                "img_id": "",
                "important_keywords": [
                    ""
                ],
                "kb_id": "c7ee74067a2c11efb21c0242ac120006",
                "positions": [
                    ""
                ],
                "similarity": 0.9669436601210759,
                "term_similarity": 1.0,
                "vector_similarity": 0.8898122004035864
            }
        ],
        "doc_aggs": [
            {
                "count": 1,
                "doc_id": "5c5999ec7be811ef9cab0242ac120005",
                "doc_name": "1.txt"
            }
        ],
        "total": 1
    }
}
```

Error:

```json
{
    "code": 102,
    "message": "`datasets` is required."
}
```
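The same retrieval test issued from Python instead of curl; a minimal sketch assuming the `requests` package, with placeholder address, token, and dataset ID (the tuning fields shown are the documented defaults):

```python
import requests

# Placeholders; replace with your own values.
url = "http://127.0.0.1:9380/api/v1/retrieval"
headers = {"Content-Type": "application/json",
           "Authorization": "Bearer <YOUR_ACCESS_TOKEN>"}
body = {
    "question": "What is advantage of ragflow?",
    "datasets": ["<dataset_id>"],
    "similarity_threshold": 0.2,        # documented default
    "vector_similarity_weight": 0.3,    # documented default
    "top_k": 1024,                      # documented default
    "highlight": True,
}
data = requests.post(url, headers=headers, json=body).json()["data"]
for chunk in data["chunks"]:
    print(round(chunk["similarity"], 3), chunk["content"])
```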
## Create chat

**POST** `/api/v1/chat`

## Chat with a chat session

**POST** `/api/v1/chat/{chat_id}/completion`

Chat with a chat session.

### Request

- Method: POST
- URL: `http://{address}/api/v1/chat/{chat_id}/completion`
- Headers:
  - `content-Type: application/json`
  - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
  - `question`: string
  - `stream`: bool
  - `session_id`: str

#### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/chat/{chat_id}/completion \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
     --data-binary '{
```

- `stream`: (*Body Parameter*)
  The approach of streaming text generation. Defaults to `False`.
- `session_id`: (*Body Parameter*)
  The ID of the session. If not provided, a new session will be generated.

### Response

Success
| ## Upload document | ## Upload document | ||||
| ```python | ```python | ||||
| RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool | |||||
| DataSet.upload_documents(document_list: List[dict]) | |||||
| ``` | ``` | ||||
| ### Parameters | ### Parameters | ||||
| #### name | |||||
| #### document_list:`List[dict]` | |||||
| A list composed of dicts containing `name` and `blob`. | |||||
| #### blob | |||||
| ### Returns | |||||
| no return | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| ds = rag.create_dataset(name="kb_1") | |||||
| ds.upload_documents([{"name": "1.txt", "blob": "123"}, ...]) | |||||
| ``` | |||||
| --- | |||||
| ## Update document | |||||
| ```python | |||||
| Document.update(update_message:dict) | |||||
| ``` | |||||
| ### Parameters | |||||
| #### update_message:`dict` | |||||
| Only `name`, `parser_config`, and `parser_method` can be changed. | |||||
| ### Returns | |||||
| no return | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| ds=rag.list_datasets(id='id') | |||||
| ds=ds[0] | |||||
| doc = ds.list_documents(id="wdfxb5t547d") | |||||
| doc = doc[0] | |||||
| doc.update([{"parser_method": "manual"...}]) | |||||
| ``` | |||||
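As a follow-up, a minimal sketch of a combined update of `name` and `parser_config`, assuming `doc` has been fetched as in the example above; the `parser_config` key used here (`chunk_token_num`) is an illustrative assumption and may differ in your deployment.

```python
# Assumes `doc` was obtained as in the example above.
# NOTE: "chunk_token_num" is an assumed, illustrative parser_config key.
doc.update({
    "name": "renamed_1.txt",
    "parser_config": {"chunk_token_num": 128},
})
```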
| --- | |||||
| ## Download document | |||||
| ```python | |||||
| Document.download() -> bytes | |||||
| ``` | |||||
| ### Returns | ### Returns | ||||
| bytes of the document. | |||||
| ### Examples | ### Examples | ||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| ds=rag.list_datasets(id="id") | |||||
| ds=ds[0] | |||||
| doc = ds.list_documents(id="wdfxb5t547d") | |||||
| doc = doc[0] | |||||
| open("~/ragflow.txt", "wb+").write(doc.download()) | |||||
| print(doc) | |||||
| ``` | |||||
| --- | --- | ||||
| ## Retrieve document | |||||
| ## List documents | |||||
| ```python | ```python | ||||
| RAGFlow.get_document(id:str=None,name:str=None) -> Document | |||||
| DataSet.list_documents(id: str = None, keywords: str = None, offset: int = 0, limit: int = 1024, order_by: str = "create_time", desc: bool = True) -> List[Document] | |||||
| ``` | ``` | ||||
| ### Parameters | ### Parameters | ||||
| #### id: `str`, *Required* | |||||
| #### id: `str` | |||||
| ID of the document to retrieve. | |||||
| The id of the document to be got | |||||
| #### name: `str` | |||||
| #### keywords: `str` | |||||
| List documents whose name has the given keywords. Defaults to `None`. | |||||
| #### offset: `int` | |||||
| The beginning number of records for paging. Defaults to `0`. | |||||
| Name or title of the document. | |||||
| #### limit: `int` | |||||
| Records number to return, -1 means all of them. | |||||
| #### orderby: `str` | |||||
| The field by which the records should be sorted. This specifies the attribute or column used to order the results. | |||||
| #### desc:`bool` | |||||
| A boolean flag indicating whether the sorting should be in descending order. | |||||
| ### Returns | ### Returns | ||||
| List[Document] | |||||
| A document object containing the following attributes: | A document object containing the following attributes: | ||||
| #### id: `str` | #### id: `str` | ||||
| ```python | ```python | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt') | |||||
| print(doc) | |||||
| ``` | |||||
| --- | |||||
| ## Save document settings | |||||
| ```python | |||||
| Document.save() -> bool | |||||
| ``` | |||||
| ### Returns | |||||
| bool | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| doc = rag.get_document(id="wdfxb5t547d") | |||||
| doc.parser_method= "manual" | |||||
| doc.save() | |||||
| ``` | |||||
| --- | |||||
| ## Download document | |||||
| ```python | |||||
| Document.download() -> bytes | |||||
| ``` | |||||
| ### Returns | |||||
| bytes of the document. | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| doc = rag.get_document(id="wdfxb5t547d") | |||||
| open("~/ragflow.txt", "w+").write(doc.download()) | |||||
| print(doc) | |||||
| ``` | |||||
| --- | |||||
| ## List documents | |||||
| ```python | |||||
| Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document] | |||||
| ``` | |||||
| ### Parameters | |||||
| #### keywords: `str` | |||||
| List documents whose name has the given keywords. Defaults to `None`. | |||||
| #### offset: `int` | |||||
| The beginning number of records for paging. Defaults to `0`. | |||||
| #### limit: `int` | |||||
| Records number to return, -1 means all of them. | |||||
| ### Returns | |||||
| List[Document] | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| ds = rag.create_dataset(name="kb_1") | ds = rag.create_dataset(name="kb_1") | ||||
| filename1 = "~/ragflow.txt" | filename1 = "~/ragflow.txt" | ||||
| rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) | |||||
| filename2 = "~/infinity.txt" | |||||
| rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) | |||||
| for d in ds.list_docs(keywords="rag", offset=0, limit=12): | |||||
| blob=open(filename1 , "rb").read() | |||||
| list_files=[{"name":filename1,"blob":blob}] | |||||
| ds.upload_documents(list_files) | |||||
| for d in ds.list_documents(keywords="rag", offset=0, limit=12): | |||||
| print(d) | print(d) | ||||
| ``` | ``` | ||||
| ## Delete documents | ## Delete documents | ||||
| ```python | ```python | ||||
| Document.delete() -> bool | |||||
| DataSet.delete_documents(ids: List[str] = None) | |||||
| ``` | ``` | ||||
| ### Returns | ### Returns | ||||
| bool | |||||
| description: delete success or not | |||||
| no return | |||||
| ### Examples | ### Examples | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| ds = rag.create_dataset(name="kb_1") | |||||
| filename1 = "~/ragflow.txt" | |||||
| rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read()) | |||||
| filename2 = "~/infinity.txt" | |||||
| rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read()) | |||||
| for d in ds.list_docs(keywords="rag", offset=0, limit=12): | |||||
| d.delete() | |||||
| ds = rag.list_datasets(name="kb_1") | |||||
| ds = ds[0] | |||||
| ds.delete_documents(ids=["id_1","id_2"]) | |||||
| ``` | ``` | ||||
| --- | --- | ||||
| ## Parse document | |||||
| ## Parse and stop parsing document | |||||
| ```python | ```python | ||||
| Document.async_parse() -> None | |||||
| RAGFLOW.async_parse_documents() -> None | |||||
| DataSet.async_parse_documents(document_ids:List[str]) -> None | |||||
| DataSet.async_cancel_parse_documents(document_ids:List[str])-> None | |||||
| ``` | ``` | ||||
| ### Parameters | ### Parameters | ||||
| #### document_ids:`List[str]` | |||||
| The IDs of the documents to be parsed. | |||||
| ???????????????????????????????????????????????????? | ???????????????????????????????????????????????????? | ||||
| ### Returns | ### Returns | ||||
| no return | |||||
| ???????????????????????????????????????????????????? | ???????????????????????????????????????????????????? | ||||
| ### Examples | ### Examples | ||||
| ```python | |||||
| #document parse and cancel | |||||
| rag = RAGFlow(API_KEY, HOST_ADDRESS) | |||||
| ds = rag.create_dataset(name="dataset_name") | |||||
| name3 = 'ai.pdf' | |||||
| path = 'test_data/ai.pdf' | |||||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||||
| doc = rag.get_document(name="ai.pdf") | |||||
| doc.async_parse() | |||||
| print("Async parsing initiated") | |||||
| ``` | |||||
| --- | |||||
| ## Cancel document parsing | |||||
| ```python | |||||
| rag.async_cancel_parse_documents(ids) | |||||
| RAGFLOW.async_cancel_parse_documents()-> None | |||||
| ``` | |||||
| ### Parameters | |||||
| #### ids, `list[]` | |||||
| ### Returns | |||||
| ????????????????????????????????????????????????? | |||||
| ### Examples | |||||
| ```python | ```python | ||||
| #documents parse and cancel | #documents parse and cancel | ||||
| rag = RAGFlow(API_KEY, HOST_ADDRESS) | rag = RAGFlow(API_KEY, HOST_ADDRESS) | ||||
| ds = rag.create_dataset(name="God5") | ds = rag.create_dataset(name="God5") | ||||
| documents = [ | documents = [ | ||||
| {'name': 'test1.txt', 'path': 'test_data/test1.txt'}, | |||||
| {'name': 'test2.txt', 'path': 'test_data/test2.txt'}, | |||||
| {'name': 'test3.txt', 'path': 'test_data/test3.txt'} | |||||
| {'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()}, | |||||
| {'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()}, | |||||
| {'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()} | |||||
| ] | ] | ||||
| # Create documents in bulk | |||||
| for doc_info in documents: | |||||
| with open(doc_info['path'], "rb") as file: | |||||
| created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read()) | |||||
| docs = [rag.get_document(name=doc_info['name']) for doc_info in documents] | |||||
| ids = [doc.id for doc in docs] | |||||
| rag.async_parse_documents(ids) | |||||
| ds.upload_documents(documents) | |||||
| documents=ds.list_documents(keywords="test") | |||||
| ids=[] | |||||
| for document in documents: | |||||
| ids.append(document.id) | |||||
| ds.async_parse_documents(ids) | |||||
| print("Async bulk parsing initiated") | print("Async bulk parsing initiated") | ||||
| for doc in docs: | |||||
| for progress, msg in doc.join(interval=5, timeout=10): | |||||
| print(f"{doc.name}: Progress: {progress}, Message: {msg}") | |||||
| cancel_result = rag.async_cancel_parse_documents(ids) | |||||
| ds.async_cancel_parse_documents(ids) | |||||
| print("Async bulk parsing cancelled") | print("Async bulk parsing cancelled") | ||||
| ``` | ``` | ||||
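Because parsing is asynchronous, callers usually poll the dataset until the documents finish. The sketch below assumes the `Document` objects returned by `list_documents` expose `progress` and `progress_msg` attributes; both names are assumptions for illustration rather than documented SDK fields.

```python
import time

# Poll parsing progress for up to ~5 minutes.
# `progress` and `progress_msg` are assumed attribute names, not documented API.
for _ in range(60):
    for d in ds.list_documents(keywords="test"):
        print(d.id, getattr(d, "progress", None), getattr(d, "progress_msg", ""))
    time.sleep(5)
```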
| --- | |||||
| ## Join document | |||||
| ?????????????????? | |||||
| ## List chunks | |||||
| ```python | ```python | ||||
| Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]] | |||||
| Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id : str = None) -> List[Chunk] | |||||
| ``` | ``` | ||||
| ### Parameters | ### Parameters | ||||
| #### interval: `int` | |||||
| - `keywords`: `str` | |||||
| List chunks that match the given keywords | |||||
| default: `None` | |||||
| Time interval in seconds for progress report. Defaults to `15`. | |||||
| - `offset`: `int` | |||||
| The beginning number of records for paging | |||||
| default: `0` | |||||
| #### timeout: `int` | |||||
| Timeout in seconds. Defaults to `3600`. | |||||
| - `limit`: `int` | |||||
| Records number to return | |||||
| default: `30` | |||||
| - `id`: `str` | |||||
| The ID of the chunk to be retrieved | |||||
| default: `None` | |||||
| ### Returns | ### Returns | ||||
| List[Chunk] | |||||
| iteral[Tuple[float, str]] | |||||
| ### Examples | |||||
| ```python | |||||
| from ragflow import RAGFlow | |||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | |||||
| ds = rag.list_datasets("123") | |||||
| ds = ds[0] | |||||
| ds.async_parse_documents(["wdfxb5t547d"]) | |||||
| for c in doc.list_chunks(keywords="rag", offset=0, limit=12): | |||||
| print(c) | |||||
| ``` | |||||
| ## Add chunk | ## Add chunk | ||||
| ```python | ```python | ||||
| ### Parameters | ### Parameters | ||||
| #### content: `str`, *Required* | #### content: `str`, *Required* | ||||
| Contains the main text or information of the chunk. | |||||
| #### important_keywords: `List[str]` | |||||
| List the key terms or phrases that are significant or central to the chunk's content. | |||||
| ### Returns | ### Returns | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| doc = rag.get_document(id="wdfxb5t547d") | |||||
| ds = rag.list_datasets(id="123") | |||||
| ds = ds[0] | |||||
| doc = ds.list_documents(id="wdfxb5t547d") | |||||
| doc = doc[0] | |||||
| chunk = doc.add_chunk(content="xxxxxxx") | chunk = doc.add_chunk(content="xxxxxxx") | ||||
| ``` | ``` | ||||
| ## Delete chunk | ## Delete chunk | ||||
| ```python | ```python | ||||
| Chunk.delete() -> bool | |||||
| Document.delete_chunks(chunk_ids: List[str]) | |||||
| ``` | ``` | ||||
| ### Parameters | |||||
| #### chunk_ids: `List[str]` | |||||
| The list of IDs of the chunks to delete. | |||||
| ### Returns | ### Returns | ||||
| bool | |||||
| no return | |||||
| ### Examples | ### Examples | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| doc = rag.get_document(id="wdfxb5t547d") | |||||
| ds = rag.list_datasets(id="123") | |||||
| ds = ds[0] | |||||
| doc = ds.list_documents(id="wdfxb5t547d") | |||||
| doc = doc[0] | |||||
| chunk = doc.add_chunk(content="xxxxxxx") | chunk = doc.add_chunk(content="xxxxxxx") | ||||
| chunk.delete() | |||||
| doc.delete_chunks(["id_1","id_2"]) | |||||
| ``` | ``` | ||||
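Building on the two methods above, a short sketch that removes every chunk of a document by first listing them and then deleting by ID; it simply chains the documented `list_chunks` and `delete_chunks` calls and assumes `doc` was fetched as in the example above.

```python
# Assumes `doc` was obtained as in the example above.
chunk_ids = [c.id for c in doc.list_chunks(keywords="", offset=0, limit=-1)]
if chunk_ids:
    doc.delete_chunks(chunk_ids)
```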
| --- | --- | ||||
| ## Save chunk contents | |||||
| ## Update chunk | |||||
| ```python | ```python | ||||
| Chunk.save() -> bool | |||||
| Chunk.update(update_message: dict) | |||||
| ``` | ``` | ||||
| ### Parameters | |||||
| - `content`: `str` | |||||
| Contains the main text or information of the chunk. | |||||
| - `important_keywords`: `List[str]` | |||||
| The key terms or phrases that are significant or central to the chunk's content. | |||||
| - `available`: `int` | |||||
| The availability status: `0` means unavailable and `1` means available. | |||||
| ### Returns | ### Returns | ||||
| bool | |||||
| no return | |||||
| ### Examples | ### Examples | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| doc = rag.get_document(id="wdfxb5t547d") | |||||
| ds = rag.list_datasets(id="123") | |||||
| ds = ds[0] | |||||
| doc = ds.list_documents(id="wdfxb5t547d") | |||||
| doc = doc[0] | |||||
| chunk = doc.add_chunk(content="xxxxxxx") | chunk = doc.add_chunk(content="xxxxxxx") | ||||
| chunk.content = "sdfx" | |||||
| chunk.save() | |||||
| chunk.update({"content":"sdfx...}) | |||||
| ``` | ``` | ||||
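As listed in the parameters above, the availability flag can be toggled through the same call; a minimal sketch, assuming `chunk` was created as in the example above.

```python
# Mark the chunk as unavailable (0) without changing its content.
chunk.update({"available": 0})
```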
| --- | --- | ||||
| ## Retrieval | ## Retrieval | ||||
| ```python | ```python | ||||
| RAGFlow.retrieval(question:str, datasets:List[Dataset], document=List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk] | |||||
| RAGFlow.retrieve(question: str = "", datasets: List[str] = None, documents: List[str] = None, offset: int = 1, limit: int = 30, similarity_threshold: float = 0.2, vector_similarity_weight: float = 0.3, top_k: int = 1024, rerank_id: str = None, keyword: bool = False, highlight: bool = False) -> List[Chunk] | |||||
| ``` | ``` | ||||
| ### Parameters | ### Parameters | ||||
| Number of records engaged in vector cosine computation. Defaults to `1024`. | Number of records engaged in vector cosine computation. Defaults to `1024`. | ||||
| #### rerank_id: `str` | |||||
| ID of the rerank model. Defaults to `None`. | |||||
| #### keyword: `bool` | |||||
| Indicates whether keyword-based matching is enabled (`True`) or disabled (`False`). | |||||
| #### highlight: `bool` | |||||
| Specifies whether to highlight matched terms in the results (`True`) or not (`False`). | |||||
| ### Returns | ### Returns | ||||
| List[Chunk] | List[Chunk] | ||||
| from ragflow import RAGFlow | from ragflow import RAGFlow | ||||
| rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") | ||||
| ds = rag.get_dataset(name="ragflow") | |||||
| ds = rag.list_datasets(name="ragflow") | |||||
| ds = ds[0] | |||||
| name = 'ragflow_test.txt' | name = 'ragflow_test.txt' | ||||
| path = 'test_data/ragflow_test.txt' | |||||
| path = './test_data/ragflow_test.txt' | |||||
| rag.create_document(ds, name=name, blob=open(path, "rb").read()) | rag.create_document(ds, name=name, blob=open(path, "rb").read()) | ||||
| doc = rag.get_document(name=name) | |||||
| doc.async_parse() | |||||
| # Wait for parsing to complete | |||||
| for progress, msg in doc.join(interval=5, timeout=30): | |||||
| print(progress, msg) | |||||
| for c in rag.retrieval(question="What's ragflow?", | |||||
| datasets=[ds], documents=[doc], | |||||
| offset=0, limit=6, similarity_threshold=0.1, | |||||
| doc = ds.list_documents(name=name) | |||||
| doc = doc[0] | |||||
| ds.async_parse_documents([doc.id]) | |||||
| for c in rag.retrieve(question="What's ragflow?", | |||||
| datasets=[ds.id], documents=[doc.id], | |||||
| offset=1, limit=30, similarity_threshold=0.2, | |||||
| vector_similarity_weight=0.3, | vector_similarity_weight=0.3, | ||||
| top_k=1024 | top_k=1024 | ||||
| ): | ): |
| res_dict.pop(k) | res_dict.pop(k) | ||||
| super().__init__(rag, res_dict) | super().__init__(rag, res_dict) | ||||
| def delete(self) -> bool: | |||||
| """ | |||||
| Delete the chunk in the document. | |||||
| """ | |||||
| res = self.post('/doc/chunk/rm', | |||||
| {"document_id": self.document_id, 'chunk_ids': [self.id]}) | |||||
| res = res.json() | |||||
| if res.get("retmsg") == "success": | |||||
| return True | |||||
| raise Exception(res["retmsg"]) | |||||
| def save(self) -> bool: | |||||
| """ | |||||
| Save the document details to the server. | |||||
| """ | |||||
| res = self.post('/doc/chunk/set', | |||||
| {"chunk_id": self.id, | |||||
| "knowledgebase_id": self.knowledgebase_id, | |||||
| "name": self.document_name, | |||||
| "content": self.content, | |||||
| "important_keywords": self.important_keywords, | |||||
| "document_id": self.document_id, | |||||
| "available": self.available, | |||||
| }) | |||||
| def update(self,update_message:dict): | |||||
| res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}",update_message) | |||||
| res = res.json() | res = res.json() | ||||
| if res.get("retmsg") == "success": | |||||
| return True | |||||
| raise Exception(res["retmsg"]) | |||||
| if res.get("code") != 0 : | |||||
| raise Exception(res["message"]) | |||||
| if res.get("code") != 0: | if res.get("code") != 0: | ||||
| raise Exception(res["message"]) | raise Exception(res["message"]) | ||||
| def async_parse_documents(self,document_ids): | |||||
| res = self.post(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) | |||||
| res = res.json() | |||||
| if res.get("code") != 0: | |||||
| raise Exception(res.get("message")) | |||||
| def async_cancel_parse_documents(self,document_ids): | |||||
| res = self.rm(f"/dataset/{self.id}/chunk",{"document_ids":document_ids}) | |||||
| res = res.json() | |||||
| if res.get("code") != 0: | |||||
| raise Exception(res.get("message")) |
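For readers following the HTTP layer, the two methods above wrap a dataset's chunk endpoint. Below is a rough sketch using `requests`; the `/api/v1` base path, the bearer-token header, and the use of HTTP DELETE for the cancel call (the SDK's `rm` helper) are assumptions drawn from the surrounding code and documentation, not a published spec.

```python
import requests

# Assumptions: /api/v1 base path, bearer-token auth, DELETE for cancel.
# All placeholders must be replaced with real values.
base = "http://{address}/api/v1"
headers = {"Authorization": "Bearer {YOUR_ACCESS_TOKEN}"}
payload = {"document_ids": ["id_1", "id_2"]}

# Start parsing the documents of a dataset.
requests.post(f"{base}/dataset/{{dataset_id}}/chunk", headers=headers, json=payload)

# Cancel parsing for the same documents.
requests.delete(f"{base}/dataset/{{dataset_id}}/chunk", headers=headers, json=payload)
```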
| import time | import time | ||||
| from PIL.ImageFile import raise_oserror | |||||
| from .base import Base | from .base import Base | ||||
| from .chunk import Chunk | from .chunk import Chunk | ||||
| from typing import List | |||||
| class Document(Base): | class Document(Base): | ||||
| res_dict.pop(k) | res_dict.pop(k) | ||||
| super().__init__(rag, res_dict) | super().__init__(rag, res_dict) | ||||
| def update(self,update_message:dict) -> bool: | |||||
| """ | |||||
| Save the document details to the server. | |||||
| """ | |||||
| res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message) | |||||
| res = res.json() | |||||
| if res.get("code") != 0: | |||||
| raise Exception(res["message"]) | |||||
| def delete(self) -> bool: | |||||
| """ | |||||
| Delete the document from the server. | |||||
| """ | |||||
| res = self.rm('/doc/delete', | |||||
| {"document_id": self.id}) | |||||
| def list_chunks(self,offset=0, limit=30, keywords="", id:str=None): | |||||
| data={"document_id": self.id,"keywords": keywords,"offset":offset,"limit":limit,"id":id} | |||||
| res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data) | |||||
| res = res.json() | res = res.json() | ||||
| if res.get("retmsg") == "success": | |||||
| return True | |||||
| raise Exception(res["retmsg"]) | |||||
| def download(self) -> bytes: | |||||
| """ | |||||
| Download the document content from the server using the Flask API. | |||||
| :return: The downloaded document content in bytes. | |||||
| """ | |||||
| # Construct the URL for the API request using the document ID and knowledge base ID | |||||
| res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}") | |||||
| # Check the response status code to ensure the request was successful | |||||
| if res.status_code == 200: | |||||
| # Return the document content as bytes | |||||
| return res.content | |||||
| else: | |||||
| # Handle the error and raise an exception | |||||
| raise Exception( | |||||
| f"Failed to download document. Server responded with: {res.status_code}, {res.text}" | |||||
| ) | |||||
| def async_parse(self): | |||||
| """ | |||||
| Initiate document parsing asynchronously without waiting for completion. | |||||
| """ | |||||
| try: | |||||
| # Construct request data including document ID and run status (assuming 1 means to run) | |||||
| data = {"document_ids": [self.id], "run": 1} | |||||
| # Send a POST request to the specified parsing status endpoint to start parsing | |||||
| res = self.post(f'/doc/run', data) | |||||
| # Check the server response status code | |||||
| if res.status_code != 200: | |||||
| raise Exception(f"Failed to start async parsing: {res.text}") | |||||
| print("Async parsing started successfully.") | |||||
| except Exception as e: | |||||
| # Catch and handle exceptions | |||||
| print(f"Error occurred during async parsing: {str(e)}") | |||||
| raise | |||||
| import time | |||||
| def join(self, interval=5, timeout=3600): | |||||
| """ | |||||
| Wait for the asynchronous parsing to complete and yield parsing progress periodically. | |||||
| :param interval: The time interval (in seconds) for progress reports. | |||||
| :param timeout: The timeout (in seconds) for the parsing operation. | |||||
| :return: An iterator yielding parsing progress and messages. | |||||
| """ | |||||
| start_time = time.time() | |||||
| while time.time() - start_time < timeout: | |||||
| # Check the parsing status | |||||
| res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]}) | |||||
| res_data = res.json() | |||||
| data = res_data.get("data", []) | |||||
| # Retrieve progress and status message | |||||
| progress = data.get("progress", 0) | |||||
| progress_msg = data.get("status", "") | |||||
| if res.get("code") == 0: | |||||
| chunks=[] | |||||
| for data in res["data"].get("chunks"): | |||||
| chunk = Chunk(self.rag,data) | |||||
| chunks.append(chunk) | |||||
| return chunks | |||||
| raise Exception(res.get("message")) | |||||
| yield progress, progress_msg # Yield progress and message | |||||
| if progress == 100: # Parsing completed | |||||
| break | |||||
| time.sleep(interval) | |||||
| def cancel(self): | |||||
| """ | |||||
| Cancel the parsing task for the document. | |||||
| """ | |||||
| try: | |||||
| # Construct request data, including document ID and action to cancel (assuming 2 means cancel) | |||||
| data = {"document_ids": [self.id], "run": 2} | |||||
| # Send a POST request to the specified parsing status endpoint to cancel parsing | |||||
| res = self.post(f'/doc/run', data) | |||||
| # Check the server response status code | |||||
| if res.status_code != 200: | |||||
| print("Failed to cancel parsing. Server response:", res.text) | |||||
| else: | |||||
| print("Parsing cancelled successfully.") | |||||
| except Exception as e: | |||||
| print(f"Error occurred during async parsing cancellation: {str(e)}") | |||||
| raise | |||||
| def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None): | |||||
| """ | |||||
| List all chunks associated with this document by calling the external API. | |||||
| Args: | |||||
| page (int): The page number to retrieve (default 1). | |||||
| size (int): The number of chunks per page (default 30). | |||||
| keywords (str): Keywords for searching specific chunks (default ""). | |||||
| available_int (int): Filter for available chunks (optional). | |||||
| Returns: | |||||
| list: A list of chunks returned from the API. | |||||
| """ | |||||
| data = { | |||||
| "document_id": self.id, | |||||
| "page": page, | |||||
| "size": size, | |||||
| "keywords": keywords, | |||||
| "offset":offset, | |||||
| "limit":limit | |||||
| } | |||||
| if available_int is not None: | |||||
| data["available_int"] = available_int | |||||
| res = self.post(f'/doc/chunk/list', data) | |||||
| if res.status_code == 200: | |||||
| res_data = res.json() | |||||
| if res_data.get("retmsg") == "success": | |||||
| chunks=[] | |||||
| for chunk_data in res_data["data"].get("chunks", []): | |||||
| chunk=Chunk(self.rag,chunk_data) | |||||
| chunks.append(chunk) | |||||
| return chunks | |||||
| else: | |||||
| raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}") | |||||
| else: | |||||
| raise Exception(f"API request failed with status code {res.status_code}") | |||||
| def add_chunk(self, content: str): | def add_chunk(self, content: str): | ||||
| res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content}) | |||||
| if res.status_code == 200: | |||||
| res_data = res.json().get("data") | |||||
| chunk_data = res_data.get("chunk") | |||||
| return Chunk(self.rag,chunk_data) | |||||
| else: | |||||
| raise Exception(f"Failed to add chunk: {res.status_code} {res.text}") | |||||
| res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content":content}) | |||||
| res = res.json() | |||||
| if res.get("code") == 0: | |||||
| return Chunk(self.rag,res["data"].get("chunk")) | |||||
| raise Exception(res.get("message")) | |||||
| def delete_chunks(self,ids:List[str]): | |||||
| res = self.rm(f"dataset/{self.knowledgebase_id}/document/{self.id}/chunk",{"ids":ids}) | |||||
| res = res.json() | |||||
| if res.get("code")!=0: | |||||
| raise Exception(res.get("message")) |
| for message in self.messages: | for message in self.messages: | ||||
| if "reference" in message: | if "reference" in message: | ||||
| message.pop("reference") | message.pop("reference") | ||||
| res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion", | |||||
| {"question": question, "stream": True}, stream=stream) | |||||
| res = self.post(f"/chat/{self.chat_id}/completion", | |||||
| {"question": question, "stream": True,"session_id":self.id}, stream=stream) | |||||
| for line in res.iter_lines(): | for line in res.iter_lines(): | ||||
| line = line.decode("utf-8") | line = line.decode("utf-8") | ||||
| if line.startswith("{"): | if line.startswith("{"): | ||||
| self.term_similarity = None | self.term_similarity = None | ||||
| self.positions = None | self.positions = None | ||||
| super().__init__(rag, res_dict) | super().__init__(rag, res_dict) | ||||
| raise Exception(res["message"]) | raise Exception(res["message"]) | ||||
| def async_parse_documents(self, doc_ids): | |||||
| """ | |||||
| Asynchronously start parsing multiple documents without waiting for completion. | |||||
| :param doc_ids: A list containing multiple document IDs. | |||||
| """ | |||||
| try: | |||||
| if not doc_ids or not isinstance(doc_ids, list): | |||||
| raise ValueError("doc_ids must be a non-empty list of document IDs") | |||||
| data = {"document_ids": doc_ids, "run": 1} | |||||
| res = self.post(f'/doc/run', data) | |||||
| if res.status_code != 200: | |||||
| raise Exception(f"Failed to start async parsing for documents: {res.text}") | |||||
| print(f"Async parsing started successfully for documents: {doc_ids}") | |||||
| except Exception as e: | |||||
| print(f"Error occurred during async parsing for documents: {str(e)}") | |||||
| raise | |||||
| def async_cancel_parse_documents(self, doc_ids): | |||||
| """ | |||||
| Cancel the asynchronous parsing of multiple documents. | |||||
| :param doc_ids: A list containing multiple document IDs. | |||||
| """ | |||||
| try: | |||||
| if not doc_ids or not isinstance(doc_ids, list): | |||||
| raise ValueError("doc_ids must be a non-empty list of document IDs") | |||||
| data = {"document_ids": doc_ids, "run": 2} | |||||
| res = self.post(f'/doc/run', data) | |||||
| if res.status_code != 200: | |||||
| raise Exception(f"Failed to cancel async parsing for documents: {res.text}") | |||||
| print(f"Async parsing canceled successfully for documents: {doc_ids}") | |||||
| except Exception as e: | |||||
| print(f"Error occurred during canceling parsing for documents: {str(e)}") | |||||
| raise | |||||
| def retrieval(self, | |||||
| question, | |||||
| datasets=None, | |||||
| documents=None, | |||||
| offset=0, | |||||
| limit=6, | |||||
| similarity_threshold=0.1, | |||||
| vector_similarity_weight=0.3, | |||||
| top_k=1024): | |||||
| """ | |||||
| Perform document retrieval based on the given parameters. | |||||
| :param question: The query question. | |||||
| :param datasets: A list of datasets (optional, as documents may be provided directly). | |||||
| :param documents: A list of documents (if specific documents are provided). | |||||
| :param offset: Offset for the retrieval results. | |||||
| :param limit: Maximum number of retrieval results. | |||||
| :param similarity_threshold: Similarity threshold. | |||||
| :param vector_similarity_weight: Weight of vector similarity. | |||||
| :param top_k: Number of top most similar documents to consider (for pre-filtering or ranking). | |||||
| Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API. | |||||
| """ | |||||
| try: | |||||
| data = { | |||||
| "question": question, | |||||
| "datasets": datasets if datasets is not None else [], | |||||
| "documents": [doc.id if hasattr(doc, 'id') else doc for doc in | |||||
| documents] if documents is not None else [], | |||||
| def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,): | |||||
| data_params = { | |||||
| "offset": offset, | "offset": offset, | ||||
| "limit": limit, | "limit": limit, | ||||
| "similarity_threshold": similarity_threshold, | "similarity_threshold": similarity_threshold, | ||||
| "vector_similarity_weight": vector_similarity_weight, | "vector_similarity_weight": vector_similarity_weight, | ||||
| "top_k": top_k, | "top_k": top_k, | ||||
| "knowledgebase_id": datasets, | "knowledgebase_id": datasets, | ||||
| "rerank_id":rerank_id, | |||||
| "keyword":keyword | |||||
| } | |||||
| data_json ={ | |||||
| "question": question, | |||||
| "datasets": datasets, | |||||
| "documents": documents | |||||
| } | } | ||||
| # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) | # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) | ||||
| res = self.post(f'/doc/retrieval_test', data) | |||||
| # Check the response status code | |||||
| if res.status_code == 200: | |||||
| res_data = res.json() | |||||
| if res_data.get("retmsg") == "success": | |||||
| chunks = [] | |||||
| for chunk_data in res_data["data"].get("chunks", []): | |||||
| chunk = Chunk(self, chunk_data) | |||||
| chunks.append(chunk) | |||||
| return chunks | |||||
| else: | |||||
| raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}") | |||||
| else: | |||||
| raise Exception(f"API request failed with status code {res.status_code}") | |||||
| except Exception as e: | |||||
| print(f"An error occurred during retrieval: {e}") | |||||
| raise | |||||
| res = self.get(f'/retrieval', data_params,data_json) | |||||
| res = res.json() | |||||
| if res.get("code") ==0: | |||||
| chunks=[] | |||||
| for chunk_data in res["data"].get("chunks"): | |||||
| chunk=Chunk(self,chunk_data) | |||||
| chunks.append(chunk) | |||||
| return chunks | |||||
| raise Exception(res.get("message")) |
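Since the documentation example for `retrieve` is interleaved with the old `retrieval` example in the diff above, here is a compact usage sketch assuming a dataset named "ragflow" already contains a parsed document; the API key, base URL, and names are placeholders.

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(name="ragflow")[0]
doc = ds.list_documents(name="ragflow_test.txt")[0]

# Retrieve chunks relevant to the question from one dataset/document pair.
for chunk in rag.retrieve(question="What's ragflow?",
                          datasets=[ds.id], documents=[doc.id],
                          offset=1, limit=30,
                          similarity_threshold=0.2,
                          vector_similarity_weight=0.3,
                          top_k=1024):
    print(chunk.content)
```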
| # Check if the retrieved document is of type Document | # Check if the retrieved document is of type Document | ||||
| if isinstance(doc, Document): | if isinstance(doc, Document): | ||||
| # Download the document content and save it to a file | # Download the document content and save it to a file | ||||
| try: | |||||
| with open("ragflow.txt", "wb+") as file: | |||||
| file.write(doc.download()) | |||||
| # Print the document object for debugging | |||||
| print(doc) | |||||
| # Assert that the download was successful | |||||
| assert True, "Document downloaded successfully." | |||||
| except Exception as e: | |||||
| # If an error occurs, raise an assertion error | |||||
| assert False, f"Failed to download document, error: {str(e)}" | |||||
| with open("./ragflow.txt", "wb+") as file: | |||||
| file.write(doc.download()) | |||||
| # Print the document object for debugging | |||||
| print(doc) | |||||
| # Assert that the download was successful | |||||
| assert True, f"Failed to download document, error: {doc}" | |||||
| else: | else: | ||||
| # If the document retrieval fails, assert failure | # If the document retrieval fails, assert failure | ||||
| assert False, f"Failed to get document, error: {doc}" | assert False, f"Failed to get document, error: {doc}" | ||||
| blob2 = b"Sample document content for ingestion test222." | blob2 = b"Sample document content for ingestion test222." | ||||
| list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}] | list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}] | ||||
| ds.upload_documents(list_1) | ds.upload_documents(list_1) | ||||
| for d in ds.list_docs(keywords="test", offset=0, limit=12): | |||||
| for d in ds.list_documents(keywords="test", offset=0, limit=12): | |||||
| assert isinstance(d, Document), "Failed to upload documents" | assert isinstance(d, Document), "Failed to upload documents" | ||||
| def test_delete_documents_in_dataset_with_success(self): | def test_delete_documents_in_dataset_with_success(self): | ||||
| blob1 = b"Sample document content for ingestion test333." | blob1 = b"Sample document content for ingestion test333." | ||||
| name2 = "Test Document444.txt" | name2 = "Test Document444.txt" | ||||
| blob2 = b"Sample document content for ingestion test444." | blob2 = b"Sample document content for ingestion test444." | ||||
| name3 = 'test.txt' | |||||
| path = 'test_data/test.txt' | |||||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||||
| rag.create_document(ds, name=name1, blob=blob1) | |||||
| rag.create_document(ds, name=name2, blob=blob2) | |||||
| for d in ds.list_docs(keywords="document", offset=0, limit=12): | |||||
| ds.upload_documents([{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]) | |||||
| for d in ds.list_documents(keywords="document", offset=0, limit=12): | |||||
| assert isinstance(d, Document) | assert isinstance(d, Document) | ||||
| d.delete() | |||||
| print(d) | |||||
| remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12) | |||||
| ds.delete_documents([d.id]) | |||||
| remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12) | |||||
| assert len(remaining_docs) == 0, "Documents were not properly deleted." | assert len(remaining_docs) == 0, "Documents were not properly deleted." | ||||
| def test_parse_and_cancel_document(self): | def test_parse_and_cancel_document(self): | ||||
| # Define the document name and path | # Define the document name and path | ||||
| name3 = 'westworld.pdf' | name3 = 'westworld.pdf' | ||||
| path = 'test_data/westworld.pdf' | |||||
| path = './test_data/westworld.pdf' | |||||
| # Create a document in the dataset using the file path | # Create a document in the dataset using the file path | ||||
| rag.create_document(ds, name=name3, blob=open(path, "rb").read()) | |||||
| ds.upload_documents({"name":name3, "blob":open(path, "rb").read()}) | |||||
| # Retrieve the document by name | # Retrieve the document by name | ||||
| doc = rag.get_document(name="westworld.pdf") | |||||
| # Initiate asynchronous parsing | |||||
| doc.async_parse() | |||||
| doc = ds.list_documents(name="westworld.pdf") | |||||
| doc = doc[0] | |||||
| ds.async_parse_documents(document_ids=[doc.id]) | |||||
| # Print message to confirm asynchronous parsing has been initiated | # Print message to confirm asynchronous parsing has been initiated | ||||
| print("Async parsing initiated") | print("Async parsing initiated") |