
Refactor Chunk API (#2855)

### What problem does this PR solve?

Refactor Chunk API
#2846
### Type of change


- [x] Refactoring

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
tags/v0.13.0
liuhua 1 year ago
parent commit dab92ac1e8

+ 95
- 76
api/apps/sdk/doc.py

if informs: if informs:
e, file = FileService.get_by_id(informs[0].file_id) e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req["name"]}) FileService.update_by_id(file.id, {"name": req["name"]})
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
if "parser_method" in req: if "parser_method" in req:
if doc.parser_id.lower() == req["parser_method"].lower(): if doc.parser_id.lower() == req["parser_method"].lower():
if "parser_config" in req:
if req["parser_config"] == doc.parser_config:
return get_result(retcode=RetCode.SUCCESS)
else:
return get_result(retcode=RetCode.SUCCESS)
return get_result()


if doc.type == FileType.VISUAL or re.search( if doc.type == FileType.VISUAL or re.search(
r"\.(ppt|pptx|pages)$", doc.name): r"\.(ppt|pptx|pages)$", doc.name):
return get_error_data_result(retmsg="Tenant not found!") return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery( ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)) Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])


return get_result() return get_result()


if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
req = request.json req = request.json
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]: for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id): if not DocumentService.query(id=id,kb_id=dataset_id):
return get_error_data_result(retmsg=f"You don't own the document {id}.") return get_error_data_result(retmsg=f"You don't own the document {id}.")
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
req = request.json req = request.json
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]: for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id):
doc = DocumentService.query(id=id, kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.") return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress == 100.0 or doc[0].progress == 0.0:
return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
info = {"run": "2", "progress": 0} info = {"run": "2", "progress": 0}
DocumentService.update_by_id(id, info) DocumentService.update_by_id(id, info)
# if str(req["run"]) == TaskStatus.CANCEL.value: # if str(req["run"]) == TaskStatus.CANCEL.value:


@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
@token_required @token_required
def list_chunk(tenant_id,dataset_id,document_id):
def list_chunks(tenant_id,dataset_id,document_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
doc=DocumentService.query(id=document_id, kb_id=dataset_id) doc=DocumentService.query(id=document_id, kb_id=dataset_id)
page = int(req.get("offset", 1)) page = int(req.get("offset", 1))
size = int(req.get("limit", 30)) size = int(req.get("limit", 30))
question = req.get("keywords", "") question = req.get("keywords", "")
try:
query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
origin_chunks = []
sign = 0
for id in sres.ids:
d = {
"chunk_id": id,
"content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
id].get(
"content_with_weight", ""),
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
} }
if "available_int" in req:
query["available_int"] = int(req["available_int"])
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}

origin_chunks = []
for id in sres.ids:
d = {
"chunk_id": id,
"content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
id].get(
"content_with_weight", ""),
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss

origin_chunks.append(d)
##rename keys
for chunk in origin_chunks:
key_mapping = {
"chunk_id": "id",
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
}
renamed_chunk = {}
for key, value in chunk.items():
new_key = key_mapping.get(key, key)
renamed_chunk[new_key] = value
res["chunks"].append(renamed_chunk)
return get_result(data=res)
except Exception as e:
if str(e).find("not_found") > 0:
return get_result(retmsg=f'No chunk found!',
retcode=RetCode.DATA_ERROR)
return server_error_response(e)
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss

origin_chunks.append(d)
if req.get("id"):
if req.get("id") == id:
origin_chunks.clear()
origin_chunks.append(d)
sign = 1
break
if req.get("id"):
if sign == 0:
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
for chunk in origin_chunks:
key_mapping = {
"chunk_id": "id",
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
}
renamed_chunk = {}
for key, value in chunk.items():
new_key = key_mapping.get(key, key)
renamed_chunk[new_key] = value
res["chunks"].append(renamed_chunk)
return get_result(data=res)
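For readers skimming the handler above: `position_int` is stored as a single tab-separated string whose values come in groups of five per position, so the code only converts it into numeric groups when the count divides evenly by five, and the chunk keys are then renamed to their public API names. A minimal sketch of the position parsing, with a hypothetical helper name that is not part of the PR:

```python
def parse_positions(position_int: str):
    # Hypothetical helper mirroring the logic above; not part of the PR.
    parts = position_int.split("\t")
    if len(parts) % 5 != 0:
        return parts  # unexpected layout: leave the raw values untouched
    return [[float(x) for x in parts[i:i + 5]] for i in range(0, len(parts), 5)]

print(parse_positions("1\t10\t20\t30\t40"))  # [[1.0, 10.0, 20.0, 30.0, 40.0]]
```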





@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
req = request.json req = request.json
if not req.get("content"): if not req.get("content"):
return get_error_data_result(retmsg="`content` is required") return get_error_data_result(retmsg="`content` is required")
if "important_keywords" in req:
if type(req["important_keywords"]) != list:
return get_error_data_result("`important_keywords` is required to be a list")
md5 = hashlib.md5() md5 = hashlib.md5()
md5.update((req["content"] + document_id).encode("utf-8")) md5.update((req["content"] + document_id).encode("utf-8"))


d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
"content_with_weight": req["content"]} "content_with_weight": req["content"]}
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req.get("important_kwd", [])
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
d["important_kwd"] = req.get("important_keywords", [])
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_keywords", [])))
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp() d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
d["kb_id"] = [doc.kb_id] d["kb_id"] = [doc.kb_id]
req = request.json req = request.json
if not req.get("chunk_ids"): if not req.get("chunk_ids"):
return get_error_data_result("`chunk_ids` is required") return get_error_data_result("`chunk_ids` is required")
query = {
"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
for chunk_id in req.get("chunk_ids"): for chunk_id in req.get("chunk_ids"):
res = ELASTICSEARCH.get(
chunk_id, search.index_name(
tenant_id))
if not res.get("found"):
return server_error_response(f"Chunk {chunk_id} not found")
if chunk_id not in sres.ids:
return get_error_data_result(f"Chunk {chunk_id} not found")
if not ELASTICSEARCH.deleteByQuery( if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
return get_error_data_result(retmsg="Index updating failure") return get_error_data_result(retmsg="Index updating failure")
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT']) @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
@token_required @token_required
def set(tenant_id,dataset_id,document_id,chunk_id): def set(tenant_id,dataset_id,document_id,chunk_id):
res = ELASTICSEARCH.get(
try:
res = ELASTICSEARCH.get(
chunk_id, search.index_name( chunk_id, search.index_name(
tenant_id)) tenant_id))
if not res.get("found"):
return get_error_data_result(f"Chunk {chunk_id} not found")
except Exception as e:
return get_error_data_result(f"Can't find this chunk {chunk_id}")
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id): if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.") return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
doc = DocumentService.query(id=document_id, kb_id=dataset_id) doc = DocumentService.query(id=document_id, kb_id=dataset_id)
if not doc: if not doc:
return get_error_data_result(retmsg=f"You don't own the document {document_id}.") return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
doc = doc[0]
query = {
"doc_ids": [document_id], "page": 1, "size": 1024, "question": "", "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
if chunk_id not in sres.ids:
return get_error_data_result(f"You don't own the chunk {chunk_id}")
req = request.json req = request.json
content=res["_source"].get("content_with_weight")
d = { d = {
"id": chunk_id, "id": chunk_id,
"content_with_weight": req.get("content",res.get["content_with_weight"])}
d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
"content_with_weight": req.get("content",content)}
d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["important_kwd"] = req.get("important_keywords",[])
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
if "important_keywords" in req:
if type(req["important_keywords"]) != list:
return get_error_data_result("`important_keywords` is required to be a list")
d["important_kwd"] = req.get("important_keywords")
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
if "available" in req: if "available" in req:
d["available_int"] = req["available"] d["available_int"] = req["available"]
embd_id = DocumentService.get_embd_id(document_id) embd_id = DocumentService.get_embd_id(document_id)
arr = [ arr = [
t for t in re.split( t for t in re.split(
r"[\n\t]", r"[\n\t]",
req["content"]) if len(t) > 1]
d["content_with_weight"]) if len(t) > 1]
if len(arr) != 2: if len(arr) != 2:
return get_error_data_result( return get_error_data_result(
retmsg="Q&A must be separated by TAB/ENTER key.") retmsg="Q&A must be separated by TAB/ENTER key.")
d = beAdoc(d, arr[0], arr[1], not any( d = beAdoc(d, arr[0], arr[1], not any(
[rag_tokenizer.is_chinese(t) for t in q + a])) [rag_tokenizer.is_chinese(t) for t in q + a]))
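For documents parsed with the Q&A method, the updated content must split into exactly one question and one answer on a tab or newline; anything else is rejected with the error above. A small illustration with hypothetical content:

```python
import re

content = "What is RAGFlow?\tAn open-source RAG engine."  # hypothetical Q&A content
arr = [t for t in re.split(r"[\n\t]", content) if len(t) > 1]
assert len(arr) == 2  # exactly one question and one answer, as the check above requires
```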


v, c = embd_mdl.encode([doc.name, req["content"]])
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist() d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
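The vector written to the index blends the document-name embedding with the chunk-content embedding at a 10/90 ratio, except for Q&A-parsed documents, which keep only the content vector. A small numeric sketch, assuming the embedding model returns NumPy arrays:

```python
import numpy as np

name_vec = np.array([1.0, 0.0])     # hypothetical embedding of the document name
content_vec = np.array([0.0, 1.0])  # hypothetical embedding of the chunk content
v = 0.1 * name_vec + 0.9 * content_vec  # non-Q&A parsers blend both; Q&A keeps content_vec only
print(v.tolist())  # [0.1, 0.9]
```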
for id in kb_id: for id in kb_id:
if not KnowledgebaseService.query(id=id,tenant_id=tenant_id): if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
return get_error_data_result(f"You don't own the dataset {id}.") return get_error_data_result(f"You don't own the dataset {id}.")
if "question" not in req_json:
if "question" not in req:
return get_error_data_result("`question` is required.") return get_error_data_result("`question` is required.")
page = int(req.get("offset", 1)) page = int(req.get("offset", 1))
size = int(req.get("limit", 30)) size = int(req.get("limit", 30))

+ 28
- 17
api/apps/sdk/session.py

from api.utils.api_utils import get_error_data_result from api.utils.api_utils import get_error_data_result
from api.utils.api_utils import get_result, token_required from api.utils.api_utils import get_result, token_required
@manager.route('/chat/<chat_id>/session', methods=['POST']) @manager.route('/chat/<chat_id>/session', methods=['POST'])
@token_required @token_required
def create(tenant_id, chat_id):
def create(tenant_id,chat_id):
req = request.json req = request.json
req["dialog_id"] = chat_id req["dialog_id"] = chat_id
dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value) dia = DialogService.query(tenant_id=tenant_id, id=req["dialog_id"], status=StatusEnum.VALID.value)
del conv["reference"] del conv["reference"]
return get_result(data=conv) return get_result(data=conv)
@manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT']) @manager.route('/chat/<chat_id>/session/<session_id>', methods=['PUT'])
@token_required @token_required
def update(tenant_id, chat_id, session_id):
def update(tenant_id,chat_id,session_id):
req = request.json req = request.json
req["dialog_id"] = chat_id req["dialog_id"] = chat_id
conv_id = session_id conv_id = session_id
conv = ConversationService.query(id=conv_id, dialog_id=chat_id)
conv = ConversationService.query(id=conv_id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="Session does not exist") return get_error_data_result(retmsg="Session does not exist")
if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
return get_result() return get_result()
@manager.route('/chat/<chat_id>/session/<session_id>/completion', methods=['POST'])
@manager.route('/chat/<chat_id>/completion', methods=['POST'])
@token_required @token_required
def completion(tenant_id, chat_id, session_id):
def completion(tenant_id,chat_id):
req = request.json req = request.json
# req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [ # req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
# {"role": "user", "content": "上海有吗?"} # {"role": "user", "content": "上海有吗?"}
# ]} # ]}
if not req.get("session_id"):
conv = {
"id": get_uuid(),
"dialog_id": chat_id,
"name": req.get("name", "New session"),
"message": [{"role": "assistant", "content": "Hi! I am your assistant,can I help you?"}]
}
if not conv.get("name"):
return get_error_data_result(retmsg="Name can not be empty.")
ConversationService.save(**conv)
e, conv = ConversationService.get_by_id(conv["id"])
session_id=conv.id
else:
session_id = req.get("session_id")
if not req.get("question"): if not req.get("question"):
return get_error_data_result(retmsg="Please input your question.") return get_error_data_result(retmsg="Please input your question.")
conv = ConversationService.query(id=session_id, dialog_id=chat_id)
conv = ConversationService.query(id=session_id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="Session does not exist") return get_error_data_result(retmsg="Session does not exist")
conv = conv[0] conv = conv[0]
conv.message[-1] = {"role": "assistant", "content": ans["answer"], conv.message[-1] = {"role": "assistant", "content": ans["answer"],
"id": message_id, "prompt": ans.get("prompt", "")} "id": message_id, "prompt": ans.get("prompt", "")}
ans["id"] = message_id ans["id"] = message_id
ans["session_id"]=session_id
def stream(): def stream():
nonlocal dia, msg, req, conv nonlocal dia, msg, req, conv
try: try:
for ans in chat(dia, msg, **req): for ans in chat(dia, msg, **req):
fillin_conv(ans) fillin_conv(ans)
yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
yield "data:" + json.dumps({"code": 0, "data": ans}, ensure_ascii=False) + "\n\n"
ConversationService.update_by_id(conv.id, conv.to_dict()) ConversationService.update_by_id(conv.id, conv.to_dict())
except Exception as e: except Exception as e:
yield "data:" + json.dumps({"code": 500, "message": str(e), yield "data:" + json.dumps({"code": 500, "message": str(e),
"data": {"answer": "**ERROR**: " + str(e), "reference": []}},
"data": {"answer": "**ERROR**: " + str(e),"reference": []}},
ensure_ascii=False) + "\n\n" ensure_ascii=False) + "\n\n"
yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n" yield "data:" + json.dumps({"code": 0, "data": True}, ensure_ascii=False) + "\n\n"
break break
return get_result(data=answer) return get_result(data=answer)
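The refactored completion route takes the session id from the request body instead of the URL: when `session_id` is omitted, a session is created on the fly and its id is echoed back with every answer. A hedged client-side sketch (endpoint per the HTTP API below; the non-streaming response shape is an assumption):

```python
import requests

base = "http://127.0.0.1/api/v1"              # hypothetical server address
headers = {"Authorization": "Bearer xxxxxx"}  # hypothetical API key
chat_id = "your_chat_id"                      # hypothetical chat assistant id

# Without "session_id", a new session is created and its id is returned with the answer.
first = requests.post(f"{base}/chat/{chat_id}/completion", headers=headers,
                      json={"question": "What is RAGFlow?", "stream": False}).json()
session_id = first["data"]["session_id"]      # assumption about the non-streaming payload

# Later turns pass the returned session_id to continue the same conversation.
requests.post(f"{base}/chat/{chat_id}/completion", headers=headers,
              json={"question": "Tell me more.", "stream": False, "session_id": session_id})
```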
@manager.route('/chat/<chat_id>/session', methods=['GET']) @manager.route('/chat/<chat_id>/session', methods=['GET'])
@token_required @token_required
def list(chat_id, tenant_id):
def list(chat_id,tenant_id):
if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value): if not DialogService.query(tenant_id=tenant_id, id=chat_id, status=StatusEnum.VALID.value):
return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.") return get_error_data_result(retmsg=f"You don't own the assistant {chat_id}.")
id = request.args.get("id") id = request.args.get("id")
name = request.args.get("name") name = request.args.get("name")
session = ConversationService.query(id=id, name=name, dialog_id=chat_id)
session = ConversationService.query(id=id,name=name,dialog_id=chat_id)
if not session: if not session:
return get_error_data_result(retmsg="The session doesn't exist") return get_error_data_result(retmsg="The session doesn't exist")
page_number = int(request.args.get("page", 1)) page_number = int(request.args.get("page", 1))
desc = False desc = False
else: else:
desc = True desc = True
convs = ConversationService.get_list(chat_id, page_number, items_per_page, orderby, desc, id, name)
convs = ConversationService.get_list(chat_id,page_number,items_per_page,orderby,desc,id,name)
if not convs: if not convs:
return get_result(data=[]) return get_result(data=[])
for conv in convs: for conv in convs:
del conv["reference"] del conv["reference"]
return get_result(data=convs) return get_result(data=convs)
@manager.route('/chat/<chat_id>/session', methods=["DELETE"]) @manager.route('/chat/<chat_id>/session', methods=["DELETE"])
@token_required @token_required
def delete(tenant_id, chat_id):
def delete(tenant_id,chat_id):
if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value): if not DialogService.query(id=chat_id, tenant_id=tenant_id, status=StatusEnum.VALID.value):
return get_error_data_result(retmsg="You don't own the chat") return get_error_data_result(retmsg="You don't own the chat")
ids = request.json.get("ids") ids = request.json.get("ids")
if not ids: if not ids:
return get_error_data_result(retmsg="`ids` is required in deleting operation") return get_error_data_result(retmsg="`ids` is required in deleting operation")
for id in ids: for id in ids:
conv = ConversationService.query(id=id, dialog_id=chat_id)
conv = ConversationService.query(id=id,dialog_id=chat_id)
if not conv: if not conv:
return get_error_data_result(retmsg="The chat doesn't own the session") return get_error_data_result(retmsg="The chat doesn't own the session")
ConversationService.delete_by_id(id) ConversationService.delete_by_id(id)

+ 1
- 2
api/db/services/document_service.py

docs = docs.where( docs = docs.where(
fn.LOWER(cls.model.name).contains(keywords.lower()) fn.LOWER(cls.model.name).contains(keywords.lower())
) )
count = docs.count()
if desc: if desc:
docs = docs.order_by(cls.model.getter_by(orderby).desc()) docs = docs.order_by(cls.model.getter_by(orderby).desc())
else: else:
docs = docs.order_by(cls.model.getter_by(orderby).asc()) docs = docs.order_by(cls.model.getter_by(orderby).asc())


docs = docs.paginate(page_number, items_per_page) docs = docs.paginate(page_number, items_per_page)
count = docs.count()
return list(docs.dicts()), count return list(docs.dicts()), count





+ 372
- 198
api/http_api.md

} }
``` ```


## Delete files from a dataset

**DELETE** `/api/v1/dataset/{dataset_id}/document`

Deletes files from a dataset.

### Request

- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document`
- Headers:
- 'Content-Type: application/json'
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `ids`:List[str]
#### Request example

```bash
curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/document \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"ids": ["id_1","id_2"]
}'
```

#### Request parameters

- `"ids"`: (*Body parameter*)
The IDs of the documents to be deleted
### Response

The successful response includes a JSON object like the following:

```json
{
"code": 0
}
```

- `"error_code"`: `integer`
`0`: The operation succeeds.

The error response includes a JSON object like the following:

```json
{
"code": 102,
"message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
}
```

## Download a file from a dataset ## Download a file from a dataset


**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}` **GET** `/api/v1/dataset/{dataset_id}/document/{document_id}`


Downloads files from a dataset.
Downloads a file from a dataset.


### Request ### Request


- Method: GET - Method: GET
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}`
- Headers: - Headers:
- `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Output: - Output:
- '{FILE_NAME}' - '{FILE_NAME}'


```bash ```bash
curl --request GET \ curl --request GET \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{documents_id} \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--output '{FILE_NAME}'
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--output ./ragflow.txt
``` ```


#### Request parameters #### Request parameters


### Response ### Response


The successful response includes a JSON object like the following:
The successful response includes a text object like the following:


```text ```text
test_2. test_2.
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

- Body:
- `name`:`string`
- `parser_method`:`string`
- `parser_config`:`dict`
#### Request example #### Request example


```bash ```bash
curl --request PUT \ curl --request PUT \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id} \
--url http://{address}/api/v1/dataset/{dataset_id}/info/{document_id} \
--header 'Authorization: Bearer {YOUR_ACCESS TOKEN}' \ --header 'Authorization: Bearer {YOUR_ACCESS TOKEN}' \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--data '{ --data '{
"name": "manual.txt", "name": "manual.txt",
"thumbnail": null,
"knowledgebase_id": "779333c0758611ef910f0242ac120004",
"parser_method": "manual", "parser_method": "manual",
"parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12},
"source_type": "local", "type": "doc",
"created_by": "134408906b6811efbcd20242ac120005",
"size": 0, "token_count": 0, "chunk_count": 0,
"progress": 0.0,
"progress_msg": "",
"process_begin_at": null,
"process_duration": 0.0
"parser_config": {"chunk_token_count": 128, "delimiter": "\n!?。;!?", "layout_recognize": true, "task_page_size": 12}
}' }'


``` ```


#### Request parameters #### Request parameters


- `"thumbnail"`: (*Body parameter*)
Thumbnail image of the document.
- `""`

- `"knowledgebase_id"`: (*Body parameter*)
Knowledge base ID related to the document.
- `""`

- `"parser_method"`: (*Body parameter*) - `"parser_method"`: (*Body parameter*)
Method used to parse the document. Method used to parse the document.
- `""`



- `"parser_config"`: (*Body parameter*) - `"parser_config"`: (*Body parameter*)
Configuration object for the parser. Configuration object for the parser.
- If the value is `None`, a dictionary with default values will be generated. - If the value is `None`, a dictionary with default values will be generated.


- `"source_type"`: (*Body parameter*)
Source type of the document.
- `""`

- `"type"`: (*Body parameter*)
Type or category of the document.
- `""`

- `"created_by"`: (*Body parameter*)
Creator of the document.
- `""`

- `"name"`: (*Body parameter*) - `"name"`: (*Body parameter*)
Name or title of the document. Name or title of the document.
- `""`

- `"size"`: (*Body parameter*)
Size of the document in bytes or some other unit.
- `0`

- `"token_count"`: (*Body parameter*)
Number of tokens in the document.
- `0`

- `"chunk_count"`: (*Body parameter*)
Number of chunks the document is split into.
- `0`


- `"progress"`: (*Body parameter*)
Current processing progress as a percentage.
- `0.0`


- `"progress_msg"`: (*Body parameter*)
Message indicating current progress status.
- `""`

- `"process_begin_at"`: (*Body parameter*)
Start time of the document processing.
- `None`

- `"process_duration"`: (*Body parameter*)
Duration of the processing in seconds or minutes.
- `0.0`




### Response ### Response
### Request ### Request


- Method: POST - Method: POST
- URL: `/api/v1/dataset/{dataset_id}/chunk`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk `
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `document_ids`:List[str]


#### Request example #### Request example


```shell
```bash
curl --request POST \ curl --request POST \
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
}'
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
``` ```


#### Request parameters #### Request parameters


- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"documents"`: (*Body parameter*)
- Documents to parse
- `"document_ids"`:(*Body parameter*)
The ids of the documents to be parsed


### Response ### Response


The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:


```shell
```json
{ {
"code": 0 "code": 0
} }
The error response includes a JSON object like the following: The error response includes a JSON object like the following:


```shell
```json
{ {
"code": 3016,
"message": "Can't connect database"
"code": 102,
"message": "`document_ids` is required"
} }
``` ```




### Request ### Request


- Method: POST
- URL: `/api/v1/dataset/{dataset_id}/chunk`
- Method: DELETE
- URL: `http://{address}/api/v1/dataset/{dataset_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

- Body:
- `document_ids`:List[str]
#### Request example #### Request example


```shell
```bash
curl --request DELETE \ curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"documents": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
}'
--url http://{address}/api/v1/dataset/{dataset_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{"document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]}'
``` ```


#### Request parameters #### Request parameters


- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"documents"`: (*Body parameter*)
- Documents to stop parsing
- `"document_ids"`:(*Body parameter*)
The IDs of the documents for which parsing should be stopped



### Response ### Response


The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:


```shell
```json
{ {
"code": 0 "code": 0
} }
The error response includes a JSON object like the following: The error response includes a JSON object like the following:


```shell
```json
{ {
"code": 3016,
"message": "Can't connect database"
"code": 102,
"message": "`document_ids` is required"
} }
``` ```


## Get document chunk list ## Get document chunk list


**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
**GET** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`


Get document chunk list Get document chunk list


### Request ### Request


- Method: GET - Method: GET
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id}`
- Headers: - Headers:
- `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'


#### Request example #### Request example


```shell
```bash
curl --request GET \ curl --request GET \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk?keywords={keywords}&offset={offset}&limit={limit}&id={id} \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
``` ```


#### Request parameters #### Request parameters


- `"dataset_id"`: (*Path parameter*) - `"dataset_id"`: (*Path parameter*)
- `"document_id"`: (*Path parameter*) - `"document_id"`: (*Path parameter*)

- `"offset"`(*Filter parameter*)
The beginning number of records for paging.
- `"keywords"`(*Filter parameter*)
List chunks whose content contains the given keywords
- `"limit"`(*Filter parameter*)
Records number to return
- `"id"`(*Filter parameter*)
The ID of the chunk to retrieve
### Response ### Response


The successful response includes a JSON object like the following: The successful response includes a JSON object like the following:


```shell
```json
{ {
"code": 0
"code": 0,
"data": { "data": {
"chunks": [
{
"available_int": 1,
"content": "<em>advantag</em>of ragflow increas accuraci and relev:by incorpor retriev inform , ragflow can gener respons that are more accur",
"document_keyword": "ragflow_test.txt",
"document_id": "77df9ef4759a11ef8bdd0242ac120004",
"id": "4ab8c77cfac1a829c8d5ed022a0808c0",
"image_id": "",
"important_keywords": [],
"positions": [
""
]
}
],
"chunks": [],
"doc": { "doc": {
"chunk_count": 5,
"create_date": "Wed, 18 Sep 2024 08:46:16 GMT",
"create_time": 1726649176833,
"created_by": "134408906b6811efbcd20242ac120005",
"id": "77df9ef4759a11ef8bdd0242ac120004",
"knowledgebase_id": "77d9d24e759a11ef880c0242ac120004",
"location": "ragflow_test.txt",
"name": "ragflow_test.txt",
"chunk_num": 0,
"create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
"create_time": 1727581649216,
"created_by": "69736c5e723611efb51b0242ac120007",
"id": "8cb781ec7e1511ef98ac0242ac120006",
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"location": "明天的天气是晴天.txt",
"name": "明天的天气是晴天.txt",
"parser_config": { "parser_config": {
"chunk_token_count": 128,
"delimiter": "\n!?。;!?",
"layout_recognize": true,
"task_page_size": 12
"pages": [
[
1,
1000000
]
]
}, },
"parser_method": "naive",
"process_begin_at": "Wed, 18 Sep 2024 08:46:16 GMT",
"process_duation": 7.3213,
"progress": 1.0,
"progress_msg": "\nTask has been received.\nStart to parse.\nFinish parsing.\nFinished slicing files(5). Start to embedding the content.\nFinished embedding(6.16)! Start to build index!\nDone!",
"run": "3",
"size": 4209,
"parser_id": "naive",
"process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
"process_duation": 1435.37,
"progress": 0.0370833,
"progress_msg": "\nTask has been received.",
"run": "1",
"size": 24,
"source_type": "local", "source_type": "local",
"status": "1", "status": "1",
"thumbnail": null, "thumbnail": null,
"token_count": 746,
"token_num": 0,
"type": "doc", "type": "doc",
"update_date": "Wed, 18 Sep 2024 08:46:23 GMT",
"update_time": 1726649183321
"update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
"update_time": 1728989266371
}, },
"total": 1
},
"total": 0
}
} }
``` ```
The error response includes a JSON object like the following: The error response includes a JSON object like the following:


```shell
```json
{ {
"code": 3016,
"message": "Can't connect database"
"code": 102,
"message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
} }
``` ```


### Request ### Request


- Method: DELETE - Method: DELETE
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body:
- `chunk_ids`:List[str]


#### Request example #### Request example


```shell
```bash
curl --request DELETE \ curl --request DELETE \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"chunks": ["f6b170ac758811efa0660242ac120004", "97ad64b6759811ef9fc30242ac120004"]
}'
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"chunk_ids": ["test_1", "test_2"]
}'
``` ```
#### Request parameters

- `"chunk_ids"`:(*Body parameter*)
The IDs of the chunks to delete

### Response
Success
```json
{
"code": 0
}
```
Error
```json
{
"code": 102,
"message": "`chunk_ids` is required"
}
```



## Update document chunk ## Update document chunk


**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
**PUT** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`


Update document chunk Update document chunk


### Request ### Request


- Method: PUT - Method: PUT
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

- Body:
- `content`:str
- `important_keywords`:str
- `available`:int
#### Request example #### Request example


```shell
```bash
curl --request PUT \ curl --request PUT \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"chunk_id": "d87fb0b7212c15c18d0831677552d7de",
"knowledgebase_id": null,
"name": "",
"content": "ragflow123",
"important_keywords": [],
"document_id": "e6bbba92759511efaa900242ac120004",
"status": "1"
}'
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id} \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"content": "ragflow123",
"important_keywords": [],
}'
``` ```
#### Request parameters
- `"content"`:(*Body parameter*)
Contains the main text or information of the chunk.
- `"important_keywords"`:(*Body parameter*)
List the key terms or phrases that are significant or central to the chunk's content.
- `"available"`:(*Body parameter*)
Indicating the availability status, 0 means unavailable and 1 means available.


### Response
Success
```json
{
"code": 0
}
```
Error
```json
{
"code": 102,
"message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
}
```
## Insert document chunks ## Insert document chunks


**POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk` **POST** `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
### Request ### Request


- Method: POST - Method: POST
- URL: `/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- URL: `http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

- Body:
- `content`: str
- `important_keywords`:List[str]
#### Request example #### Request example


```shell
```bash
curl --request POST \ curl --request POST \
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"document_id": "97ad64b6759811ef9fc30242ac120004",
"content": ["ragflow content", "ragflow content"]
}'
--url http://{address}/api/v1/dataset/{dataset_id}/document/{document_id}/chunk \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"content": "ragflow content"
}'
``` ```
#### Request parameters
- `content`:(*Body parameter*)
Contains the main text or information of the chunk.
- `important_keywords`:(*Body parameter*)
List the key terms or phrases that are significant or central to the chunk's content.


### Response
Success
```json
{
"code": 0,
"data": {
"chunk": {
"content": "ragflow content",
"create_time": "2024-10-16 08:05:04",
"create_timestamp": 1729065904.581025,
"dataset_id": [
"c7ee74067a2c11efb21c0242ac120006"
],
"document_id": "5c5999ec7be811ef9cab0242ac120005",
"id": "d78435d142bd5cf6704da62c778795c5",
"important_keywords": []
}
}
}
```

Error
```json
{
"code": 102,
"message": "`content` is required"
}
```
## Dataset retrieval test ## Dataset retrieval test


**GET** `/api/v1/dataset/{dataset_id}/retrieval`
**POST** `/api/v1/retrieval`


Retrieval test of a dataset Retrieval test of a dataset


### Request ### Request


- Method: GET
- URL: `/api/v1/dataset/{dataset_id}/retrieval`
- Method: POST
- URL: `http://{address}/api/v1/retrieval`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'

- Body:
- `question`: str
- `datasets`: List[str]
- `documents`: List[str]
- `offset`: int
- `limit`: int
- `similarity_threshold`: float
- `vector_similarity_weight`: float
- `top_k`: int
- `rerank_id`: string
- `keyword`: bool
- `highlight`: bool
#### Request example #### Request example


```shell
curl --request GET \
--url http://{address}/api/v1/dataset/{dataset_id}/retrieval \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
--raw '{
"query_text": "This is a cat."
}'
```bash
curl --request POST \
--url http://{address}/api/v1/retrieval \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data '{
"question": "What is advantage of ragflow?",
"datasets": [
"b2a62730759d11ef987d0242ac120004"
],
"documents": [
"77df9ef4759a11ef8bdd0242ac120004"
]
}'
``` ```


#### Request parameters
- `"question"`: (*Body parameter*)
User's question, search keywords
`""`
- `"datasets"`: (*Body parameter*)
The scope of datasets
`None`
- `"documents"`: (*Body parameter*)
The scope of document. `None` means no limitation
`None`
- `"offset"`: (*Body parameter*)
The beginning point of retrieved records
`1`

- `"limit"`: (*Body parameter*)
The maximum number of records needed to return
`30`

- `"similarity_threshold"`: (*Body parameter*)
The minimum similarity score
`0.2`

- `"vector_similarity_weight"`: (*Body parameter*)
The weight of vector cosine similarity, `1 - x` is the term similarity weight
`0.3`

- `"top_k"`: (*Body parameter*)
Number of records engaged in vector cosine computation
`1024`

- `"rerank_id"`: (*Body parameter*)
ID of the rerank model
`None`

- `"keyword"`: (*Body parameter*)
Whether keyword-based matching is enabled
`False`

- `"highlight"`: (*Body parameter*)
Whether to enable highlighting of matched terms in the results
`False`
### Response
Success
```json
{
"code": 0,
"data": {
"chunks": [
{
"content": "ragflow content",
"content_ltks": "ragflow content",
"document_id": "5c5999ec7be811ef9cab0242ac120005",
"document_keyword": "1.txt",
"highlight": "<em>ragflow</em> content",
"id": "d78435d142bd5cf6704da62c778795c5",
"img_id": "",
"important_keywords": [
""
],
"kb_id": "c7ee74067a2c11efb21c0242ac120006",
"positions": [
""
],
"similarity": 0.9669436601210759,
"term_similarity": 1.0,
"vector_similarity": 0.8898122004035864
}
],
"doc_aggs": [
{
"count": 1,
"doc_id": "5c5999ec7be811ef9cab0242ac120005",
"doc_name": "1.txt"
}
],
"total": 1
}
}
```
Error
```json
{
"code": 102,
"message": "`datasets` is required."
}
```
## Create chat ## Create chat


**POST** `/api/v1/chat` **POST** `/api/v1/chat`


## Chat with a chat session ## Chat with a chat session


**POST** `/api/v1/chat/{chat_id}/session/{session_id}/completion`
**POST** `/api/v1/chat/{chat_id}/completion`


Chat with a chat session Chat with a chat session


### Request ### Request


- Method: POST - Method: POST
- URL: `http://{address}/api/v1/chat/{chat_id}/session/{session_id}/completion`
- URL: `http://{address}/api/v1/chat/{chat_id}/completion`
- Headers: - Headers:
- `content-Type: application/json` - `content-Type: application/json`
- 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' - 'Authorization: Bearer {YOUR_ACCESS_TOKEN}'
- Body: - Body:
- `question`: string - `question`: string
- `stream`: bool - `stream`: bool
- `session_id`: str




#### Request example #### Request example
```bash ```bash
curl --request POST \ curl --request POST \
--url http://{address}/api/v1/chat/{chat_id}/session/{session_id}/completion \
--url http://{address}/api/v1/chat/{chat_id}/completion \
--header 'Content-Type: application/json' \ --header 'Content-Type: application/json' \
--header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \ --header 'Authorization: Bearer {YOUR_ACCESS_TOKEN}' \
--data-binary '{ --data-binary '{
- `stream`: (*Body Parameter*) - `stream`: (*Body Parameter*)
The approach of streaming text generation. The approach of streaming text generation.
`False` `False`
- `session_id`: (*Body Parameter*)
The ID of the session. If not provided, a new session will be generated.
### Response ### Response
Success Success
```json ```json

+ 187
- 198
api/python_api_reference.md

## Upload document ## Upload document


```python ```python
RAGFLOW.upload_document(ds:DataSet, name:str, blob:bytes)-> bool
DataSet.upload_documents(document_list: List[dict])
``` ```


### Parameters ### Parameters


#### name
#### document_list:`List[dict]`
A list composed of dicts containing `name` and `blob`.


#### blob


### Returns
no return

### Examples
```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")
ds.upload_documents([{"name": "1.txt", "blob": b"123"}])  # add more {"name": ..., "blob": ...} dicts as needed
```
---

## Update document

```python
Document.update(update_message:dict)
```

### Parameters

#### update_message:`dict`
Only `name`, `parser_config`, and `parser_method` can be changed.

### Returns

no return

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id='id')
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
doc.update({"parser_method": "manual"})  # other updatable keys: "name", "parser_config"
```

---


## Download document

```python
Document.download() -> bytes
```


### Returns ### Returns


bytes of the document.


### Examples ### Examples


```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds=rag.list_datasets(id="id")
ds=ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
open("~/ragflow.txt", "wb+").write(doc.download())
print(doc)
```

--- ---


## Retrieve document
## List documents


```python ```python
RAGFlow.get_document(id:str=None,name:str=None) -> Document
Dataset.list_documents(id:str =None, keywords: str=None, offset: int=0, limit:int = 1024,order_by:str = "create_time", desc: bool = True) -> List[Document]
``` ```


### Parameters ### Parameters


#### id: `str`, *Required*
#### id: `str`


ID of the document to retrieve.
The ID of the document to retrieve


#### name: `str`
#### keywords: `str`

List documents whose name has the given keywords. Defaults to `None`.

#### offset: `int`

The beginning number of records for paging. Defaults to `0`.


Name or title of the document.
#### limit: `int`

Records number to return, -1 means all of them.


#### orderby: `str`
The field by which the records should be sorted. This specifies the attribute or column used to order the results.

#### desc:`bool`
A boolean flag indicating whether the sorting should be in descending order.
### Returns ### Returns


List[Document]

A document object containing the following attributes: A document object containing the following attributes:


#### id: `str` #### id: `str`
```python ```python
from ragflow import RAGFlow from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d",name='testdocument.txt')
print(doc)
```

---

## Save document settings

```python
Document.save() -> bool
```

### Returns

bool

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
doc.parser_method= "manual"
doc.save()
```

---

## Download document

```python
Document.download() -> bytes
```

### Returns

bytes of the document.

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
open("~/ragflow.txt", "w+").write(doc.download())
print(doc)
```

---

## List documents

```python
Dataset.list_docs(keywords: str=None, offset: int=0, limit:int = -1) -> List[Document]
```

### Parameters

#### keywords: `str`

List documents whose name has the given keywords. Defaults to `None`.

#### offset: `int`

The beginning number of records for paging. Defaults to `0`.

#### limit: `int`

Records number to return, -1 means all of them.

### Returns

List[Document]

### Examples

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1") ds = rag.create_dataset(name="kb_1")


filename1 = "~/ragflow.txt" filename1 = "~/ragflow.txt"
rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())

filename2 = "~/infinity.txt"
rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())

for d in ds.list_docs(keywords="rag", offset=0, limit=12):
blob=open(filename1 , "rb").read()
list_files=[{"name":filename1,"blob":blob}]
ds.upload_documents(list_files)
for d in ds.list_documents(keywords="rag", offset=0, limit=12):
print(d) print(d)
``` ```


## Delete documents ## Delete documents


```python ```python
Document.delete() -> bool
DataSet.delete_documents(ids: List[str] = None)
``` ```
### Returns ### Returns


bool
description: delete success or not
no return


### Examples ### Examples


from ragflow import RAGFlow from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.create_dataset(name="kb_1")

filename1 = "~/ragflow.txt"
rag.create_document(ds, name=filename1 , blob=open(filename1 , "rb").read())

filename2 = "~/infinity.txt"
rag.create_document(ds, name=filename2 , blob=open(filename2 , "rb").read())
for d in ds.list_docs(keywords="rag", offset=0, limit=12):
d.delete()
ds = rag.list_datasets(name="kb_1")
ds = ds[0]
ds.delete_documents(ids=["id_1","id_2"])
``` ```


--- ---


## Parse document
## Parse and stop parsing document


```python ```python
Document.async_parse() -> None
RAGFLOW.async_parse_documents() -> None
DataSet.async_parse_documents(document_ids:List[str]) -> None
DataSet.async_cancel_parse_documents(document_ids:List[str])-> None
``` ```


### Parameters ### Parameters


#### document_ids:`List[str]`
The ids of the documents to be parsed
???????????????????????????????????????????????????? ????????????????????????????????????????????????????


### Returns ### Returns
no return
???????????????????????????????????????????????????? ????????????????????????????????????????????????????


### Examples ### Examples


```python
#document parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="dataset_name")
name3 = 'ai.pdf'
path = 'test_data/ai.pdf'
rag.create_document(ds, name=name3, blob=open(path, "rb").read())
doc = rag.get_document(name="ai.pdf")
doc.async_parse()
print("Async parsing initiated")
```

---

## Cancel document parsing

```python
rag.async_cancel_parse_documents(ids)
RAGFLOW.async_cancel_parse_documents()-> None
```

### Parameters

#### ids, `list[]`

### Returns

?????????????????????????????????????????????????

### Examples

```python ```python
#documents parse and cancel #documents parse and cancel
rag = RAGFlow(API_KEY, HOST_ADDRESS) rag = RAGFlow(API_KEY, HOST_ADDRESS)
ds = rag.create_dataset(name="God5") ds = rag.create_dataset(name="God5")
documents = [ documents = [
{'name': 'test1.txt', 'path': 'test_data/test1.txt'},
{'name': 'test2.txt', 'path': 'test_data/test2.txt'},
{'name': 'test3.txt', 'path': 'test_data/test3.txt'}
{'name': 'test1.txt', 'blob': open('./test_data/test1.txt',"rb").read()},
{'name': 'test2.txt', 'blob': open('./test_data/test2.txt',"rb").read()},
{'name': 'test3.txt', 'blob': open('./test_data/test3.txt',"rb").read()}
] ]

# Create documents in bulk
for doc_info in documents:
with open(doc_info['path'], "rb") as file:
created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
ids = [doc.id for doc in docs]

rag.async_parse_documents(ids)
ds.upload_documents(documents)
documents=ds.list_documents(keywords="test")
ids=[]
for document in documents:
ids.append(document.id)
ds.async_parse_documents(ids)
print("Async bulk parsing initiated") print("Async bulk parsing initiated")

for doc in docs:
for progress, msg in doc.join(interval=5, timeout=10):
print(f"{doc.name}: Progress: {progress}, Message: {msg}")

cancel_result = rag.async_cancel_parse_documents(ids)
ds.async_cancel_parse_documents(ids)
print("Async bulk parsing cancelled") print("Async bulk parsing cancelled")
``` ```


---

## Join document

??????????????????

## List chunks
```python ```python
Document.join(interval=15, timeout=3600) -> iteral[Tuple[float, str]]
Document.list_chunks(keywords: str = None, offset: int = 0, limit: int = -1, id : str = None) -> List[Chunk]
``` ```

### Parameters ### Parameters


#### interval: `int`
- `keywords`: `str`
List chunks whose content contains the given keywords
default: `None`


Time interval in seconds for progress report. Defaults to `15`.
- `offset`: `int`
The beginning number of records for paging
default: `1`


#### timeout: `int`
Timeout in seconds. Defaults to `3600`.
- `limit`: `int`
Records number to return
default: `30`


- `id`: `str`
The ID of the chunk to be retrieved
default: `None`
### Returns ### Returns
List[Chunk]


iteral[Tuple[float, str]]
### Examples
```python
from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets("123")
ds = ds[0]
ds.async_parse_documents(["wdfxb5t547d"])
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
print(c)
```
## Add chunk ## Add chunk


```python ```python
### Parameters ### Parameters


#### content: `str`, *Required* #### content: `str`, *Required*
Contains the main text or information of the chunk.
#### important_keywords: `List[str]`
List the key terms or phrases that are significant or central to the chunk's content.


### Returns ### Returns


from ragflow import RAGFlow from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx") chunk = doc.add_chunk(content="xxxxxxx")
``` ```


## Delete chunk ## Delete chunk


```python ```python
Chunk.delete() -> bool
Document.delete_chunks(chunk_ids: List[str])
``` ```
### Parameters
#### chunk_ids:`List[str]`
The list of chunk IDs to delete


### Returns ### Returns


bool
no return


### Examples ### Examples


from ragflow import RAGFlow from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380") rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx") chunk = doc.add_chunk(content="xxxxxxx")
chunk.delete()
doc.delete_chunks(["id_1","id_2"])
```
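
In practice the chunk IDs usually come from `list_chunks` rather than being hard-coded. A minimal sketch (the keyword and IDs are placeholders):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]
# Collect the IDs of chunks matching a keyword, then remove them in one call.
ids = [c.id for c in doc.list_chunks(keywords="obsolete")]
if ids:
    doc.delete_chunks(ids)
```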


---


## Save chunk contents
## Update chunk


```python
Chunk.save() -> bool
Chunk.update(update_message: dict)
```
### Parameters
- `content`: `str`
The main text or information of the chunk

- `important_keywords`: `List[str]`
The key terms or phrases that are significant or central to the chunk's content

- `available`: `int`
The availability status: `0` means unavailable and `1` means available


### Returns


bool
No return value


### Examples


from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
doc = rag.get_document(id="wdfxb5t547d")
ds = rag.list_datasets(id="123")
ds = ds[0]
doc = ds.list_documents(id="wdfxb5t547d")
doc = doc[0]
chunk = doc.add_chunk(content="xxxxxxx")
chunk.content = "sdfx"
chunk.save()
chunk.update({"content":"sdfx...})
``` ```
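
The same call can toggle availability or replace keywords, assuming the server accepts the fields listed under Parameters. A minimal sketch (IDs and values are placeholders):

```python
from ragflow import RAGFlow

rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.list_datasets(id="123")[0]
doc = ds.list_documents(id="wdfxb5t547d")[0]
chunk = doc.add_chunk(content="xxxxxxx")
# Mark the chunk unavailable and attach keywords in a single update.
chunk.update({"available": 0, "important_keywords": ["example", "placeholder"]})
```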


---
## Retrieval


```python
RAGFlow.retrieval(question:str, datasets:List[Dataset], documents:List[Document]=None, offset:int=0, limit:int=6, similarity_threshold:float=0.1, vector_similarity_weight:float=0.3, top_k:int=1024) -> List[Chunk]
RAGFlow.retrieve(question:str="", datasets:List[str]=None, documents:List[str]=None, offset:int=1, limit:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024, rerank_id:str=None, keyword:bool=False, highlight:bool=False) -> List[Chunk]
```


### Parameters


Number of records engaged in vector cosine computation. Defaults to `1024`.


#### rerank_id: `str`
The ID of the rerank model. Defaults to `None`.

#### keyword: `bool`
Whether keyword-based matching is enabled (`True`) or disabled (`False`).

#### highlight: `bool`
Whether to highlight matched terms in the results (`True`) or not (`False`).
### Returns


List[Chunk]
from ragflow import RAGFlow


rag = RAGFlow(api_key="xxxxxx", base_url="http://xxx.xx.xx.xxx:9380")
ds = rag.get_dataset(name="ragflow")
ds = rag.list_datasets(name="ragflow")
ds = ds[0]
name = 'ragflow_test.txt'
path = 'test_data/ragflow_test.txt'
path = './test_data/ragflow_test.txt'
rag.create_document(ds, name=name, blob=open(path, "rb").read())
doc = rag.get_document(name=name)
doc.async_parse()
# Wait for parsing to complete
for progress, msg in doc.join(interval=5, timeout=30):
print(progress, msg)
for c in rag.retrieval(question="What's ragflow?",
datasets=[ds], documents=[doc],
offset=0, limit=6, similarity_threshold=0.1,
doc = ds.list_documents(name=name)
doc = doc[0]
ds.async_parse_documents([doc.id])
for c in rag.retrieve(question="What's ragflow?",
datasets=[ds.id], documents=[doc.id],
offset=1, limit=30, similarity_threshold=0.2,
vector_similarity_weight=0.3,
top_k=1024
):

+ 5
- 26
sdk/python/ragflow/modules/chunk.py View File

res_dict.pop(k)
super().__init__(rag, res_dict)


def delete(self) -> bool:
"""
Delete the chunk in the document.
"""
res = self.post('/doc/chunk/rm',
{"document_id": self.document_id, 'chunk_ids': [self.id]})
res = res.json()
if res.get("retmsg") == "success":
return True
raise Exception(res["retmsg"])


def save(self) -> bool:
"""
Save the document details to the server.
"""
res = self.post('/doc/chunk/set',
{"chunk_id": self.id,
"knowledgebase_id": self.knowledgebase_id,
"name": self.document_name,
"content": self.content,
"important_keywords": self.important_keywords,
"document_id": self.document_id,
"available": self.available,
})
def update(self,update_message:dict):
res = self.put(f"/dataset/{self.knowledgebase_id}/document/{self.document_id}/chunk/{self.id}",update_message)
res = res.json()
if res.get("retmsg") == "success":
return True
raise Exception(res["retmsg"])
if res.get("code") != 0 :
raise Exception(res["message"])




+ 11
- 0
sdk/python/ragflow/modules/dataset.py View File

if res.get("code") != 0: if res.get("code") != 0:
raise Exception(res["message"]) raise Exception(res["message"])
def async_parse_documents(self,document_ids):
res = self.post(f"/dataset/{self.id}/chunk",{"document_ids":document_ids})
res = res.json()
if res.get("code") != 0:
raise Exception(res.get("message"))
def async_cancel_parse_documents(self,document_ids):
res = self.rm(f"/dataset/{self.id}/chunk",{"document_ids":document_ids})
res = res.json()
if res.get("code") != 0:
raise Exception(res.get("message"))

+ 24
- 153
sdk/python/ragflow/modules/document.py View File

import time


from PIL.ImageFile import raise_oserror

from .base import Base
from .chunk import Chunk
from typing import List




class Document(Base):
res_dict.pop(k)
super().__init__(rag, res_dict)


def update(self,update_message:dict) -> bool:
"""
Save the document details to the server.
"""
res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
res = res.json()
if res.get("code") != 0:
raise Exception(res["message"])

def delete(self) -> bool:
"""
Delete the document from the server.
"""
res = self.rm('/doc/delete',
{"document_id": self.id})
def list_chunks(self,offset=0, limit=30, keywords="", id:str=None):
data={"document_id": self.id,"keywords": keywords,"offset":offset,"limit":limit,"id":id}
res = self.get(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', data)
res = res.json()
if res.get("retmsg") == "success":
return True
raise Exception(res["retmsg"])

def download(self) -> bytes:
"""
Download the document content from the server using the Flask API.

:return: The downloaded document content in bytes.
"""
# Construct the URL for the API request using the document ID and knowledge base ID
res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")

# Check the response status code to ensure the request was successful
if res.status_code == 200:
# Return the document content as bytes
return res.content
else:
# Handle the error and raise an exception
raise Exception(
f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
)

def async_parse(self):
"""
Initiate document parsing asynchronously without waiting for completion.
"""
try:
# Construct request data including document ID and run status (assuming 1 means to run)
data = {"document_ids": [self.id], "run": 1}

# Send a POST request to the specified parsing status endpoint to start parsing
res = self.post(f'/doc/run', data)

# Check the server response status code
if res.status_code != 200:
raise Exception(f"Failed to start async parsing: {res.text}")

print("Async parsing started successfully.")

except Exception as e:
# Catch and handle exceptions
print(f"Error occurred during async parsing: {str(e)}")
raise

import time

def join(self, interval=5, timeout=3600):
"""
Wait for the asynchronous parsing to complete and yield parsing progress periodically.

:param interval: The time interval (in seconds) for progress reports.
:param timeout: The timeout (in seconds) for the parsing operation.
:return: An iterator yielding parsing progress and messages.
"""
start_time = time.time()
while time.time() - start_time < timeout:
# Check the parsing status
res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]})
res_data = res.json()
data = res_data.get("data", [])

# Retrieve progress and status message
progress = data.get("progress", 0)
progress_msg = data.get("status", "")
if res.get("code") == 0:
chunks=[]
for data in res["data"].get("chunks"):
chunk = Chunk(self.rag,data)
chunks.append(chunk)
return chunks
raise Exception(res.get("message"))


yield progress, progress_msg # Yield progress and message

if progress == 100: # Parsing completed
break

time.sleep(interval)

def cancel(self):
"""
Cancel the parsing task for the document.
"""
try:
# Construct request data, including document ID and action to cancel (assuming 2 means cancel)
data = {"document_ids": [self.id], "run": 2}

# Send a POST request to the specified parsing status endpoint to cancel parsing
res = self.post(f'/doc/run', data)

# Check the server response status code
if res.status_code != 200:
print("Failed to cancel parsing. Server response:", res.text)
else:
print("Parsing cancelled successfully.")

except Exception as e:
print(f"Error occurred during async parsing cancellation: {str(e)}")
raise

def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None):
"""
List all chunks associated with this document by calling the external API.

Args:
page (int): The page number to retrieve (default 1).
size (int): The number of chunks per page (default 30).
keywords (str): Keywords for searching specific chunks (default "").
available_int (int): Filter for available chunks (optional).

Returns:
list: A list of chunks returned from the API.
"""
data = {
"document_id": self.id,
"page": page,
"size": size,
"keywords": keywords,
"offset":offset,
"limit":limit
}

if available_int is not None:
data["available_int"] = available_int

res = self.post(f'/doc/chunk/list', data)
if res.status_code == 200:
res_data = res.json()
if res_data.get("retmsg") == "success":
chunks=[]
for chunk_data in res_data["data"].get("chunks", []):
chunk=Chunk(self.rag,chunk_data)
chunks.append(chunk)
return chunks
else:
raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
else:
raise Exception(f"API request failed with status code {res.status_code}")


def add_chunk(self, content: str):
res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content})
if res.status_code == 200:
res_data = res.json().get("data")
chunk_data = res_data.get("chunk")
return Chunk(self.rag,chunk_data)
else:
raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")
res = self.post(f'/dataset/{self.knowledgebase_id}/document/{self.id}/chunk', {"content":content})
res = res.json()
if res.get("code") == 0:
return Chunk(self.rag,res["data"].get("chunk"))
raise Exception(res.get("message"))

def delete_chunks(self,ids:List[str]):
res = self.rm(f"dataset/{self.knowledgebase_id}/document/{self.id}/chunk",{"ids":ids})
res = res.json()
if res.get("code")!=0:
raise Exception(res.get("message"))

+ 3
- 2
sdk/python/ragflow/modules/session.py View File

for message in self.messages:
if "reference" in message:
message.pop("reference")
res = self.post(f"/chat/{self.chat_id}/session/{self.id}/completion",
{"question": question, "stream": True}, stream=stream)
res = self.post(f"/chat/{self.chat_id}/completion",
{"question": question, "stream": True,"session_id":self.id}, stream=stream)
for line in res.iter_lines():
line = line.decode("utf-8")
if line.startswith("{"):
self.term_similarity = None
self.positions = None
super().__init__(rag, res_dict)
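
For reference, the completion request the session module now sends can be reproduced directly. This is a rough sketch with `requests`, assuming the `/api/v1` prefix and Bearer-token header used by the SDK; the server address, API key, chat ID, and session ID are placeholders.

```python
import json
import requests

BASE_URL = "http://xxx.xx.xx.xxx:9380/api/v1"   # assumed API prefix
API_KEY = "xxxxxx"                              # placeholder
CHAT_ID = "chat_id_placeholder"
SESSION_ID = "session_id_placeholder"

resp = requests.post(
    f"{BASE_URL}/chat/{CHAT_ID}/completion",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={"question": "What is RAGFlow?", "stream": True, "session_id": SESSION_ID},
    stream=True,
)
for raw in resp.iter_lines():
    line = raw.decode("utf-8")
    # Mirroring the SDK: lines starting with "{" are treated as JSON payloads.
    if line.startswith("{"):
        print(json.loads(line))
```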

+ 18
- 93
sdk/python/ragflow/ragflow.py View File

raise Exception(res["message"]) raise Exception(res["message"])





def async_parse_documents(self, doc_ids):
"""
Asynchronously start parsing multiple documents without waiting for completion.

:param doc_ids: A list containing multiple document IDs.
"""
try:
if not doc_ids or not isinstance(doc_ids, list):
raise ValueError("doc_ids must be a non-empty list of document IDs")

data = {"document_ids": doc_ids, "run": 1}

res = self.post(f'/doc/run', data)

if res.status_code != 200:
raise Exception(f"Failed to start async parsing for documents: {res.text}")

print(f"Async parsing started successfully for documents: {doc_ids}")

except Exception as e:
print(f"Error occurred during async parsing for documents: {str(e)}")
raise

def async_cancel_parse_documents(self, doc_ids):
"""
Cancel the asynchronous parsing of multiple documents.

:param doc_ids: A list containing multiple document IDs.
"""
try:
if not doc_ids or not isinstance(doc_ids, list):
raise ValueError("doc_ids must be a non-empty list of document IDs")
data = {"document_ids": doc_ids, "run": 2}
res = self.post(f'/doc/run', data)

if res.status_code != 200:
raise Exception(f"Failed to cancel async parsing for documents: {res.text}")

print(f"Async parsing canceled successfully for documents: {doc_ids}")

except Exception as e:
print(f"Error occurred during canceling parsing for documents: {str(e)}")
raise

def retrieval(self,
question,
datasets=None,
documents=None,
offset=0,
limit=6,
similarity_threshold=0.1,
vector_similarity_weight=0.3,
top_k=1024):
"""
Perform document retrieval based on the given parameters.

:param question: The query question.
:param datasets: A list of datasets (optional, as documents may be provided directly).
:param documents: A list of documents (if specific documents are provided).
:param offset: Offset for the retrieval results.
:param limit: Maximum number of retrieval results.
:param similarity_threshold: Similarity threshold.
:param vector_similarity_weight: Weight of vector similarity.
:param top_k: Number of top most similar documents to consider (for pre-filtering or ranking).

Note: This is a hypothetical implementation and may need adjustments based on the actual backend service API.
"""
try:
data = {
"question": question,
"datasets": datasets if datasets is not None else [],
"documents": [doc.id if hasattr(doc, 'id') else doc for doc in
documents] if documents is not None else [],
def retrieve(self, question="",datasets=None,documents=None, offset=1, limit=30, similarity_threshold=0.2,vector_similarity_weight=0.3,top_k=1024,rerank_id:str=None,keyword:bool=False,):
data_params = {
"offset": offset, "offset": offset,
"limit": limit, "limit": limit,
"similarity_threshold": similarity_threshold, "similarity_threshold": similarity_threshold,
"vector_similarity_weight": vector_similarity_weight, "vector_similarity_weight": vector_similarity_weight,
"top_k": top_k, "top_k": top_k,
"knowledgebase_id": datasets, "knowledgebase_id": datasets,
"rerank_id":rerank_id,
"keyword":keyword
}
data_json ={
"question": question,
"datasets": datasets,
"documents": documents
}


# Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
res = self.post(f'/doc/retrieval_test', data)

# Check the response status code
if res.status_code == 200:
res_data = res.json()
if res_data.get("retmsg") == "success":
chunks = []
for chunk_data in res_data["data"].get("chunks", []):
chunk = Chunk(self, chunk_data)
chunks.append(chunk)
return chunks
else:
raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
else:
raise Exception(f"API request failed with status code {res.status_code}")

except Exception as e:
print(f"An error occurred during retrieval: {e}")
raise
res = self.get(f'/retrieval', data_params,data_json)
res = res.json()
if res.get("code") ==0:
chunks=[]
for chunk_data in res["data"].get("chunks"):
chunk=Chunk(self,chunk_data)
chunks.append(chunk)
return chunks
raise Exception(res.get("message"))

+ 17
- 27
sdk/python/test/t_document.py View File

# Check if the retrieved document is of type Document
if isinstance(doc, Document):
# Download the document content and save it to a file
try:
with open("ragflow.txt", "wb+") as file:
file.write(doc.download())
# Print the document object for debugging
print(doc)

# Assert that the download was successful
assert True, "Document downloaded successfully."
except Exception as e:
# If an error occurs, raise an assertion error
assert False, f"Failed to download document, error: {str(e)}"
with open("./ragflow.txt", "wb+") as file:
file.write(doc.download())
# Print the document object for debugging
print(doc)

# Assert that the download was successful
assert True, f"Failed to download document, error: {doc}"
else:
# If the document retrieval fails, assert failure
assert False, f"Failed to get document, error: {doc}"
blob2 = b"Sample document content for ingestion test222."
list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
ds.upload_documents(list_1)
for d in ds.list_docs(keywords="test", offset=0, limit=12):
for d in ds.list_documents(keywords="test", offset=0, limit=12):
assert isinstance(d, Document), "Failed to upload documents" assert isinstance(d, Document), "Failed to upload documents"


def test_delete_documents_in_dataset_with_success(self):
blob1 = b"Sample document content for ingestion test333."
name2 = "Test Document444.txt"
blob2 = b"Sample document content for ingestion test444."
name3 = 'test.txt'
path = 'test_data/test.txt'
rag.create_document(ds, name=name3, blob=open(path, "rb").read())
rag.create_document(ds, name=name1, blob=blob1)
rag.create_document(ds, name=name2, blob=blob2)
for d in ds.list_docs(keywords="document", offset=0, limit=12):
ds.upload_documents([{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}])
for d in ds.list_documents(keywords="document", offset=0, limit=12):
assert isinstance(d, Document)
d.delete()
print(d)
remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
ds.delete_documents([d.id])
remaining_docs = ds.list_documents(keywords="rag", offset=0, limit=12)
assert len(remaining_docs) == 0, "Documents were not properly deleted."


def test_parse_and_cancel_document(self):


# Define the document name and path
name3 = 'westworld.pdf'
path = 'test_data/westworld.pdf'
path = './test_data/westworld.pdf'


# Create a document in the dataset using the file path
rag.create_document(ds, name=name3, blob=open(path, "rb").read())
ds.upload_documents({"name":name3, "blob":open(path, "rb").read()})


# Retrieve the document by name
doc = rag.get_document(name="westworld.pdf")

# Initiate asynchronous parsing
doc.async_parse()
doc = rag.list_documents(name="westworld.pdf")
doc = doc[0]
ds.async_parse_documents(document_ids=[])


# Print message to confirm asynchronous parsing has been initiated
print("Async parsing initiated")
