|
|
|
@@ -40,6 +40,7 @@ from api.utils.api_utils import get_json_result |
|
|
|
from rag.utils.minio_conn import MINIO
|
|
|
|
from api.utils.file_utils import filename_type, thumbnail
|
|
|
|
from api.utils.web_utils import html2pdf, is_valid_url
|
|
|
|
from api.utils.web_utils import html2pdf, is_valid_url
|
|
|
|
|
|
|
|
|
|
|
|
@manager.route('/upload', methods=['POST'])
|
|
|
|
@@ -117,6 +118,68 @@ def upload(): |
|
|
|
return get_json_result(data=True)
|
|
|
|
|
|
|
|
|
|
|
|
@manager.route('/web_crawl', methods=['POST'])
|
|
|
|
@login_required
|
|
|
|
@validate_request("kb_id", "name", "url")
|
|
|
|
def web_crawl():
|
|
|
|
kb_id = request.form.get("kb_id")
|
|
|
|
if not kb_id:
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
name = request.form.get("name")
|
|
|
|
url = request.form.get("url")
|
|
|
|
if not is_valid_url(url):
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
|
|
|
if not e:
|
|
|
|
raise LookupError("Can't find this knowledgebase!")
|
|
|
|
|
|
|
|
blob = html2pdf(url)
|
|
|
|
if not blob: return server_error_response(ValueError("Download failure."))
|
|
|
|
|
|
|
|
root_folder = FileService.get_root_folder(current_user.id)
|
|
|
|
pf_id = root_folder["id"]
|
|
|
|
FileService.init_knowledgebase_docs(pf_id, current_user.id)
|
|
|
|
kb_root_folder = FileService.get_kb_folder(current_user.id)
|
|
|
|
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
|
|
|
|
|
|
|
|
try:
|
|
|
|
filename = duplicate_name(
|
|
|
|
DocumentService.query,
|
|
|
|
name=name+".pdf",
|
|
|
|
kb_id=kb.id)
|
|
|
|
filetype = filename_type(filename)
|
|
|
|
if filetype == FileType.OTHER.value:
|
|
|
|
raise RuntimeError("This type of file has not been supported yet!")
|
|
|
|
|
|
|
|
location = filename
|
|
|
|
while MINIO.obj_exist(kb_id, location):
|
|
|
|
location += "_"
|
|
|
|
MINIO.put(kb_id, location, blob)
|
|
|
|
doc = {
|
|
|
|
"id": get_uuid(),
|
|
|
|
"kb_id": kb.id,
|
|
|
|
"parser_id": kb.parser_id,
|
|
|
|
"parser_config": kb.parser_config,
|
|
|
|
"created_by": current_user.id,
|
|
|
|
"type": filetype,
|
|
|
|
"name": filename,
|
|
|
|
"location": location,
|
|
|
|
"size": len(blob),
|
|
|
|
"thumbnail": thumbnail(filename, blob)
|
|
|
|
}
|
|
|
|
if doc["type"] == FileType.VISUAL:
|
|
|
|
doc["parser_id"] = ParserType.PICTURE.value
|
|
|
|
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
|
|
|
doc["parser_id"] = ParserType.PRESENTATION.value
|
|
|
|
DocumentService.insert(doc)
|
|
|
|
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
|
|
|
except Exception as e:
|
|
|
|
return server_error_response(e)
|
|
|
|
return get_json_result(data=True)
|
|
|
|
|
|
|
|
|
|
|
|
@manager.route('/create', methods=['POST'])
|
|
|
|
@login_required
|
|
|
|
@validate_request("name", "kb_id")
|
|
|
|
@@ -417,69 +480,3 @@ def get_image(image_id): |
|
|
|
return response
|
|
|
|
except Exception as e:
|
|
|
|
return server_error_response(e)
|
|
|
|
|
|
|
|
|
|
|
|
@manager.route('/web_crawl', methods=['POST'])
|
|
|
|
@login_required
|
|
|
|
def web_crawl():
|
|
|
|
kb_id = request.form.get("kb_id")
|
|
|
|
if not kb_id:
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
name = request.form.get("name")
|
|
|
|
url = request.form.get("url")
|
|
|
|
if not name:
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
if not url:
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
if not is_valid_url(url):
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
|
|
|
|
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
|
|
|
if not e:
|
|
|
|
raise LookupError("Can't find this knowledgebase!")
|
|
|
|
|
|
|
|
root_folder = FileService.get_root_folder(current_user.id)
|
|
|
|
pf_id = root_folder["id"]
|
|
|
|
FileService.init_knowledgebase_docs(pf_id, current_user.id)
|
|
|
|
kb_root_folder = FileService.get_kb_folder(current_user.id)
|
|
|
|
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
|
|
|
|
|
|
|
|
try:
|
|
|
|
filename = duplicate_name(
|
|
|
|
DocumentService.query,
|
|
|
|
name=name+".pdf",
|
|
|
|
kb_id=kb.id)
|
|
|
|
filetype = filename_type(filename)
|
|
|
|
if filetype == FileType.OTHER.value:
|
|
|
|
raise RuntimeError("This type of file has not been supported yet!")
|
|
|
|
|
|
|
|
location = filename
|
|
|
|
while MINIO.obj_exist(kb_id, location):
|
|
|
|
location += "_"
|
|
|
|
blob = html2pdf(url)
|
|
|
|
MINIO.put(kb_id, location, blob)
|
|
|
|
doc = {
|
|
|
|
"id": get_uuid(),
|
|
|
|
"kb_id": kb.id,
|
|
|
|
"parser_id": kb.parser_id,
|
|
|
|
"parser_config": kb.parser_config,
|
|
|
|
"created_by": current_user.id,
|
|
|
|
"type": filetype,
|
|
|
|
"name": filename,
|
|
|
|
"location": location,
|
|
|
|
"size": len(blob),
|
|
|
|
"thumbnail": thumbnail(filename, blob)
|
|
|
|
}
|
|
|
|
if doc["type"] == FileType.VISUAL:
|
|
|
|
doc["parser_id"] = ParserType.PICTURE.value
|
|
|
|
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
|
|
|
doc["parser_id"] = ParserType.PRESENTATION.value
|
|
|
|
DocumentService.insert(doc)
|
|
|
|
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
|
|
|
|
except Exception as e:
|
|
|
|
return get_json_result(
|
|
|
|
data=False, retmsg=e, retcode=RetCode.SERVER_ERROR)
|
|
|
|
return get_json_result(data=True)
|