Browse Source

Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.9.0
H 1 year ago
parent
commit
ac7a0d4fbf
No account linked to committer's email address

+ 3
- 1
api/apps/api_app.py View File

@@ -335,6 +335,8 @@ def upload():
doc["parser_id"] = request.form.get("parser_id").strip()
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
@@ -581,4 +583,4 @@ def completion_faq():
return response
except Exception as e:
return server_error_response(e)
return server_error_response(e)

+ 5
- 1
api/apps/dataset_api.py View File

@@ -39,7 +39,7 @@ from api.utils import get_uuid
from api.utils.api_utils import construct_json_result, construct_error_response
from api.utils.api_utils import construct_result, validate_request
from api.utils.file_utils import filename_type, thumbnail
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "table":
table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "audio":
audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case _:
return False


+ 4
- 0
api/apps/document_app.py View File

@@ -105,6 +105,8 @@ def upload():
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
@@ -171,6 +173,8 @@ def web_crawl():
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)

+ 2
- 1
api/db/__init__.py View File

@@ -84,6 +84,7 @@ class ParserType(StrEnum):
NAIVE = "naive"
PICTURE = "picture"
ONE = "one"
AUDIO = "audio"
class FileSource(StrEnum):
@@ -96,4 +97,4 @@ class CanvasType(StrEnum):
ChatBot = "chatbot"
DocBot = "docbot"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"

+ 3
- 1
api/db/init_data.py View File

@@ -121,6 +121,8 @@ def init_llm_factory():
LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
LLMService.filter_delete([LLMService.model.fid == "QAnything"])
TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
TenantService.filter_update([1 == 1], {
"parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
## insert openai two embedding models to the current openai user.
print("Start to insert 2 OpenAI embedding models...")
tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -143,7 +145,7 @@ def init_llm_factory():
"""
drop table llm;
drop table llm_factories;
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
alter table knowledgebase modify avatar longtext;
alter table user modify avatar longtext;
alter table dialog modify icon longtext;

+ 17
- 1
api/db/services/llm_service.py View File

@@ -15,7 +15,7 @@
#
from api.db.services.user_service import TenantService
from api.settings import database_logger
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
from api.db import LLMType
from api.db.db_models import DB, UserTenant
from api.db.db_models import LLMFactories, LLM, TenantLLM
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
return ChatModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
if llm_type == LLMType.SPEECH2TEXT:
if model_config["llm_factory"] not in Seq2txtModel:
return
return Seq2txtModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"]
)
@classmethod
@DB.connection_context()
def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@@ -207,6 +215,14 @@ class LLMBundle(object):
"Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
return txt
def transcription(self, audio):
    """Transcribe *audio* with the bound speech-to-text model and record token usage.

    Returns the transcribed text; a failed usage update is logged but does not
    prevent the text from being returned.
    """
    text, token_count = self.mdl.transcription(audio)
    usage_recorded = TenantLLMService.increase_usage(
        self.tenant_id, self.llm_type, token_count)
    if not usage_recorded:
        database_logger.error(
            "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
    return text
def chat(self, system, history, gen_conf):
txt, used_tokens = self.mdl.chat(system, history, gen_conf)
if not TenantLLMService.increase_usage(

+ 1
- 1
api/settings.py View File

@@ -131,7 +131,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get(
"parsers",
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio")
# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

+ 42
- 0
rag/app/audio.py View File

@@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import re
import numpy as np

from api.db import LLMType
from rag.nlp import rag_tokenizer
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize


def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Chunk an audio file by transcribing it with a speech-to-text LLM.

    Builds a document dict keyed on the filename (tokenized with and without
    its extension suffix), sends the raw audio bytes to the tenant's
    SPEECH2TEXT model, and tokenizes the transcript into the document.

    :param filename: original file name; its extension is stripped for the title tokens.
    :param binary: raw audio bytes passed straight to the transcription model.
    :param tenant_id: tenant owning the speech-to-text model binding.
    :param lang: language hint; "english" (case-insensitive) switches tokenization to English mode.
    :param callback: optional progress callback ``callback(prog, msg)``; may be None.
    :return: a single-element list with the tokenized document, or [] on failure.
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # is it English
    eng = lang.lower() == "english"
    try:
        # Guard every callback call: the parameter defaults to None, and the
        # original unconditional calls would raise TypeError — including inside
        # the except handler, crashing the error-reporting path itself.
        if callback:
            callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
        ans = seq2txt_mdl.transcription(binary)
        if callback:
            callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
        tokenize(doc, ans, eng)
        return [doc]
    except Exception as e:
        # Best-effort error report; transcription failures yield no chunks.
        if callback:
            callback(prog=-1, msg=str(e))

    return []

+ 1
- 1
rag/app/picture.py View File

@@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.describe(binary)
callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]

+ 2
- 1
rag/svr/task_executor.py View File

@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@@ -68,6 +68,7 @@ FACTORY = {
ParserType.RESUME.value: resume,
ParserType.PICTURE.value: picture,
ParserType.ONE.value: one,
ParserType.AUDIO.value: audio
}



Loading…
Cancel
Save