Browse Source

Add ParserType Audio (#1637)

### What problem does this PR solve?

#1514 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.9.0
H 1 year ago
parent
commit
ac7a0d4fbf
No account linked to committer's email address

+ 3
- 1
api/apps/api_app.py View File

@@ -335,6 +335,8 @@ def upload():
doc["parser_id"] = request.form.get("parser_id").strip()
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
@@ -581,4 +583,4 @@ def completion_faq():
return response
except Exception as e:
return server_error_response(e)
return server_error_response(e)

+ 5
- 1
api/apps/dataset_api.py View File

@@ -39,7 +39,7 @@ from api.utils import get_uuid
from api.utils.api_utils import construct_json_result, construct_error_response
from api.utils.api_utils import construct_result, validate_request
from api.utils.file_utils import filename_type, thumbnail
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "table":
table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case "audio":
audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
case _:
return False


+ 4
- 0
api/apps/document_app.py View File

@@ -105,6 +105,8 @@ def upload():
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
@@ -171,6 +173,8 @@ def web_crawl():
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)

+ 2
- 1
api/db/__init__.py View File

@@ -84,6 +84,7 @@ class ParserType(StrEnum):
NAIVE = "naive"
PICTURE = "picture"
ONE = "one"
AUDIO = "audio"
class FileSource(StrEnum):
@@ -96,4 +97,4 @@ class CanvasType(StrEnum):
ChatBot = "chatbot"
DocBot = "docbot"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"

+ 3
- 1
api/db/init_data.py View File

@@ -121,6 +121,8 @@ def init_llm_factory():
LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
LLMService.filter_delete([LLMService.model.fid == "QAnything"])
TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
TenantService.filter_update([1 == 1], {
"parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
## insert openai two embedding models to the current openai user.
print("Start to insert 2 OpenAI embedding models...")
tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -143,7 +145,7 @@ def init_llm_factory():
"""
drop table llm;
drop table llm_factories;
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
alter table knowledgebase modify avatar longtext;
alter table user modify avatar longtext;
alter table dialog modify icon longtext;

+ 17
- 1
api/db/services/llm_service.py View File

@@ -15,7 +15,7 @@
#
from api.db.services.user_service import TenantService
from api.settings import database_logger
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
from api.db import LLMType
from api.db.db_models import DB, UserTenant
from api.db.db_models import LLMFactories, LLM, TenantLLM
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
return ChatModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
if llm_type == LLMType.SPEECH2TEXT:
if model_config["llm_factory"] not in Seq2txtModel:
return
return Seq2txtModel[model_config["llm_factory"]](
model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"]
)
@classmethod
@DB.connection_context()
def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
@@ -207,6 +215,14 @@ class LLMBundle(object):
"Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
return txt
def transcription(self, audio):
    """Transcribe *audio* with the bound speech-to-text model and record token usage.

    Returns the transcribed text; a failed usage update is logged but does not
    prevent the text from being returned.
    """
    text, token_count = self.mdl.transcription(audio)
    usage_recorded = TenantLLMService.increase_usage(
        self.tenant_id, self.llm_type, token_count)
    if not usage_recorded:
        database_logger.error(
            "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
    return text
def chat(self, system, history, gen_conf):
txt, used_tokens = self.mdl.chat(system, history, gen_conf)
if not TenantLLMService.increase_usage(

+ 1
- 1
api/settings.py View File

@@ -131,7 +131,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
API_KEY = LLM.get("api_key", "")
PARSERS = LLM.get(
"parsers",
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
"naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio")
# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

+ 42
- 0
rag/app/audio.py View File

@@ -0,0 +1,42 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import io
import re
import numpy as np

from api.db import LLMType
from rag.nlp import rag_tokenizer
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize


def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Chunk an audio file by transcribing it with a speech-to-text LLM.

    Builds a document dict keyed on the filename (tokenized with and without
    its extension suffix), sends the raw audio bytes to the tenant's
    SPEECH2TEXT model, and tokenizes the transcript into the document.

    :param filename: original file name; its extension is stripped for the title tokens.
    :param binary: raw audio bytes passed straight to the transcription model.
    :param tenant_id: tenant owning the speech-to-text model binding.
    :param lang: language hint; "english" (case-insensitive) switches tokenization to English mode.
    :param callback: optional progress callback ``callback(prog, msg)``; may be None.
    :return: a single-element list with the tokenized document, or [] on failure.
    """
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # is it English
    eng = lang.lower() == "english"
    try:
        # Guard every callback call: the parameter defaults to None, and the
        # original unconditional calls would raise TypeError — including inside
        # the except handler, crashing the error-reporting path itself.
        if callback:
            callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
        ans = seq2txt_mdl.transcription(binary)
        if callback:
            callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
        tokenize(doc, ans, eng)
        return [doc]
    except Exception as e:
        # Best-effort error report; transcription failures yield no chunks.
        if callback:
            callback(prog=-1, msg=str(e))

    return []

+ 1
- 1
rag/app/picture.py View File

@@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.4, "Use CV LLM to describe the picture.")
cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.describe(binary)
callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]

+ 2
- 1
rag/svr/task_executor.py View File

@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
from io import BytesIO
import pandas as pd

from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio

from api.db import LLMType, ParserType
from api.db.services.document_service import DocumentService
@@ -68,6 +68,7 @@ FACTORY = {
ParserType.RESUME.value: resume,
ParserType.PICTURE.value: picture,
ParserType.ONE.value: one,
ParserType.AUDIO.value: audio
}



Loading…
Cancel
Save