| 12345678910111213141516171819202122232425262728293031323334353637383940 |
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import re
-
- from api.db import LLMType
- from rag.nlp import rag_tokenizer
- from api.db.services.llm_service import LLMBundle
- from rag.nlp import tokenize
-
-
def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Transcribe an audio file and return it as a single tokenized chunk.

    Args:
        filename: Name of the audio file. It is stored as ``docnm_kwd`` and,
            with a trailing alphabetic extension stripped, tokenized into
            the title fields.
        binary: Audio payload handed unchanged to the speech-to-text model.
        tenant_id: Tenant identifier used to construct the ``LLMBundle``.
        lang: Language name. It is forwarded to the model and compared
            case-insensitively with ``"english"`` to set the English flag
            passed to ``tokenize``.
        callback: Optional progress reporter. It is invoked positionally as
            ``callback(progress, message)`` while work proceeds and as
            ``callback(prog=-1, msg=...)`` on failure. When omitted,
            progress reports are discarded.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``[doc]`` with the populated document dict on success, or ``[]``
        when transcription or tokenization raises.
    """
    if callback is None:
        # The signature advertises callback as optional, yet every path
        # below calls it. Without a stand-in, the default value raised
        # TypeError in the try body and again in the except handler.
        def _ignore_progress(prog=None, msg=""):
            pass

        callback = _ignore_progress

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

    # English flag forwarded to tokenize(); derived from the declared
    # language rather than from inspecting the transcript.
    eng = lang.lower() == "english"
    try:
        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
        ans = seq2txt_mdl.transcription(binary)
        # Only a short prefix of the transcript goes into the progress message.
        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
        # NOTE(review): tokenize() presumably fills doc with the transcript's
        # content/token fields in place -- confirm against rag.nlp.tokenize.
        tokenize(doc, ans, eng)
        return [doc]
    except Exception as e:
        # Best-effort boundary: report the failure through the callback and
        # fall through to an empty result instead of propagating.
        callback(prog=-1, msg=str(e))

    return []
|