Du kannst nicht mehr als 25 Themen auswählen. Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

audio.py 1.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import io
  14. import re
  15. import numpy as np
  16. from api.db import LLMType
  17. from rag.nlp import rag_tokenizer
  18. from api.db.services.llm_service import LLMBundle
  19. from rag.nlp import tokenize
  20. def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
  21. doc = {
  22. "docnm_kwd": filename,
  23. "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  24. }
  25. doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  26. # is it English
  27. eng = lang.lower() == "english" # is_english(sections)
  28. try:
  29. callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
  30. seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
  31. ans = seq2txt_mdl.transcription(binary)
  32. callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
  33. tokenize(doc, ans, eng)
  34. return [doc]
  35. except Exception as e:
  36. callback(prog=-1, msg=str(e))
  37. return []