Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

12345678910111213141516171819202122232425262728293031323334353637383940
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import re
  14. from api.db import LLMType
  15. from rag.nlp import rag_tokenizer
  16. from api.db.services.llm_service import LLMBundle
  17. from rag.nlp import tokenize
  def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
      """Transcribe an audio file and wrap the transcript in one document dict.

      The audio bytes are sent to the tenant's speech-to-text model
      (``LLMBundle`` with ``LLMType.SPEECH2TEXT``); the returned text is then
      passed to ``tokenize`` together with the document dict.

      Args:
          filename: Name of the audio file. Stored under ``docnm_kwd``; a
              trailing alphabetic extension (e.g. ``.mp3``) is removed before
              the name is tokenized into ``title_tks``.
          binary: Audio payload handed unchanged to the model's
              ``transcription`` method.
          tenant_id: Tenant identifier used to construct the ``LLMBundle``.
          lang: Language name. Forwarded to the model; the document is treated
              as English only when ``lang.lower() == "english"``.
          callback: Optional progress reporter. It is invoked positionally as
              ``callback(progress, message)`` and, on failure, by keyword as
              ``callback(prog=-1, msg=...)``. When ``None``, progress reports
              are discarded.
          **kwargs: Accepted for interface compatibility; not used here.

      Returns:
          ``[doc]`` on success, or ``[]`` if transcription or tokenization
          raised an exception (the error text is reported through ``callback``).
      """
      if callback is None:
          # The signature advertises the callback as optional, but the body
          # calls it unconditionally; without this fallback a missing callback
          # raised TypeError from inside the except handler as well.
          def callback(prog=None, msg=""):
              pass

      doc = {
          "docnm_kwd": filename,
          # Tokenize the file name without its trailing alphabetic extension.
          "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
      }
      doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

      # The language flag comes from the caller's setting, not from inspecting
      # the transcript.
      eng = lang.lower() == "english"

      try:
          callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
          seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
          ans = seq2txt_mdl.transcription(binary)
          # Only a 32-character preview of the transcript goes into the
          # progress message.
          callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
          # NOTE(review): tokenize presumably fills doc in place with the
          # tokenized transcript -- confirm against rag.nlp.tokenize.
          tokenize(doc, ans, eng)
          return [doc]
      except Exception as e:
          # Best-effort boundary: report the failure (prog=-1) and return no
          # chunks instead of propagating the error to the caller.
          callback(prog=-1, msg=str(e))

      return []