You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import os
  17. import re
  18. import tempfile
  19. from api.db import LLMType
  20. from api.db.services.llm_service import LLMBundle
  21. from rag.nlp import rag_tokenizer, tokenize
  22. def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
  23. doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
  24. doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  25. # is it English
  26. eng = lang.lower() == "english" # is_english(sections)
  27. try:
  28. _, ext = os.path.splitext(filename)
  29. if not ext:
  30. raise RuntimeError("No extension detected.")
  31. if ext not in [".da", ".wave", ".wav", ".mp3", ".wav", ".aac", ".flac", ".ogg", ".aiff", ".au", ".midi", ".wma", ".realaudio", ".vqf", ".oggvorbis", ".aac", ".ape"]:
  32. raise RuntimeError(f"Extension {ext} is not supported yet.")
  33. tmp_path = ""
  34. with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmpf:
  35. tmpf.write(binary)
  36. tmpf.flush()
  37. tmp_path = os.path.abspath(tmpf.name)
  38. callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
  39. seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
  40. ans = seq2txt_mdl.transcription(tmp_path)
  41. callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
  42. tokenize(doc, ans, eng)
  43. return [doc]
  44. except Exception as e:
  45. callback(prog=-1, msg=str(e))
  46. finally:
  47. if tmp_path and os.path.exists(tmp_path):
  48. try:
  49. os.unlink(tmp_path)
  50. except Exception:
  51. pass
  52. return []