選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

__init__.py 2.5KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import os
  17. import re
  18. import tiktoken
  19. from api.utils.file_utils import get_project_base_directory
  20. def singleton(cls, *args, **kw):
  21. instances = {}
  22. def _singleton():
  23. key = str(cls) + str(os.getpid())
  24. if key not in instances:
  25. instances[key] = cls(*args, **kw)
  26. return instances[key]
  27. return _singleton
  28. def rmSpace(txt):
  29. txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
  30. return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)
  31. def findMaxDt(fnm):
  32. m = "1970-01-01 00:00:00"
  33. try:
  34. with open(fnm, "r") as f:
  35. while True:
  36. line = f.readline()
  37. if not line:
  38. break
  39. line = line.strip("\n")
  40. if line == 'nan':
  41. continue
  42. if line > m:
  43. m = line
  44. except Exception:
  45. pass
  46. return m
  47. def findMaxTm(fnm):
  48. m = 0
  49. try:
  50. with open(fnm, "r") as f:
  51. while True:
  52. line = f.readline()
  53. if not line:
  54. break
  55. line = line.strip("\n")
  56. if line == 'nan':
  57. continue
  58. if int(line) > m:
  59. m = int(line)
  60. except Exception:
  61. pass
  62. return m
# Export the project base directory through the TIKTOKEN_CACHE_DIR
# environment variable.
# NOTE(review): this is set before the encoding is created below, presumably
# so tiktoken looks for its encoding files in that directory -- confirm
# against tiktoken's loader.
tiktoken_cache_dir = get_project_base_directory()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
# Model-based lookup, kept for reference:
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
# Module-level encoder shared by num_tokens_from_string() and truncate().
encoder = tiktoken.get_encoding("cl100k_base")
  67. def num_tokens_from_string(string: str) -> int:
  68. """Returns the number of tokens in a text string."""
  69. try:
  70. return len(encoder.encode(string))
  71. except Exception:
  72. return 0
  73. def truncate(string: str, max_len: int) -> str:
  74. """Returns truncated text if the length of text exceed max_len."""
  75. return encoder.decode(encoder.encode(string)[:max_len])