You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

__init__.py 1.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import os
  2. import re
  3. import tiktoken
  4. def singleton(cls, *args, **kw):
  5. instances = {}
  6. def _singleton():
  7. key = str(cls) + str(os.getpid())
  8. if key not in instances:
  9. instances[key] = cls(*args, **kw)
  10. return instances[key]
  11. return _singleton
  12. from .minio_conn import MINIO
  13. from .es_conn import ELASTICSEARCH
  14. def rmSpace(txt):
  15. txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt)
  16. return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt)
  17. def findMaxDt(fnm):
  18. m = "1970-01-01 00:00:00"
  19. try:
  20. with open(fnm, "r") as f:
  21. while True:
  22. l = f.readline()
  23. if not l:
  24. break
  25. l = l.strip("\n")
  26. if l == 'nan':
  27. continue
  28. if l > m:
  29. m = l
  30. except Exception as e:
  31. print("WARNING: can't find " + fnm)
  32. return m
  33. def findMaxTm(fnm):
  34. m = 0
  35. try:
  36. with open(fnm, "r") as f:
  37. while True:
  38. l = f.readline()
  39. if not l:
  40. break
  41. l = l.strip("\n")
  42. if l == 'nan':
  43. continue
  44. if int(l) > m:
  45. m = int(l)
  46. except Exception as e:
  47. print("WARNING: can't find " + fnm)
  48. return m
  49. def num_tokens_from_string(string: str) -> int:
  50. """Returns the number of tokens in a text string."""
  51. encoding = tiktoken.get_encoding('cl100k_base')
  52. num_tokens = len(encoding.encode(string))
  53. return num_tokens