You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

__init__.py 1.5KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import os
  2. import re
  3. import tiktoken
  4. def singleton(cls, *args, **kw):
  5. instances = {}
  6. def _singleton():
  7. key = str(cls) + str(os.getpid())
  8. if key not in instances:
  9. instances[key] = cls(*args, **kw)
  10. return instances[key]
  11. return _singleton
  12. def rmSpace(txt):
  13. txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
  14. return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
  15. def findMaxDt(fnm):
  16. m = "1970-01-01 00:00:00"
  17. try:
  18. with open(fnm, "r") as f:
  19. while True:
  20. l = f.readline()
  21. if not l:
  22. break
  23. l = l.strip("\n")
  24. if l == 'nan':
  25. continue
  26. if l > m:
  27. m = l
  28. except Exception as e:
  29. pass
  30. return m
  31. def findMaxTm(fnm):
  32. m = 0
  33. try:
  34. with open(fnm, "r") as f:
  35. while True:
  36. l = f.readline()
  37. if not l:
  38. break
  39. l = l.strip("\n")
  40. if l == 'nan':
  41. continue
  42. if int(l) > m:
  43. m = int(l)
  44. except Exception as e:
  45. pass
  46. return m
  47. encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
  48. def num_tokens_from_string(string: str) -> int:
  49. """Returns the number of tokens in a text string."""
  50. num_tokens = len(encoder.encode(string))
  51. return num_tokens