You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

schools.py 2.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. # -*- coding: UTF-8 -*-
  2. import os, json,re,copy
  3. import pandas as pd
  4. current_file_path = os.path.dirname(os.path.abspath(__file__))
  5. TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
  6. TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
  7. GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r"))
  8. GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
  9. def loadRank(fnm):
  10. global TBL
  11. TBL["rank"] = 1000000
  12. with open(fnm, "r",encoding='UTF-8') as f:
  13. while True:
  14. l = f.readline()
  15. if not l:break
  16. l = l.strip("\n").split(",")
  17. try:
  18. nm,rk = l[0].strip(),int(l[1])
  19. #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
  20. TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
  21. except Exception as e:
  22. pass
  23. loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
  24. def split(txt):
  25. tks = []
  26. for t in re.sub(r"[ \t]+", " ",txt).split(" "):
  27. if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
  28. re.match(r"[a-zA-Z]", t) and tks:
  29. tks[-1] = tks[-1] + " " + t
  30. else:tks.append(t)
  31. return tks
  32. def select(nm):
  33. global TBL
  34. if not nm:return
  35. if isinstance(nm, list):nm = str(nm[0])
  36. nm = split(nm)[0]
  37. nm = str(nm).lower().strip()
  38. nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
  39. nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
  40. nm = re.sub(r"大学.*学院", "大学", nm)
  41. tbl = copy.deepcopy(TBL)
  42. tbl["hit_alias"] = tbl["alias"].map(lambda x:nm in set(x.split("+")))
  43. res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | (tbl.hit_alias == True))]
  44. if res.empty:return
  45. return json.loads(res.to_json(orient="records"))[0]
  46. def is_good(nm):
  47. global GOOD_SCH
  48. nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
  49. nm = re.sub(r"[''`‘’“”,. &()();;]+", "", nm)
  50. return nm in GOOD_SCH