Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

schools.py 2.7KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # Licensed under the Apache License, Version 2.0 (the "License");
  2. # you may not use this file except in compliance with the License.
  3. # You may obtain a copy of the License at
  4. #
  5. # http://www.apache.org/licenses/LICENSE-2.0
  6. #
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. import os
  14. import json
  15. import re
  16. import copy
  17. import pandas as pd
  18. current_file_path = os.path.dirname(os.path.abspath(__file__))
  19. TBL = pd.read_csv(
  20. os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
  21. ).fillna("")
  22. TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
  23. GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r"))
  24. GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])
  25. def loadRank(fnm):
  26. global TBL
  27. TBL["rank"] = 1000000
  28. with open(fnm, "r", encoding="utf-8") as f:
  29. while True:
  30. line = f.readline()
  31. if not line:
  32. break
  33. line = line.strip("\n").split(",")
  34. try:
  35. nm, rk = line[0].strip(), int(line[1])
  36. # assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
  37. TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
  38. except Exception:
  39. pass
  40. loadRank(os.path.join(current_file_path, "res/school.rank.csv"))
  41. def split(txt):
  42. tks = []
  43. for t in re.sub(r"[ \t]+", " ", txt).split():
  44. if (
  45. tks
  46. and re.match(r".*[a-zA-Z]$", tks[-1])
  47. and re.match(r"[a-zA-Z]", t)
  48. and tks
  49. ):
  50. tks[-1] = tks[-1] + " " + t
  51. else:
  52. tks.append(t)
  53. return tks
  54. def select(nm):
  55. global TBL
  56. if not nm:
  57. return
  58. if isinstance(nm, list):
  59. nm = str(nm[0])
  60. nm = split(nm)[0]
  61. nm = str(nm).lower().strip()
  62. nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
  63. nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
  64. nm = re.sub(r"大学.*学院", "大学", nm)
  65. tbl = copy.deepcopy(TBL)
  66. tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
  67. res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
  68. if res.empty:
  69. return
  70. return json.loads(res.to_json(orient="records"))[0]
  71. def is_good(nm):
  72. global GOOD_SCH
  73. nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
  74. nm = re.sub(r"[''`‘’“”,. &()();;]+", "", nm)
  75. return nm in GOOD_SCH