You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

qa.py 4.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. import random
  2. import re
  3. from io import BytesIO
  4. from nltk import word_tokenize
  5. from openpyxl import load_workbook
  6. from rag.parser import is_english, random_choices
  7. from rag.nlp import huqie, stemmer
  8. class Excel(object):
  9. def __call__(self, fnm, binary=None, callback=None):
  10. if not binary:
  11. wb = load_workbook(fnm)
  12. else:
  13. wb = load_workbook(BytesIO(binary))
  14. total = 0
  15. for sheetname in wb.sheetnames:
  16. total += len(list(wb[sheetname].rows))
  17. res, fails = [], []
  18. for sheetname in wb.sheetnames:
  19. ws = wb[sheetname]
  20. rows = list(ws.rows)
  21. for i, r in enumerate(rows):
  22. q, a = "", ""
  23. for cell in r:
  24. if not cell.value:
  25. continue
  26. if not q:
  27. q = str(cell.value)
  28. elif not a:
  29. a = str(cell.value)
  30. else:
  31. break
  32. if q and a:
  33. res.append((q, a))
  34. else:
  35. fails.append(str(i + 1))
  36. if len(res) % 999 == 0:
  37. callback(len(res) *
  38. 0.6 /
  39. total, ("Extract Q&A: {}".format(len(res)) +
  40. (f"{len(fails)} failure, line: %s..." %
  41. (",".join(fails[:3])) if fails else "")))
  42. callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
  43. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  44. self.is_english = is_english(
  45. [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
  46. return res
  47. def rmPrefix(txt):
  48. return re.sub(
  49. r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
  50. def beAdoc(d, q, a, eng):
  51. qprefix = "Question: " if eng else "问题:"
  52. aprefix = "Answer: " if eng else "回答:"
  53. d["content_with_weight"] = "\t".join(
  54. [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
  55. if eng:
  56. d["content_ltks"] = " ".join([stemmer.stem(w)
  57. for w in word_tokenize(q)])
  58. else:
  59. d["content_ltks"] = huqie.qie(q)
  60. d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
  61. return d
  62. def chunk(filename, binary=None, callback=None, **kwargs):
  63. res = []
  64. if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
  65. callback(0.1, "Start to parse.")
  66. excel_parser = Excel()
  67. for q, a in excel_parser(filename, binary, callback):
  68. res.append(beAdoc({}, q, a, excel_parser.is_english))
  69. return res
  70. elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
  71. callback(0.1, "Start to parse.")
  72. txt = ""
  73. if binary:
  74. txt = binary.decode("utf-8")
  75. else:
  76. with open(filename, "r") as f:
  77. while True:
  78. l = f.readline()
  79. if not l:
  80. break
  81. txt += l
  82. lines = txt.split("\n")
  83. eng = is_english([rmPrefix(l) for l in lines[:100]])
  84. fails = []
  85. for i, line in enumerate(lines):
  86. arr = [l for l in line.split("\t") if len(l) > 1]
  87. if len(arr) != 2:
  88. fails.append(str(i))
  89. continue
  90. res.append(beAdoc({}, arr[0], arr[1], eng))
  91. if len(res) % 999 == 0:
  92. callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
  93. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  94. callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
  95. f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
  96. return res
  97. raise NotImplementedError(
  98. "file type not supported yet(pptx, pdf supported)")
  99. if __name__ == "__main__":
  100. import sys
  101. def dummy(a, b):
  102. pass
  103. chunk(sys.argv[1], callback=dummy)