@@ -106,7 +106,9 @@ def github_callback():
         stat_logger.exception(e)
         return redirect("/?error=%s"%str(e))
     user = users[0]
+    user.access_token = get_uuid()
     login_user(user)
+    user.save()
     return redirect("/?auth=%s" % user.get_id())
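This hunk makes the GitHub OAuth callback mint a fresh access token and persist the user before redirecting. A minimal sketch of that flow, assuming `get_uuid()` simply returns a random hex string (the stub `User` class is illustrative, not the project's model):

```python
# Minimal sketch of the callback's new behaviour (stand-in types, not the project's code).
import uuid

class User:
    def __init__(self, user_id):
        self.id = user_id
        self.access_token = None
    def get_id(self):
        return self.id
    def save(self):
        pass  # stand-in for the ORM write

def finish_login(user: User) -> str:
    user.access_token = uuid.uuid1().hex  # assumed equivalent of get_uuid()
    user.save()                           # persist the rotated token before redirecting
    return "/?auth=%s" % user.get_id()
```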
@@ -639,7 +639,7 @@ class HuParser:
             mink = ""
             minv = 1000000000
             for k, bxs in tbls.items():
-                for b in bxs[:10]:
+                for b in bxs:
                     if b.get("layout_type", "").find("caption") >= 0:
                         continue
                     y_dis = self._y_dis(c, b)
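Dropping the `[:10]` cap means every box of every candidate table is measured against `c`, so long tables whose nearest box happens to sit after the tenth entry are no longer skipped. A simplified sketch of the search, assuming boxes carry `top`/`bottom` coordinates and `_y_dis` is a vertical-gap metric along these lines:

```python
def nearest_table_key(c, tbls):
    """Return the key of the table whose (non-caption) box is vertically closest to box c."""
    def y_dis(a, b):
        # assumed stand-in for HuParser._y_dis: gap between vertical centers
        return abs((a["top"] + a["bottom"]) / 2 - (b["top"] + b["bottom"]) / 2)

    mink, minv = "", 1000000000
    for k, bxs in tbls.items():
        for b in bxs:  # previously only bxs[:10] were checked
            if b.get("layout_type", "").find("caption") >= 0:
                continue  # captions should not attract body text
            d = y_dis(c, b)
            if d < minv:
                mink, minv = k, d
    return mink
```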
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
-        # merge chunks with the same bullets
-        self._merge_with_same_bullet()
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         bull = bullets_category([b["text"] for b in self.boxes])
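With the same-bullet pre-merge gone, chunking here leans entirely on `bullets_category`, which picks the numbering style that occurs most often and uses it as the pivot for splitting. A rough sketch of that selection, with illustrative patterns only (the project's own pattern list is longer):

```python
import re
from collections import Counter

# Illustrative bullet/heading patterns; the real bullets_category has a fuller set.
BULLET_PATTERNS = [
    r"^第[零一二三四五六七八九十百0-9]+条",   # "Article N" style common in statutes
    r"^[0-9]+\.",                              # 1. 2. 3.
    r"^\([0-9]+\)",                            # (1) (2)
]

def bullets_category(lines):
    """Return the index of the pattern matching the most lines (-1 if none match)."""
    hits = Counter()
    for ln in lines:
        for i, pat in enumerate(BULLET_PATTERNS):
            if re.match(pat, ln.strip()):
                hits[i] += 1
                break
    return hits.most_common(1)[0][0] if hits else -1
```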
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
         sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
-            sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
         chunks = []
         last_sid = -2
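Two adjustments are folded into this one line: `rows` may now arrive as a single pre-rendered string (e.g. an HTML table) instead of a list of row strings, and the page index is rebased against `from_page` so positions stay correct when only a slice of the PDF was parsed. A small sketch, assuming the position tuples are `(page_index, x0, x1, top, bottom)`:

```python
def table_section(rows, poss, from_page=0):
    # rows: either one pre-rendered table string or a list of row strings
    text = rows if isinstance(rows, str) else rows[0]
    # shift the 0-based page index to 1-based and rebase it on from_page
    positions = [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]
    return (text, -1, positions)  # -1 plays the role of "no section id" for tables
```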
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import re
+from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         All the deformed lines will be ignored.
         Every pair of Q&A will be treated as a chunk.
     """
+    eng = lang.lower() == "english"
     res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
         for q, a in excel_parser(filename, binary, callback):
-            res.append(beAdoc({}, q, a, excel_parser.is_english))
+            res.append(beAdoc(deepcopy(doc), q, a, eng))
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
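The reason for `deepcopy(doc)` rather than `doc` (and for the new `from copy import deepcopy` above) is that `beAdoc` fills in the dict it receives; handing every Q&A pair the same dict would leave all chunks aliasing one object, and handing it `{}` as before dropped the document-level fields entirely. A self-contained illustration with a simplified stand-in for `beAdoc`:

```python
from copy import deepcopy

def beAdoc(d, q, a):  # simplified stand-in: the real helper also tokenizes q/a
    d["content_with_weight"] = "Question: %s\tAnswer: %s" % (q, a)
    return d

doc = {"docnm_kwd": "faq.xlsx", "title_tks": "faq"}
pairs = [("q1", "a1"), ("q2", "a2")]

shared = [beAdoc(doc, q, a) for q, a in pairs]            # every entry is the same dict
copied = [beAdoc(deepcopy(doc), q, a) for q, a in pairs]  # each entry owns its copy

print(shared[0]["content_with_weight"])  # Question: q2\tAnswer: a2  (overwritten)
print(copied[0]["content_with_weight"])  # Question: q1\tAnswer: a1  (preserved)
```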
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 break
             txt += l
         lines = txt.split("\n")
-        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
+        #is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
             if len(arr) != 2:
                 fails.append(str(i))
                 continue
-            res.append(beAdoc({}, arr[0], arr[1], eng))
+            res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
             if len(res) % 999 == 0:
                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@@ -76,6 +76,7 @@ def is_english(texts):
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
     if eng:
         t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
         d["content_ltks"] = " ".join([stemmer.stem(w)
@@ -29,7 +29,7 @@ class EsQueryer:
         for t in arr:
             if not re.match(r"[a-zA-Z]+$", t):
                 e += 1
-        return e * 1. / len(arr) >= 0.8
+        return e * 1. / len(arr) >= 0.7
     @staticmethod
     def rmWWW(txt):
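The method around this hunk (not fully shown) counts whitespace-separated tokens that are not purely alphabetic and switches to the non-English handling once their share reaches the threshold; lowering it from 0.8 to 0.7 makes mixed Chinese/English queries switch over earlier. A small worked example of what changes:

```python
import re

def non_alpha_ratio(txt):
    arr = txt.split()
    return sum(1 for t in arr if not re.match(r"[a-zA-Z]+$", t)) / len(arr)

q = "ragflow 模型 部署 文档"       # 3 of 4 tokens are not purely alphabetic
print(non_alpha_ratio(q))          # 0.75
print(non_alpha_ratio(q) >= 0.8)   # False -- old threshold would keep the English path
print(non_alpha_ratio(q) >= 0.7)   # True  -- new threshold takes the non-English path
```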