| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 | 
							- #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - import re
 - from copy import deepcopy
 - from io import BytesIO
 - from nltk import word_tokenize
 - from openpyxl import load_workbook
 - from rag.nlp import is_english, random_choices, find_codec
 - from rag.nlp import huqie
 - from deepdoc.parser import ExcelParser
 - 
 - 
 - class Excel(ExcelParser):
 -     def __call__(self, fnm, binary=None, callback=None):
 -         if not binary:
 -             wb = load_workbook(fnm)
 -         else:
 -             wb = load_workbook(BytesIO(binary))
 -         total = 0
 -         for sheetname in wb.sheetnames:
 -             total += len(list(wb[sheetname].rows))
 - 
 -         res, fails = [], []
 -         for sheetname in wb.sheetnames:
 -             ws = wb[sheetname]
 -             rows = list(ws.rows)
 -             for i, r in enumerate(rows):
 -                 q, a = "", ""
 -                 for cell in r:
 -                     if not cell.value:
 -                         continue
 -                     if not q:
 -                         q = str(cell.value)
 -                     elif not a:
 -                         a = str(cell.value)
 -                     else:
 -                         break
 -                 if q and a:
 -                     res.append((q, a))
 -                 else:
 -                     fails.append(str(i + 1))
 -                 if len(res) % 999 == 0:
 -                     callback(len(res) *
 -                              0.6 /
 -                              total, ("Extract Q&A: {}".format(len(res)) +
 -                                      (f"{len(fails)} failure, line: %s..." %
 -                                       (",".join(fails[:3])) if fails else "")))
 - 
 -         callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
 -             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 -         self.is_english = is_english(
 -             [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
 -         return res
 - 
 - 
 - def rmPrefix(txt):
 -     return re.sub(
 -         r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
 - 
 - 
 - def beAdoc(d, q, a, eng):
 -     qprefix = "Question: " if eng else "问题:"
 -     aprefix = "Answer: " if eng else "回答:"
 -     d["content_with_weight"] = "\t".join(
 -         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
 -     d["content_ltks"] = huqie.qie(q)
 -     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 -     return d
 - 
 - 
 - def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 -     """
 -         Excel and csv(txt) format files are supported.
 -         If the file is in excel format, there should be 2 column question and answer without header.
 -         And question column is ahead of answer column.
 -         And it's O.K if it has multiple sheets as long as the columns are rightly composed.
 - 
 -         If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
 - 
 -         All the deformed lines will be ignored.
 -         Every pair of Q&A will be treated as a chunk.
 -     """
 -     eng = lang.lower() == "english"
 -     res = []
 -     doc = {
 -         "docnm_kwd": filename,
 -         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
 -     }
 -     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         excel_parser = Excel()
 -         for q, a in excel_parser(filename, binary, callback):
 -             res.append(beAdoc(deepcopy(doc), q, a, eng))
 -         return res
 -     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
 -         callback(0.1, "Start to parse.")
 -         txt = ""
 -         if binary:
 -             encoding = find_codec(binary)
 -             txt = binary.decode(encoding)
 -         else:
 -             with open(filename, "r") as f:
 -                 while True:
 -                     l = f.readline()
 -                     if not l:
 -                         break
 -                     txt += l
 -         lines = txt.split("\n")
 -         #is_english([rmPrefix(l) for l in lines[:100]])
 -         fails = []
 -         for i, line in enumerate(lines):
 -             arr = [l for l in line.split("\t") if len(l) > 1]
 -             if len(arr) != 2:
 -                 fails.append(str(i))
 -                 continue
 -             res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
 -             if len(res) % 999 == 0:
 -                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
 -                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 - 
 -         callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
 -             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
 - 
 -         return res
 - 
 -     raise NotImplementedError(
 -         "Excel and csv(txt) format files are supported.")
 - 
 - 
 - if __name__ == "__main__":
 -     import sys
 - 
 -     def dummy(a, b):
 -         pass
 -     chunk(sys.argv[1], callback=dummy)
 
 
  |