| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import re
- from copy import deepcopy
- from io import BytesIO
- from nltk import word_tokenize
- from openpyxl import load_workbook
- from rag.nlp import is_english, random_choices, find_codec
- from rag.nlp import rag_tokenizer
- from deepdoc.parser import ExcelParser
-
-
class Excel(ExcelParser):
    """Reads question/answer pairs out of an Excel workbook."""

    def __call__(self, fnm, binary=None, callback=None):
        """
        Extract (question, answer) pairs from every sheet of a workbook.

        In each row the first two non-empty cells are taken as the question
        and the answer, in that order; any further cells are ignored. Rows
        with fewer than two non-empty cells are recorded as failures and
        skipped.

        Args:
            fnm: Path of the workbook. Only used when `binary` is empty.
            binary: Raw workbook bytes. When given, they are parsed instead
                of reading `fnm`.
            callback: Optional progress hook, called as
                `callback(progress, message)`. Progress is reported on a
                0 to 0.6 scale. When omitted, nothing is reported.

        Returns:
            A list of `(question, answer)` string tuples, in sheet and row
            order.

        Side effects:
            Sets `self.is_english` to the result of `is_english` applied to
            the questions picked by `random_choices(res, k=30)`, with their
            leading labels removed by `rmPrefix`.
        """
        if callback is None:
            # The parameter defaults to None; fall back to a no-op so the
            # parser can be used without a progress hook.
            def callback(prog, msg):
                pass

        wb = load_workbook(BytesIO(binary)) if binary else load_workbook(fnm)

        # Materialise each sheet's rows once; the row total is the
        # denominator for progress reporting.
        sheets = [list(wb[name].rows) for name in wb.sheetnames]
        total = sum(len(rows) for rows in sheets)

        res, fails = [], []

        def status():
            # One message format for both intermediate and final reports.
            msg = "Extract Q&A: {}. ".format(len(res))
            if fails:
                msg += "{} failure, line: {}...".format(
                    len(fails), ",".join(fails[:3]))
            return msg

        for rows in sheets:
            for i, row in enumerate(rows):
                pair = []
                for cell in row:
                    # Only genuinely empty cells are skipped, so values such
                    # as 0 or False still count as content.
                    if cell.value is None or cell.value == "":
                        continue
                    pair.append(str(cell.value))
                    if len(pair) == 2:
                        break

                if len(pair) == 2:
                    res.append((pair[0], pair[1]))
                    # Report only when a new pair lands on a multiple of 999,
                    # not on every failing row while the count stands still.
                    if len(res) % 999 == 0:
                        callback(len(res) * 0.6 / total, status())
                else:
                    # 1-based row index within the current sheet.
                    fails.append(str(i + 1))

        callback(0.6, status())
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
        return res
-
-
def rmPrefix(txt):
    """
    Strip a leading question/answer label from a piece of text.

    The text is trimmed of surrounding whitespace first. A label is one of
    the words in the alternation below (matched case-insensitively) followed
    by one or more separators: tab, ASCII colon, full-width colon (U+FF1A)
    or space.

    Note that a plain space counts as a separator, so text that starts with
    the word "A " or "Q " also loses that word.

    Args:
        txt: The text to clean.

    Returns:
        The trimmed text with the leading label removed, or the trimmed text
        unchanged when it does not start with a label.
    """
    # \uFF1A is the full-width colon commonly used after Chinese labels.
    return re.sub(
        r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:\uFF1A ]+",
        "", txt.strip(), flags=re.IGNORECASE)
-
-
def beAdoc(d, q, a, eng):
    """
    Fill the chunk fields of `d` from one question/answer pair.

    Writes three keys into `d`:
      - "content_with_weight": the labelled question and answer, joined by a
        tab, with any pre-existing label removed by `rmPrefix`.
      - "content_ltks": `rag_tokenizer.tokenize` applied to the raw question.
      - "content_sm_ltks": `rag_tokenizer.fine_grained_tokenize` applied to
        the "content_ltks" value.

    Args:
        d: Mapping to populate; it is modified in place.
        q: Question text.
        a: Answer text.
        eng: When true, English labels are used; otherwise Chinese ones.

    Returns:
        The same `d` that was passed in.
    """
    if eng:
        question_label, answer_label = "Question: ", "Answer: "
    else:
        question_label, answer_label = "问题:", "回答:"

    labelled_question = question_label + rmPrefix(q)
    labelled_answer = answer_label + rmPrefix(a)
    d["content_with_weight"] = "\t".join((labelled_question, labelled_answer))

    # Tokens come from the raw question, not the labelled text.
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
        d["content_ltks"])
    return d
-
-
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
    """
    Split a Q&A file into chunks, one chunk per question/answer pair.

    Excel and csv(txt) format files are supported.
    If the file is in Excel format, it should have two columns, question and
    answer, without a header row, with the question column ahead of the
    answer column. Multiple sheets are fine as long as the columns are laid
    out the same way.

    If it is in csv/txt format, it should be UTF-8 encoded, with TAB as the
    delimiter between question and answer.

    All malformed lines are ignored.
    Every pair of Q&A is treated as a chunk.

    Args:
        filename: File name; its extension selects the parser. It is also
            stored in each chunk as "docnm_kwd".
        binary: Raw file content. When empty, `filename` is read from disk.
        lang: "English" (case-insensitive) selects English labels; any other
            value selects Chinese labels.
        callback: Optional progress hook, called as
            `callback(progress, message)`. When omitted, nothing is reported.
        **kwargs: Accepted and ignored.

    Returns:
        A list of dicts, each a copy of the shared document fields
        ("docnm_kwd", "title_tks") filled in by `beAdoc`.

    Raises:
        NotImplementedError: If the file extension is not xls/xlsx/txt/csv.
    """
    if callback is None:
        # The parameter defaults to None; use a no-op so callers may omit it.
        def callback(prog, msg):
            pass

    eng = lang.lower() == "english"
    res = []
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }

    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
        for q, a in excel_parser(filename, binary, callback):
            res.append(beAdoc(deepcopy(doc), q, a, eng))
        return res

    if re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if binary:
            txt = binary.decode(find_codec(binary))
        else:
            # The documented input encoding is UTF-8; "utf-8-sig" also drops
            # a leading byte-order mark if the file has one.
            with open(filename, "r", encoding="utf-8-sig") as f:
                txt = f.read()

        lines = txt.split("\n")
        fails = []

        def status():
            # One message format for both intermediate and final reports.
            msg = "Extract Q&A: {}. ".format(len(res))
            if fails:
                msg += "{} failure, line: {}...".format(
                    len(fails), ",".join(fails[:3]))
            return msg

        for i, line in enumerate(lines):
            # Drop the carriage return left behind by CRLF line endings so it
            # does not end up inside the answer.
            line = line.rstrip("\r")
            if not line.strip():
                # Blank lines (including the empty tail after a final
                # newline) are skipped without being counted as failures.
                continue
            # Fields of length <= 1 are discarded; this also removes the
            # empty fields produced by consecutive tabs.
            fields = [field for field in line.split("\t") if len(field) > 1]
            if len(fields) != 2:
                # 1-based line number, matching the Excel parser's reports.
                fails.append(str(i + 1))
                continue
            res.append(beAdoc(deepcopy(doc), fields[0], fields[1], eng))
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), status())

        callback(0.6, status())
        return res

    raise NotImplementedError(
        "Excel and csv(txt) format files are supported.")
-
-
if __name__ == "__main__":
    # Command-line entry point: chunk the file named by the first argument
    # and discard all progress notifications.
    import sys

    def dummy(progress, message):
        """Ignore a progress notification."""
        return None

    input_path = sys.argv[1]
    chunk(input_path, callback=dummy)
|