# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
- from docx import Document
- import re
- import pandas as pd
- from collections import Counter
- from rag.nlp import rag_tokenizer
- from io import BytesIO
-
-
class RAGFlowDocxParser:
    """Extract paragraph text and flattened table content from a .docx file.

    Calling an instance opens the document with ``Document`` and returns the
    paragraphs as ``(text, style_name)`` tuples together with one textual
    rendering per table.
    """

    # Ordered (compiled pattern, label) pairs used to classify one table cell.
    # The first matching pattern wins, so the order is significant.
    # NOTE(review): the "DT" label differs in case from the "Dt" label used by
    # the other date-like patterns, so such cells are counted separately when
    # voting for the dominant type -- confirm whether that is intended.
    _CELL_PATTERNS = tuple(
        (re.compile(pattern), label)
        for pattern, label in (
            ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^(20|19)[0-9]{2}年$", "Dt"),
            (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
            ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
            (r"^第*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
            (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
            ("^[0-9.,+%/ -]+$", "Nu"),
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
            (r"^.{1}$", "Sg"),
        )
    )

    @staticmethod
    def _cell_text(value):
        """Return the text of one DataFrame cell, mapping missing values to ''.

        Rows of unequal length leave missing entries (``None``/NaN) in the
        frame; ``str()`` would turn those into the literal text "None"/"nan".
        """
        if isinstance(value, str):
            return value
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    @classmethod
    def _block_type(cls, text):
        """Return the short type label for the text of one table cell.

        The text is tested against ``_CELL_PATTERNS`` in order.  When nothing
        matches, the label depends on the tokens (longer than one character)
        produced by ``rag_tokenizer.tokenize``: "Tx" for 4 to 11 tokens, "Lx"
        for 12 or more, "Nr" for a single token tagged "nr", otherwise "Ot".
        """
        if not text:
            # No pattern can match an empty string and there is nothing to
            # tokenize, so use the fall-through label directly.
            return "Ot"
        for pattern, label in cls._CELL_PATTERNS:
            if pattern.search(text):
                return label

        tokens = [
            tok for tok in rag_tokenizer.tokenize(text).split(" ")
            if len(tok) > 1
        ]
        if len(tokens) > 3:
            return "Tx" if len(tokens) < 12 else "Lx"
        if len(tokens) == 1 and rag_tokenizer.tag(tokens[0]) == "nr":
            return "Nr"
        return "Ot"

    @staticmethod
    def _has_rendered_page_break(run):
        """Tell whether the run's XML contains 'lastRenderedPageBreak'."""
        return 'lastRenderedPageBreak' in run._element.xml

    def __extract_table_content(self, tb):
        """Render one table as text.

        Args:
            tb: object exposing ``rows``; each row exposes ``cells`` and each
                cell exposes ``text``.

        Returns:
            The list of strings built by ``__compose_table_content``.
        """
        rows = [[cell.text for cell in row.cells] for row in tb.rows]
        return self.__compose_table_content(pd.DataFrame(rows))

    def __compose_table_content(self, df):
        """Flatten a table into "header: value" strings.

        Row 0 is always treated as a header row.  When the most frequent cell
        type among the remaining rows is "Nu", every later row whose own most
        frequent type differs is treated as a header row as well.  Each data
        row is rendered as ``"<header>: <value>"`` items joined by ";", using
        the closest run of consecutive header rows above it; empty cells are
        skipped.

        Args:
            df: pandas DataFrame holding the cell texts, one table row per
                frame row.

        Returns:
            ``[]`` when the frame has fewer than two rows or no columns; one
            string per data row when the table has more than 3 columns;
            otherwise a single-element list with all rows joined by newlines.
        """
        n_rows = len(df)
        if n_rows < 2:
            return []
        n_cols = df.shape[1]
        if n_cols == 0:
            # Nothing to classify or render; without this guard max() below
            # would raise ValueError on an empty Counter.
            return []

        # Stringify every cell once instead of indexing the frame repeatedly.
        grid = [
            [self._cell_text(value) for value in row]
            for row in df.itertuples(index=False, name=None)
        ]

        # Classify every non-first-row cell once; reused for the per-row vote.
        body_types = [[self._block_type(cell) for cell in row]
                      for row in grid[1:]]
        overall = Counter(label for row in body_types for label in row)
        max_type = max(overall.items(), key=lambda kv: kv[1])[0]

        header_rows = [0]  # the header does not necessarily appear only in the first line
        if max_type == "Nu":
            for r, row_types in enumerate(body_types, start=1):
                dominant = max(Counter(row_types).items(),
                               key=lambda kv: kv[1])[0]
                if dominant != max_type:
                    header_rows.append(r)
        header_set = set(header_rows)

        label_cache = {}  # header run -> per-column "header: " prefixes
        lines = []
        for i in range(1, n_rows):
            if i in header_set:
                continue

            # Header rows above this data row (row 0 is always among them).
            above = [r for r in header_rows if r < i]
            # Keep only the trailing run of consecutive header rows.
            start = len(above) - 1
            while start > 0 and above[start] - above[start - 1] <= 1:
                start -= 1
            run = tuple(above[start:])

            labels = label_cache.get(run)
            if labels is None:
                labels = []
                for j in range(n_cols):
                    # Order-preserving de-duplication of stacked header texts.
                    parts = dict.fromkeys(grid[r][j].strip() for r in run)
                    label = ",".join(parts)
                    labels.append(label + ": " if label else label)
                label_cache[run] = labels

            cells = [labels[j] + text
                     for j, text in enumerate(grid[i]) if text]
            lines.append(";".join(cells))

        if n_cols > 3:
            return lines
        return ["\n".join(lines)]

    def __call__(self, fnm, from_page=0, to_page=100000000):
        """Parse a .docx document into paragraph sections and table texts.

        The opened document is stored on ``self.doc``.  A page counter starts
        at 0 and is incremented after each run whose XML contains
        'lastRenderedPageBreak'.  Run text is collected only while
        ``from_page <= counter < to_page`` and the paragraph has non-blank
        text; iteration stops once the counter exceeds ``to_page``.

        Args:
            fnm: a ``str`` is passed to ``Document`` as is; anything else is
                wrapped in ``BytesIO`` first.
            from_page: first page counter value whose text is kept.
            to_page: page counter value at which text is no longer kept.

        Returns:
            ``(secs, tbls)``: ``secs`` holds one ``(text, style_name)`` tuple
            per visited paragraph (the text is empty when nothing was
            collected for it); ``tbls`` holds one rendering per table in the
            document.
        """
        self.doc = Document(fnm) if isinstance(
            fnm, str) else Document(BytesIO(fnm))
        pn = 0  # parsed page
        secs = []  # parsed contents
        for p in self.doc.paragraphs:
            if pn > to_page:
                break

            # Evaluate the paragraph text once rather than once per run.
            has_text = bool(p.text.strip())
            kept_runs = []  # run texts that fall inside the page window
            for run in p.runs:
                if pn > to_page:
                    break
                if has_text and from_page <= pn < to_page:
                    kept_runs.append(run.text)
                if self._has_rendered_page_break(run):
                    pn += 1

            style_name = p.style.name if hasattr(p.style, 'name') else ''
            secs.append(("".join(kept_runs), style_name))

        tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
        return secs, tbls
|