| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 | 
							- #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - 
 - from docx import Document
 - import re
 - import pandas as pd
 - from collections import Counter
 - from rag.nlp import rag_tokenizer
 - from io import BytesIO
 - 
 - 
 - class RAGFlowDocxParser:
 - 
 -     def __extract_table_content(self, tb):
 -         df = []
 -         for row in tb.rows:
 -             df.append([c.text for c in row.cells])
 -         return self.__compose_table_content(pd.DataFrame(df))
 - 
 -     def __compose_table_content(self, df):
 - 
 -         def blockType(b):
 -             patt = [
 -                 ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
 -                 (r"^(20|19)[0-9]{2}年$", "Dt"),
 -                 (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
 -                 ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
 -                 (r"^第*[一二三四1-4]季度$", "Dt"),
 -                 (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
 -                 (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
 -                 ("^[0-9.,+%/ -]+$", "Nu"),
 -                 (r"^[0-9A-Z/\._~-]+$", "Ca"),
 -                 (r"^[A-Z]*[a-z' -]+$", "En"),
 -                 (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
 -                 (r"^.{1}$", "Sg")
 -             ]
 -             for p, n in patt:
 -                 if re.search(p, b):
 -                     return n
 -             tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
 -             if len(tks) > 3:
 -                 if len(tks) < 12:
 -                     return "Tx"
 -                 else:
 -                     return "Lx"
 - 
 -             if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
 -                 return "Nr"
 - 
 -             return "Ot"
 - 
 -         if len(df) < 2:
 -             return []
 -         max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
 -             1, len(df)) for j in range(len(df.iloc[i, :]))])
 -         max_type = max(max_type.items(), key=lambda x: x[1])[0]
 - 
 -         colnm = len(df.iloc[0, :])
 -         hdrows = [0]  # header is not nessesarily appear in the first line
 -         if max_type == "Nu":
 -             for r in range(1, len(df)):
 -                 tys = Counter([blockType(str(df.iloc[r, j]))
 -                               for j in range(len(df.iloc[r, :]))])
 -                 tys = max(tys.items(), key=lambda x: x[1])[0]
 -                 if tys != max_type:
 -                     hdrows.append(r)
 - 
 -         lines = []
 -         for i in range(1, len(df)):
 -             if i in hdrows:
 -                 continue
 -             hr = [r - i for r in hdrows]
 -             hr = [r for r in hr if r < 0]
 -             t = len(hr) - 1
 -             while t > 0:
 -                 if hr[t] - hr[t - 1] > 1:
 -                     hr = hr[t:]
 -                     break
 -                 t -= 1
 -             headers = []
 -             for j in range(len(df.iloc[i, :])):
 -                 t = []
 -                 for h in hr:
 -                     x = str(df.iloc[i + h, j]).strip()
 -                     if x in t:
 -                         continue
 -                     t.append(x)
 -                 t = ",".join(t)
 -                 if t:
 -                     t += ": "
 -                 headers.append(t)
 -             cells = []
 -             for j in range(len(df.iloc[i, :])):
 -                 if not str(df.iloc[i, j]):
 -                     continue
 -                 cells.append(headers[j] + str(df.iloc[i, j]))
 -             lines.append(";".join(cells))
 - 
 -         if colnm > 3:
 -             return lines
 -         return ["\n".join(lines)]
 - 
 -     def __call__(self, fnm, from_page=0, to_page=100000):
 -         self.doc = Document(fnm) if isinstance(
 -             fnm, str) else Document(BytesIO(fnm))
 -         pn = 0 # parsed page
 -         secs = [] # parsed contents
 -         for p in self.doc.paragraphs:
 -             if pn > to_page:
 -                 break
 - 
 -             runs_within_single_paragraph = [] # save runs within the range of pages
 -             for run in p.runs:
 -                 if pn > to_page:
 -                     break
 -                 if from_page <= pn < to_page and p.text.strip():
 -                     runs_within_single_paragraph.append(run.text) # append run.text first
 - 
 -                 # wrap page break checker into a static method
 -                 if 'lastRenderedPageBreak' in run._element.xml:
 -                     pn += 1
 - 
 -             secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
 - 
 -         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
 -         return secs, tbls
 
 
  |