### What problem does this PR solve?
Add a `.doc` file parser, using the `tika` Python client for Apache Tika:
```
pip install tika
```
Text can then be extracted from raw `.doc` bytes:
```python
from io import BytesIO

from tika import parser


def extract_text_from_doc_bytes(doc_bytes):
    # Wrap the raw bytes in a file-like object for Tika.
    file_like_object = BytesIO(doc_bytes)
    parsed = parser.from_buffer(file_like_object)
    # Tika returns a dict; "content" holds the extracted plain text.
    return parsed["content"]
```
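For example (a minimal usage sketch; `example.doc` is a placeholder path):
```python
# Hypothetical usage of the helper above.
with open("example.doc", "rb") as f:
    text = extract_text_from_doc_bytes(f.read())
# "content" may be None if Tika extracts nothing, so guard before slicing.
print(text[:200] if text else "<no text extracted>")
```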
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: chrysanthemum-boy <fannc@qq.com>
The change touches the file-type detector, four `chunk()` implementations, and the requirements file. In the detector, the extension regex now recognizes `.doc`:
```diff
@@ -147,7 +147,7 @@ def filename_type(filename):
         return FileType.PDF.value
 
     if re.match(
-            r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
         return FileType.DOC.value
 
     if re.match(
```
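To illustrate, the extended pattern now classifies legacy `.doc` files as `FileType.DOC` (a standalone sketch, not repository code):
```python
import re

# Pattern copied from the diff above; "doc" is the new alternative.
PATTERN = r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$"

for name in ("report.doc", "report.docx", "report.pdf"):
    print(name, bool(re.match(PATTERN, name)))
# report.doc True, report.docx True, report.pdf False
```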
Each affected `chunk()` implementation gains a `.doc` branch that feeds the binary to Tika:
```diff
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import copy
+from tika import parser
 import re
 from io import BytesIO
@@ -103,9 +104,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             random_choices([t for t, _ in sections], k=200)))
         callback(0.8, "Finish parsing.")
 
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [(l, "") for l in sections if l]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
+        callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError(
-            "file type not supported yet(docx, pdf, txt supported)")
+            "file type not supported yet(doc, docx, pdf, txt supported)")
 
     make_colon_as_title(sections)
     bull = bullets_category(
```
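The same branch recurs in the remaining chunkers below, differing only in whether sections are plain strings or `(text, "")` tuples. Its core is this pattern (a standalone sketch; `doc_to_lines` is a hypothetical helper, with `binary` assumed to hold the raw file bytes):
```python
from io import BytesIO

from tika import parser


def doc_to_lines(binary: bytes) -> list[str]:
    # Tika extracts plain text from the legacy .doc binary format.
    parsed = parser.from_buffer(BytesIO(binary))
    # "content" can be None when nothing is extracted, so guard it.
    content = parsed.get("content") or ""
    # Drop empty lines, mirroring the `if l` filters in the diff.
    return [line for line in content.split("\n") if line]
```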
```diff
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import copy
+from tika import parser
 import re
 from io import BytesIO
 from docx import Document
@@ -123,9 +124,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
+
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [l for l in sections if l]
+        callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError(
-            "file type not supported yet(docx, pdf, txt supported)")
+            "file type not supported yet(doc, docx, pdf, txt supported)")
 
     # is it English
     eng = lang.lower() == "english"  # is_english(sections)
```
```diff
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from tika import parser
 from io import BytesIO
 from docx import Document
 import re
@@ -154,9 +155,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
 
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [(l, "") for l in sections if l]
+        callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError(
-            "file type not supported yet(docx, pdf, txt supported)")
+            "file type not supported yet(doc, docx, pdf, txt supported)")
 
     chunks = naive_merge(
         sections, parser_config.get(
```
```diff
@@ -10,6 +10,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from tika import parser
+from io import BytesIO
 import re
 from rag.app import laws
 from rag.nlp import huqie, tokenize, find_codec
@@ -95,9 +97,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
 
+    elif re.search(r"\.doc$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
+        doc_parsed = parser.from_buffer(binary)
+        sections = doc_parsed['content'].split('\n')
+        sections = [l for l in sections if l]
+        callback(0.8, "Finish parsing.")
+
     else:
         raise NotImplementedError(
-            "file type not supported yet(docx, pdf, txt supported)")
+            "file type not supported yet(doc, docx, pdf, txt supported)")
 
     doc = {
         "docnm_kwd": filename,
```
Finally, the new dependency is pinned in the requirements file:
```diff
@@ -116,6 +116,7 @@ sniffio==1.3.1
 StrEnum==0.4.15
 sympy==1.12
 threadpoolctl==3.3.0
+tika==2.6.0
 tiktoken==0.6.0
 tokenizers==0.15.2
 torch==2.2.1
@@ -133,4 +134,4 @@ xxhash==3.4.1
 yarl==1.9.4
 zhipuai==2.0.1
 BCEmbedding
-loguru==0.7.2
\ No newline at end of file
+loguru==0.7.2
```
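Worth noting: the `tika` package is a thin client that downloads and launches a local Apache Tika server (a Java process) on first use, so a Java runtime must be available wherever these chunkers run.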