### What problem does this PR solve?
Add a `.doc` file parser, using Apache Tika.
```
pip install tika
```
```
from tika import parser
from io import BytesIO
def extract_text_from_doc_bytes(doc_bytes):
    file_like_object = BytesIO(doc_bytes)
    parsed = parser.from_buffer(file_like_object)
    return parsed["content"]
```
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---------
Co-authored-by: chrysanthemum-boy <fannc@qq.com>
tags/v0.3.1
| return FileType.PDF.value | return FileType.PDF.value | ||||
| if re.match( | if re.match( | ||||
| r".*\.(docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename): | |||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match( | if re.match( |
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import copy | import copy | ||||
| from tika import parser | |||||
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO | ||||
| random_choices([t for t, _ in sections], k=200))) | random_choices([t for t, _ in sections], k=200))) | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | |||||
| binary = BytesIO(binary) | |||||
| doc_parsed = parser.from_buffer(binary) | |||||
| sections = doc_parsed['content'].split('\n') | |||||
| sections = [(l, "") for l in sections if l] | |||||
| remove_contents_table(sections, eng=is_english( | |||||
| random_choices([t for t, _ in sections], k=200))) | |||||
| callback(0.8, "Finish parsing.") | |||||
| else: | else: | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "file type not supported yet(docx, pdf, txt supported)") | |||||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||||
| make_colon_as_title(sections) | make_colon_as_title(sections) | ||||
| bull = bullets_category( | bull = bullets_category( |
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| import copy | import copy | ||||
| from tika import parser | |||||
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO | ||||
| from docx import Document | from docx import Document | ||||
| sections = txt.split("\n") | sections = txt.split("\n") | ||||
| sections = [l for l in sections if l] | sections = [l for l in sections if l] | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | |||||
| binary = BytesIO(binary) | |||||
| doc_parsed = parser.from_buffer(binary) | |||||
| sections = doc_parsed['content'].split('\n') | |||||
| sections = [l for l in sections if l] | |||||
| callback(0.8, "Finish parsing.") | |||||
| else: | else: | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "file type not supported yet(docx, pdf, txt supported)") | |||||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||||
| # is it English | # is it English | ||||
| eng = lang.lower() == "english" # is_english(sections) | eng = lang.lower() == "english" # is_english(sections) |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from tika import parser | |||||
| from io import BytesIO | from io import BytesIO | ||||
| from docx import Document | from docx import Document | ||||
| import re | import re | ||||
| sections = [(l, "") for l in sections if l] | sections = [(l, "") for l in sections if l] | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | |||||
| binary = BytesIO(binary) | |||||
| doc_parsed = parser.from_buffer(binary) | |||||
| sections = doc_parsed['content'].split('\n') | |||||
| sections = [(l, "") for l in sections if l] | |||||
| callback(0.8, "Finish parsing.") | |||||
| else: | else: | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "file type not supported yet(docx, pdf, txt supported)") | |||||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||||
| chunks = naive_merge( | chunks = naive_merge( | ||||
| sections, parser_config.get( | sections, parser_config.get( |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from tika import parser | |||||
| from io import BytesIO | |||||
| import re | import re | ||||
| from rag.app import laws | from rag.app import laws | ||||
| from rag.nlp import huqie, tokenize, find_codec | from rag.nlp import huqie, tokenize, find_codec | ||||
| sections = [s for s in sections if s] | sections = [s for s in sections if s] | ||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.doc$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | |||||
| binary = BytesIO(binary) | |||||
| doc_parsed = parser.from_buffer(binary) | |||||
| sections = doc_parsed['content'].split('\n') | |||||
| sections = [l for l in sections if l] | |||||
| callback(0.8, "Finish parsing.") | |||||
| else: | else: | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "file type not supported yet(docx, pdf, txt supported)") | |||||
| "file type not supported yet(doc, docx, pdf, txt supported)") | |||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, |
| StrEnum==0.4.15 | StrEnum==0.4.15 | ||||
| sympy==1.12 | sympy==1.12 | ||||
| threadpoolctl==3.3.0 | threadpoolctl==3.3.0 | ||||
| tika==2.6.0 | |||||
| tiktoken==0.6.0 | tiktoken==0.6.0 | ||||
| tokenizers==0.15.2 | tokenizers==0.15.2 | ||||
| torch==2.2.1 | torch==2.2.1 | ||||
| yarl==1.9.4 | yarl==1.9.4 | ||||
| zhipuai==2.0.1 | zhipuai==2.0.1 | ||||
| BCEmbedding | BCEmbedding | ||||
| loguru==0.7.2 | |||||
| loguru==0.7.2 |