Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>tags/0.3.2
| @@ -18,6 +18,7 @@ from controllers.console.setup import setup_required | |||
| from controllers.console.wraps import account_initialization_required | |||
| from core.index.readers.html_parser import HTMLParser | |||
| from core.index.readers.pdf_parser import PDFParser | |||
| from core.index.readers.xlsx_parser import XLSXParser | |||
| from extensions.ext_storage import storage | |||
| from libs.helper import TimestampField | |||
| from extensions.ext_database import db | |||
| @@ -26,7 +27,7 @@ from models.model import UploadFile | |||
| cache = TTLCache(maxsize=None, ttl=30) | |||
| FILE_SIZE_LIMIT = 15 * 1024 * 1024 # 15MB | |||
| ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm'] | |||
| ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx'] | |||
| PREVIEW_WORDS_LIMIT = 3000 | |||
| @@ -133,6 +134,9 @@ class FilePreviewApi(Resource): | |||
| # Use BeautifulSoup to extract text | |||
| parser = HTMLParser() | |||
| text = parser.parse_file(Path(filepath)) | |||
| elif extension == 'xlsx': | |||
| parser = XLSXParser() | |||
| text = parser.parse_file(filepath) | |||
| else: | |||
| # ['txt', 'markdown', 'md'] | |||
| with open(filepath, "rb") as fp: | |||
| @@ -0,0 +1,31 @@ | |||
| from pathlib import Path | |||
| import json | |||
| from typing import Dict | |||
| from openpyxl import load_workbook | |||
| from llama_index.readers.file.base_parser import BaseParser | |||
| from flask import current_app | |||
class XLSXParser(BaseParser):
    """Parser that converts the rows of an XLSX workbook into JSON strings."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration; this parser needs none."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> list:
        """Read every sheet of the workbook and serialise its data rows.

        Within each sheet the first non-empty row is taken as the header;
        every later non-empty row is zipped with that header and dumped as
        one JSON object string.

        Args:
            file: Path of the ``.xlsx`` file to read.
            errors: Unused; kept so the signature stays compatible with
                existing callers.

        Returns:
            A list with one JSON string per data row, in sheet/row order.
        """
        data = []
        # openpyxl opens the archive itself; a separate text-mode open() of
        # the binary file is unnecessary.
        wb = load_workbook(filename=file, read_only=True)
        try:
            for sheet in wb:
                # Each sheet carries its own header row, so start over here
                # instead of reusing the header of the first sheet.
                keys = None
                for row in sheet.iter_rows(values_only=True):
                    if all(v is None for v in row):
                        continue
                    if keys is None:
                        # JSON object keys must be primitives; stringify
                        # anything else (e.g. a date used as a column title).
                        keys = [
                            k if k is None or isinstance(k, (str, int, float, bool))
                            else str(k)
                            for k in row
                        ]
                        continue
                    # default=str is deliberate: date/time cells come back as
                    # datetime objects, which json cannot encode natively.
                    data.append(
                        json.dumps(
                            dict(zip(keys, row)),
                            ensure_ascii=False,
                            default=str,
                        )
                    )
        finally:
            # Read-only workbooks keep the file handle open until closed.
            wb.close()
        return data
| @@ -12,6 +12,8 @@ from llama_index.data_structs import Node | |||
| from llama_index.data_structs.node_v2 import DocumentRelationship | |||
| from llama_index.node_parser import SimpleNodeParser, NodeParser | |||
| from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR | |||
| from llama_index.readers.file.markdown_parser import MarkdownParser | |||
| from core.index.readers.xlsx_parser import XLSXParser | |||
| from core.docstore.dataset_docstore import DatesetDocumentStore | |||
| from core.index.keyword_table_index import KeywordTableIndex | |||
| from core.index.readers.html_parser import HTMLParser | |||
| @@ -250,6 +252,7 @@ class IndexingRunner: | |||
| file_extractor[".html"] = HTMLParser() | |||
| file_extractor[".htm"] = HTMLParser() | |||
| file_extractor[".pdf"] = PDFParser({'upload_file': upload_file}) | |||
| file_extractor[".xlsx"] = XLSXParser() | |||
| loader = SimpleDirectoryReader(input_files=[filepath], file_extractor=file_extractor) | |||
| text_docs = loader.load_data() | |||
| @@ -29,4 +29,5 @@ sentry-sdk[flask]~=1.21.1 | |||
| jieba==0.42.1 | |||
| celery==5.2.7 | |||
| redis~=4.5.4 | |||
| pypdf==3.8.1 | |||
| pypdf==3.8.1 | |||
| openpyxl==3.1.2 | |||