Sfoglia il codice sorgente

refactor(file extractor): file extractor (#1059)

tags/0.3.19
yezhwi 2 anni fa
parent
commit
d33a269548
Nessun account collegato all'indirizzo email del committer

+ 2
- 2
api/controllers/console/datasets/file.py Vedi File

raise FileTooLargeError(message) raise FileTooLargeError(message)


extension = file.filename.split('.')[-1] extension = file.filename.split('.')[-1]
if extension not in ALLOWED_EXTENSIONS:
if extension.lower() not in ALLOWED_EXTENSIONS:
raise UnsupportedFileTypeError() raise UnsupportedFileTypeError()


# user uuid as file name # user uuid as file name


# extract text from file # extract text from file
extension = upload_file.extension extension = upload_file.extension
if extension not in ALLOWED_EXTENSIONS:
if extension.lower() not in ALLOWED_EXTENSIONS:
raise UnsupportedFileTypeError() raise UnsupportedFileTypeError()


text = FileExtractor.load(upload_file, return_text=True) text = FileExtractor.load(upload_file, return_text=True)

+ 7
- 6
api/core/data_loader/file_extractor.py Vedi File

upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]: upload_file: Optional[UploadFile] = None) -> Union[List[Document] | str]:
input_file = Path(file_path) input_file = Path(file_path)
delimiter = '\n' delimiter = '\n'
if input_file.suffix == '.xlsx':
file_extension = input_file.suffix.lower()
if file_extension == '.xlsx':
loader = ExcelLoader(file_path) loader = ExcelLoader(file_path)
elif input_file.suffix == '.pdf':
elif file_extension == '.pdf':
loader = PdfLoader(file_path, upload_file=upload_file) loader = PdfLoader(file_path, upload_file=upload_file)
elif input_file.suffix in ['.md', '.markdown']:
elif file_extension in ['.md', '.markdown']:
loader = MarkdownLoader(file_path, autodetect_encoding=True) loader = MarkdownLoader(file_path, autodetect_encoding=True)
elif input_file.suffix in ['.htm', '.html']:
elif file_extension in ['.htm', '.html']:
loader = HTMLLoader(file_path) loader = HTMLLoader(file_path)
elif input_file.suffix == '.docx':
elif file_extension == '.docx':
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif input_file.suffix == '.csv':
elif file_extension == '.csv':
loader = CSVLoader(file_path, autodetect_encoding=True) loader = CSVLoader(file_path, autodetect_encoding=True)
else: else:
# txt # txt

+ 1
- 1
web/app/components/datasets/create/file-uploader/index.tsx Vedi File

const isValid = useCallback((file: File) => { const isValid = useCallback((file: File) => {
const { size } = file const { size } = file
const ext = `.${getFileType(file)}` const ext = `.${getFileType(file)}`
const isValidType = ACCEPTS.includes(ext)
const isValidType = ACCEPTS.includes(ext.toLowerCase())
if (!isValidType) if (!isValidType)
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') }) notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })



+ 1
- 1
web/app/components/datasets/create/index.tsx Vedi File

) )
} }


export default DatasetUpdateForm
export default DatasetUpdateForm

Loading…
Annulla
Salva