Kaynağa Gözat

dep: bump pandas from 1.x to 2.x (#4820)

tags/0.6.10
Bowen Liang 1 yıl önce
ebeveyn
işleme
58db719a2c
No account linked to committer's email address

+ 1
- 1
api/core/rag/extractor/csv_extractor.py Dosyayı Görüntüle

docs = [] docs = []
try: try:
# load csv file into pandas dataframe # load csv file into pandas dataframe
df = pd.read_csv(csvfile, error_bad_lines=False, **self.csv_args)
df = pd.read_csv(csvfile, on_bad_lines='skip', **self.csv_args)


# check source column exists # check source column exists
if self.source_column and self.source_column not in df.columns: if self.source_column and self.source_column not in df.columns:

+ 1
- 3
api/requirements.txt Dosyayı Görüntüle

jieba==0.42.1 jieba==0.42.1
celery~=5.3.6 celery~=5.3.6
redis[hiredis]~=5.0.3 redis[hiredis]~=5.0.3
openpyxl==3.1.2
chardet~=5.1.0 chardet~=5.1.0
python-docx~=1.1.0 python-docx~=1.1.0
pypdfium2~=4.17.0 pypdfium2~=4.17.0
huggingface_hub~=0.16.4 huggingface_hub~=0.16.4
transformers~=4.35.0 transformers~=4.35.0
tokenizers~=0.15.0 tokenizers~=0.15.0
pandas==1.5.3
pandas[performance,excel]~=2.2.2
xinference-client==0.9.4 xinference-client==0.9.4
safetensors~=0.4.3 safetensors~=0.4.3
zhipuai==1.0.7 zhipuai==1.0.7
azure-storage-blob==12.13.0 azure-storage-blob==12.13.0
azure-identity==1.15.0 azure-identity==1.15.0
lxml==5.1.0 lxml==5.1.0
xlrd~=2.0.1
pydantic~=1.10.0 pydantic~=1.10.0
pgvecto-rs==0.1.4 pgvecto-rs==0.1.4
firecrawl-py==0.0.5 firecrawl-py==0.0.5

+ 62
- 0
api/tests/unit_tests/libs/test_pandas.py Dosyayı Görüntüle

import pandas as pd


def test_pandas_csv(tmp_path, monkeypatch):
    """Round-trip a small two-column frame through a CSV file.

    The frame is written to ``example.csv`` under ``tmp_path`` without its
    index, read back with ``on_bad_lines='skip'``, and both columns are
    compared against the values that were written.
    """
    # Switch the working directory to tmp_path for the duration of the test.
    monkeypatch.chdir(tmp_path)

    expected = {
        'col1': [1, 2.2, -3.3, 4.0, 5],
        'col2': ['A', 'B', 'C', 'D', 'E'],
    }
    target = tmp_path.joinpath('example.csv')

    # Write the frame to the CSV file.
    pd.DataFrame(expected).to_csv(target, index=False)

    # Read it back and compare the columns by position, not by header name.
    loaded = pd.read_csv(target, on_bad_lines='skip')
    for position, key in enumerate(('col1', 'col2')):
        assert loaded[loaded.columns[position]].to_list() == expected[key]


def test_pandas_xlsx(tmp_path, monkeypatch):
    """Round-trip a small two-column frame through an xlsx workbook.

    The frame is written to ``example.xlsx`` under ``tmp_path`` without its
    index, read back with ``pd.read_excel``, and both columns are compared
    against the values that were written.
    """
    # Switch the working directory to tmp_path for the duration of the test.
    monkeypatch.chdir(tmp_path)

    expected = {
        'col1': [1, 2.2, -3.3, 4.0, 5],
        'col2': ['A', 'B', 'C', 'D', 'E'],
    }
    target = tmp_path.joinpath('example.xlsx')

    # Write the frame to the xlsx file.
    pd.DataFrame(expected).to_excel(target, index=False)

    # Read it back and compare the columns by position, not by header name.
    loaded = pd.read_excel(target)
    for position, key in enumerate(('col1', 'col2')):
        assert loaded[loaded.columns[position]].to_list() == expected[key]


def test_pandas_xlsx_with_sheets(tmp_path, monkeypatch):
    """Round-trip two frames through separate sheets of one xlsx workbook.

    Each frame is written to its own named sheet of
    ``example_with_sheets.xlsx`` under ``tmp_path``; the workbook is then
    reopened and every sheet is read back and compared, column by column,
    against the data that was written to it.
    """
    # Switch the working directory to tmp_path for the duration of the test.
    monkeypatch.chdir(tmp_path)

    sheet1 = 'Sheet1'
    sheet2 = 'Sheet2'
    data1 = {
        'col1': [1, 2, 3, 4, 5],
        'col2': ['A', 'B', 'C', 'D', 'E'],
    }
    data2 = {
        'col1': [6, 7, 8, 9, 10],
        'col2': ['F', 'G', 'H', 'I', 'J'],
    }
    frame1 = pd.DataFrame(data1)
    frame2 = pd.DataFrame(data2)
    workbook_path = tmp_path.joinpath('example_with_sheets.xlsx')

    # Write both frames into one workbook, one sheet each.
    with pd.ExcelWriter(workbook_path) as excel_writer:
        frame1.to_excel(excel_writer, sheet_name=sheet1, index=False)
        frame2.to_excel(excel_writer, sheet_name=sheet2, index=False)

    # Read every sheet back while the workbook handle is still open.
    with pd.ExcelFile(workbook_path) as excel_file:
        for sheet_name, expected in ((sheet1, data1), (sheet2, data2)):
            loaded = pd.read_excel(excel_file, sheet_name=sheet_name)
            assert loaded[loaded.columns[0]].to_list() == expected['col1']
            assert loaded[loaded.columns[1]].to_list() == expected['col2']

Loading…
İptal
Kaydet