After reading a dataset with `filters`, `dataset.fragments` still lists fragments
for other values of the filtered column. Is this the expected behavior?
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv
# Reproduction script: convert a pipe-delimited CSV into a partitioned Parquet
# dataset, then open that dataset with partition filters and list its fragments.
path_ds = 'path/to/ds/'
path_csv = 'path/to/csv/'

# Column names are auto-generated (f0, f1, f2, ...) instead of being read from
# the file; the partition columns and filters below refer to those names.
read_options = csv.ReadOptions(autogenerate_column_names=True)
parse_options = csv.ParseOptions(delimiter='|')

# Stream the CSV batch by batch and append every batch to the dataset,
# partitioned by the f0 and f2 columns.
with csv.open_csv(
    path_csv,
    parse_options=parse_options,
    read_options=read_options,
) as reader:
    for chunk in reader:
        tbl = pa.Table.from_batches([chunk])
        pq.write_to_dataset(
            tbl,
            root_path=path_ds,
            partition_cols=['f0', 'f2'],
            use_legacy_dataset=False,
        )

# Re-open the dataset, asking only for the f0='01.09.2022' / f2='code1' partition.
temp_dataset = pq.ParquetDataset(
    path_ds,
    use_legacy_dataset=False,
    filters=[('f0', '=', '01.09.2022'), ('f2', '=', 'code1')],
)

# NOTE: in the reported run this list also contained fragments whose f2
# partition value is not 'code1' -- that is the behavior being questioned.
print(temp_dataset.fragments)
>>> [<pyarrow.dataset.ParquetFileFragment path=path/to/ds/f0=01.09.2022/f2=code1/008f64795a3640f3a5cab0273fc287b1-0.parquet partition=[f0=01.09.2022, f2='code1']>,
>>> ...
>>> <pyarrow.dataset.ParquetFileFragment path=path/to/ds/f0=01.09.2022/f2=code2/5c1225fae02a4226b62f3959f6a57cf0-0.parquet partition=[f0=01.09.2022, f2='code2']>,
>>> ...