-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Copy link
Labels
Description
Describe the bug
`ArrowWriter` ignores the page size properties when writing to Parquet. It also seems to always write just two pages: the first is a normal-sized page, and all the remaining data ends up in the second page.
To Reproduce
/// Reproduces `ArrowWriter` not honouring the configured data page size limit.
///
/// Builds a single non-nullable `Utf8` column of `NUM_ROWS` random lowercase
/// ASCII strings, each `VALUE_LEN` characters long, and writes it as one record
/// batch with a data page size limit of 512 and a write batch size of 512.
/// The page locations of the first column are then read back, and the test
/// asserts that the column was split into more than two pages.
#[test]
fn arrow_writer_page_size() {
    /// Number of string values written to the column.
    const NUM_ROWS: usize = 100_000;
    /// Length, in ASCII characters (and therefore bytes), of every value.
    const VALUE_LEN: usize = 200;

    let mut rng = thread_rng();
    let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));

    // Size the builder for exactly what is appended below: one slot per row
    // and VALUE_LEN bytes of string data for each of them.
    let mut builder = StringBuilder::with_capacity(NUM_ROWS, NUM_ROWS * VALUE_LEN);
    for _ in 0..NUM_ROWS {
        let value = (0..VALUE_LEN)
            .map(|_| rng.gen_range(b'a'..=b'z') as char)
            .collect::<String>();
        builder.append_value(value);
    }
    let array = Arc::new(builder.finish());
    let batch = RecordBatch::try_new(schema, vec![array]).unwrap();

    let file = tempfile::tempfile().unwrap();

    // An unbounded row group size keeps all rows in row group 0, which is the
    // only row group inspected below.
    let props = WriterProperties::builder()
        .set_max_row_group_size(usize::MAX)
        .set_data_pagesize_limit(512)
        .set_write_batch_size(512)
        .build();

    let mut writer = ArrowWriter::try_new(
        file.try_clone().unwrap(),
        batch.schema(),
        Some(props),
    )
    .expect("Unable to write file");
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // Re-open the written file and collect the page locations of its columns.
    let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap();
    let column = reader.metadata().row_group(0).columns();
    let page_locations = read_pages_locations(&file, column).unwrap();

    // Only one column was written, so index 0 is the "col" column.
    let offset_index = page_locations[0].clone();
    assert!(
        offset_index.len() > 2,
        "Expected more than two pages but got {:#?}",
        offset_index
    );
}
This outputs
thread 'arrow::arrow_writer::tests::arrow_writer_page_size' panicked at 'Expected more than two pages but got [
PageLocation {
offset: 1148953,
compressed_page_size: 9595,
first_row_index: 0,
},
PageLocation {
offset: 1158548,
compressed_page_size: 19251505,
first_row_index: 5632,
},
]'
Expected behavior
The writer should respect the page size properties and write similarly sized pages.
Additional context
Reactions are currently unavailable