-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Labels
arrow: Changes to the arrow crate. enhancement: Any new improvement worthy of an entry in the changelog.
Description
Is your feature request related to a problem or challenge? Please describe what you are trying to do.
datafusion-python currently errors when calling select count(*) from t when t is a pyarrow.Dataset.
The resulting pyarrow.RecordBatch contains no rows and has a schema with no columns, but it does have num_rows set to the correct number.
Describe the solution you'd like
Support was added to arrow-rs in #1552 for a RecordBatch with zero columns but non zero row count.
I'd like impl FromPyArrow for RecordBatch to use this functionality.
Lines 334 to 392 in b711f23
| impl FromPyArrow for RecordBatch { | |
| fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> { | |
| // Newer versions of PyArrow as well as other libraries with Arrow data implement this | |
| // method, so prefer it over _export_to_c. | |
| // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html | |
| if value.hasattr("__arrow_c_array__")? { | |
| let tuple = value.getattr("__arrow_c_array__")?.call0()?; | |
| if !tuple.is_instance_of::<PyTuple>() { | |
| return Err(PyTypeError::new_err( | |
| "Expected __arrow_c_array__ to return a tuple.", | |
| )); | |
| } | |
| let schema_capsule = tuple.get_item(0)?; | |
| let schema_capsule = schema_capsule.downcast::<PyCapsule>()?; | |
| let array_capsule = tuple.get_item(1)?; | |
| let array_capsule = array_capsule.downcast::<PyCapsule>()?; | |
| validate_pycapsule(schema_capsule, "arrow_schema")?; | |
| validate_pycapsule(array_capsule, "arrow_array")?; | |
| let schema_ptr = unsafe { schema_capsule.reference::<FFI_ArrowSchema>() }; | |
| let ffi_array = unsafe { FFI_ArrowArray::from_raw(array_capsule.pointer() as _) }; | |
| let array_data = unsafe { ffi::from_ffi(ffi_array, schema_ptr) }.map_err(to_py_err)?; | |
| if !matches!(array_data.data_type(), DataType::Struct(_)) { | |
| return Err(PyTypeError::new_err( | |
| "Expected Struct type from __arrow_c_array.", | |
| )); | |
| } | |
| let array = StructArray::from(array_data); | |
| // StructArray does not embed metadata from schema. We need to override | |
| // the output schema with the schema from the capsule. | |
| let schema = Arc::new(Schema::try_from(schema_ptr).map_err(to_py_err)?); | |
| let (_fields, columns, nulls) = array.into_parts(); | |
| assert_eq!( | |
| nulls.map(|n| n.null_count()).unwrap_or_default(), | |
| 0, | |
| "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" | |
| ); | |
| return RecordBatch::try_new(schema, columns).map_err(to_py_err); | |
| } | |
| validate_class("RecordBatch", value)?; | |
| // TODO(kszucs): implement the FFI conversions in arrow-rs for RecordBatches | |
| let schema = value.getattr("schema")?; | |
| let schema = Arc::new(Schema::from_pyarrow_bound(&schema)?); | |
| let arrays = value.getattr("columns")?; | |
| let arrays = arrays | |
| .downcast::<PyList>()? | |
| .iter() | |
| .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?))) | |
| .collect::<PyResult<_>>()?; | |
| let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; | |
| Ok(batch) | |
| } | |
| } |
Additional Context
datafusion-python issue: apache/datafusion-python#800
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
arrow: Changes to the arrow crate. enhancement: Any new improvement worthy of an entry in the changelog.