-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Description
ARROW-010 & ARROW-011: Parquet Statistics and Metadata API Removals
Problem
Arrow 56.0 removed numerous Parquet statistics and metadata APIs that were deprecated in 54.0.0 and earlier. Our linter doesn't detect usage of these removed functions, so projects that still call them hit compilation failures after upgrading, with no migration guidance.
API Changes Details
All removed in 56.0.0 per cargo-public-api diff:
Statistics API (ARROW-010)
// Statistics methods:
-pub fn parquet::file::statistics::Statistics::distinct_count(&self) -> core::option::Option<u64>
-pub fn parquet::file::statistics::Statistics::has_min_max_set(&self) -> bool
-pub fn parquet::file::statistics::Statistics::has_nulls(&self) -> bool
-pub fn parquet::file::statistics::Statistics::max_bytes(&self) -> &[u8]
-pub fn parquet::file::statistics::Statistics::min_bytes(&self) -> &[u8]
-pub fn parquet::file::statistics::Statistics::null_count(&self) -> u64
// ValueStatistics methods:
-pub fn parquet::file::statistics::ValueStatistics<T>::has_min_max_set(&self) -> bool
-pub fn parquet::file::statistics::ValueStatistics<T>::max(&self) -> &T
-pub fn parquet::file::statistics::ValueStatistics<T>::max_bytes(&self) -> &[u8]
-pub fn parquet::file::statistics::ValueStatistics<T>::min(&self) -> &T
-pub fn parquet::file::statistics::ValueStatistics<T>::min_bytes(&self) -> &[u8]
-pub fn parquet::file::statistics::ValueStatistics<T>::null_count(&self) -> u64
Metadata and Properties API (ARROW-011)
// Schema conversion:
-pub fn parquet::arrow::arrow_to_parquet_schema(schema: &arrow_schema::schema::Schema) -> parquet::errors::Result<parquet::schema::types::SchemaDescriptor>
// Writer properties:
-pub fn parquet::file::properties::WriterProperties::max_statistics_size(&self, col: &parquet::schema::types::ColumnPath) -> usize
-pub fn parquet::file::properties::WriterPropertiesBuilder::set_column_max_statistics_size(self, col: parquet::schema::types::ColumnPath, value: usize) -> Self
-pub fn parquet::file::properties::WriterPropertiesBuilder::set_max_statistics_size(self, value: usize) -> Self
-pub const parquet::file::properties::DEFAULT_MAX_STATISTICS_SIZE: usize
// Metadata constructors:
-pub fn parquet::file::metadata::ParquetMetaData::new_with_page_index(file_metadata: parquet::file::metadata::FileMetaData, row_groups: alloc::vec::Vec<parquet::file::metadata::RowGroupMetaData>, column_index: core::option::Option<parquet::file::metadata::ParquetColumnIndex>, offset_index: core::option::Option<parquet::file::metadata::ParquetOffsetIndex>) -> Self
-pub fn parquet::file::metadata::ColumnChunkMetaDataBuilder::set_file_offset(self, value: i64) -> Self
// Page index functions:
-pub fn parquet::file::page_index::index_reader::read_pages_locations<R: parquet::file::reader::ChunkReader>(reader: &R, chunks: &[parquet::file::metadata::ColumnChunkMetaData]) -> core::result::Result<alloc::vec::Vec<alloc::vec::Vec<parquet::format::PageLocation>>, parquet::errors::ParquetError>
Implementation Task
Create src/rules/arrow_010_parquet_statistics_removed.rs with these specifications:
Statistics Rule Implementation (ARROW-010)
use regex::Regex;
use std::path::Path;
use crate::output::{Issue, Severity};
use crate::rules::Rule;
/// ARROW-010: Detect removed Parquet Statistics API usage
///
/// Arrow 56.0 removed deprecated Statistics and ValueStatistics methods
/// that were deprecated in 54.0.0 and earlier versions.
///
/// The rule carries no state, so it is freely copyable and constructible
/// via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Arrow010Rule;
impl Rule for Arrow010Rule {
/// Returns the identifier under which this rule reports its issues.
fn rule_id(&self) -> &'static str {
    const RULE_ID: &str = "ARROW-010";
    RULE_ID
}
fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
let mut issues = Vec::new();
let removed_stats_methods = vec\![
(r"\.distinct_count\s*\(\s*\)", "Use statistics.distinct_count_opt() or handle the Option return type."),
(r"\.has_min_max_set\s*\(\s*\)", "Check if statistics.min_opt() and statistics.max_opt() are Some instead."),
(r"\.has_nulls\s*\(\s*\)", "Use statistics.null_count_opt().map(|c| c > 0).unwrap_or(false) instead."),
(r"\.max_bytes\s*\(\s*\)", "Use statistics.max_bytes_opt().unwrap_or(&[]) for the same behavior."),
(r"\.min_bytes\s*\(\s*\)", "Use statistics.min_bytes_opt().unwrap_or(&[]) for the same behavior."),
(r"\.null_count\s*\(\s*\)", "Use statistics.null_count_opt().unwrap_or(0) instead."),
];
for (line_num, line) in content.lines().enumerate() {
// Only check lines that might be using Parquet statistics
if line.contains("Statistics") || line.contains("ValueStatistics") ||
line.contains("statistics") || line.contains("stats") {
for (pattern, suggestion) in &removed_stats_methods {
let regex = Regex::new(pattern)?;
if let Some(mat) = regex.find(line) {
issues.push(Issue {
rule_id: self.rule_id().to_string(),
severity: Severity::Error,
message: format\!("Parquet Statistics method removed in Arrow 56.0: {}",
line[mat.start()+1..mat.end()].trim_end_matches(['(', ')', ' '])),
file_path: file_path.to_path_buf(),
line: line_num + 1,
column: mat.start() + 1,
suggestion: Some(suggestion.to_string()),
auto_fixable: false, // Return types changed to Option
deprecated_since: Some("54.0.0 (removed in 56.0.0)".to_string()),
changelog_url: Some("https://github.com/apache/arrow-rs/blob/main/CHANGELOG.md#560".to_string()),
migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-statistics-api".to_string()),
});
break;
}
}
}
}
Ok(issues)
}
fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
Ok(Vec::new())
}
}
Metadata Rule Implementation (ARROW-011)
Create src/rules/arrow_011_parquet_metadata_removed.rs:
use regex::Regex;
use std::path::Path;
use crate::output::{Issue, Severity};
use crate::rules::Rule;
/// ARROW-011: Detect removed Parquet metadata and properties APIs
///
/// Arrow 56.0 removed various deprecated metadata construction and
/// writer property configuration methods.
///
/// The rule carries no state, so it is freely copyable and constructible
/// via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Arrow011Rule;
impl Rule for Arrow011Rule {
/// Returns the identifier under which this rule reports its issues.
fn rule_id(&self) -> &'static str {
    const RULE_ID: &str = "ARROW-011";
    RULE_ID
}
fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
let mut issues = Vec::new();
let removed_functions = vec\![
(r"arrow_to_parquet_schema\s*\(", "Use parquet::arrow::ArrowToParquetSchemaConverter instead."),
(r"\.max_statistics_size\s*\(", "This method was removed. Statistics size is now automatically managed."),
(r"\.set_column_max_statistics_size\s*\(", "Remove this call. Statistics size is now automatically managed per-column."),
(r"\.set_max_statistics_size\s*\(", "Remove this call. Global statistics size limits are no longer configurable."),
(r"DEFAULT_MAX_STATISTICS_SIZE", "This constant was removed. Statistics size is now automatically managed."),
(r"ParquetMetaData::new_with_page_index\s*\(", "Use ParquetMetaData::new() and add page index separately."),
(r"\.set_file_offset\s*\(", "This method was removed from ColumnChunkMetaDataBuilder."),
(r"read_pages_locations\s*\(", "This function was removed. Use the page index APIs directly."),
];
for (line_num, line) in content.lines().enumerate() {
for (pattern, suggestion) in &removed_functions {
let regex = Regex::new(pattern)?;
if let Some(mat) = regex.find(line) {
issues.push(Issue {
rule_id: self.rule_id().to_string(),
severity: Severity::Error,
message: format\!("Parquet metadata/properties API removed in Arrow 56.0: {}",
line[mat.start()..mat.end()].trim_end_matches(['(', ' '])),
file_path: file_path.to_path_buf(),
line: line_num + 1,
column: mat.start() + 1,
suggestion: Some(suggestion.to_string()),
auto_fixable: false,
deprecated_since: Some("54.0.0 (removed in 56.0.0)".to_string()),
changelog_url: Some("https://github.com/apache/arrow-rs/pull/7811".to_string()),
migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-metadata-api".to_string()),
});
break;
}
}
}
Ok(issues)
}
fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
Ok(Vec::new())
}
}
Tests for Both Rules
// In arrow_010 tests:
/// Three removed accessors on three separate lines must yield three
/// error-severity issues; the `fn` signature line mentions `Statistics`
/// but contains no removed call and must not be flagged.
#[test]
fn test_detects_statistics_methods() {
    let rule = Arrow010Rule;
    let content = r#"
fn analyze_stats(stats: &Statistics) {
let count = stats.null_count();
let has_values = stats.has_min_max_set();
let min = stats.min_bytes();
}
"#;
    let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
    assert_eq!(issues.len(), 3);
    assert!(issues.iter().all(|i| matches!(i.severity, Severity::Error)));
}
// In arrow_011 tests:
#[test]
fn test_detects_metadata_functions() {
let rule = Arrow011Rule;
let content = r#"
let schema_desc = arrow_to_parquet_schema(&arrow_schema)?;
let props = WriterPropertiesBuilder::new()
.set_max_statistics_size(1024)
.build()?;
"#;
let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
assert_eq\!(issues.len(), 2);
}Integration Steps
- Add both to src/rules/mod.rs:

pub mod arrow_010_parquet_statistics_removed;
pub mod arrow_011_parquet_metadata_removed;

// In RuleEngine::new():
rules.push(Box::new(arrow_010_parquet_statistics_removed::Arrow010Rule));
rules.push(Box::new(arrow_011_parquet_metadata_removed::Arrow011Rule));
Acceptance Criteria
- ✅ Detects all removed Statistics methods (ARROW-010)
- ✅ Detects all removed metadata/properties APIs (ARROW-011)
- ✅ Provides specific migration guidance with Option types
- ✅ Error severity for compilation failures
- ✅ Non-auto-fixable due to API changes
- ✅ Comprehensive test coverage
Migration Notes
- Statistics: All methods now return Option types for safety
- Properties: Statistics sizing is now automatic
- Metadata: Use modern constructors and builders
- Timeline: Deprecated ≤54.0.0 → Removed 56.0.0
Metadata
Metadata
Assignees
Labels
No labels