-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Closed
Description
ARROW-008: SerializedPageReaderState Type Changes (Parquet)
Problem
Arrow 56.0 changed SerializedPageReaderState.offset and remaining_bytes from usize to u64. This breaks code that directly accesses these fields or depends on usize return types.
API Change Details
Changed in commit 5555d30:
// OLD (55.2.0):
pub struct SerializedPageReaderState {
pub offset: usize,
pub remaining_bytes: usize,
// ...
}
// NEW (56.0.0):
pub struct SerializedPageReaderState {
pub offset: u64,
pub remaining_bytes: u64,
// ...
}
Rationale: WebAssembly compatibility - usize is 32-bit on wasm32, limiting file sizes to 4GB.
Commit: 5555d30b0 - [Parquet] Use u64 for SerializedPageReaderState.offset & remaining_bytes, instead of usize
Implementation Task
Create src/rules/arrow_008_parquet_page_reader_types.rs with these specifications:
Rule Implementation
use regex::Regex;
use std::path::Path;
use crate::output::{Issue, Severity};
use crate::rules::Rule;
/// ARROW-008: Detect SerializedPageReaderState field type assumptions
///
/// Arrow 56.0 changed `offset` and `remaining_bytes` from `usize` to `u64`
/// in `SerializedPageReaderState` for WebAssembly compatibility (`usize` is
/// only 32 bits wide on wasm32).
///
/// This is a stateless unit struct; all detection logic lives in its
/// `Rule` implementation.
pub struct Arrow008Rule;
impl Rule for Arrow008Rule {
/// Returns the stable identifier attached to every issue this rule emits.
fn rule_id(&self) -> &'static str {
    const RULE_ID: &str = "ARROW-008";
    RULE_ID
}
fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
let mut issues = Vec::new();
// Pattern 1: Direct field access
let field_access = Regex::new(r"(\w+)\.(?:offset|remaining_bytes)\b")?;
// Pattern 2: Type annotations expecting usize
let usize_pattern = Regex::new(r":\s*usize\s*=.*\.(?:offset|remaining_bytes)")?;
for (line_num, line) in content.lines().enumerate() {
// Check for direct field access that might assume usize
if let Some(mat) = field_access.find(line) {
// Only flag if it looks like SerializedPageReaderState
if line.contains("SerializedPageReaderState") ||
line.contains("page_reader") ||
line.contains("PageReader") {
issues.push(Issue {
rule_id: self.rule_id().to_string(),
severity: Severity::Warning,
message: "SerializedPageReaderState.offset and .remaining_bytes changed from usize to u64 in Arrow 56.0".to_string(),
file_path: file_path.to_path_buf(),
line: line_num + 1,
column: mat.start() + 1,
suggestion: Some("Verify type compatibility: these fields are now u64. Use explicit casting if needed: field as usize or u64::from(field).".to_string()),
auto_fixable: false,
deprecated_since: Some("56.0.0".to_string()),
changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()),
migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()),
});
}
}
// Check for explicit usize type annotations
if usize_pattern.is_match(line) {
issues.push(Issue {
rule_id: self.rule_id().to_string(),
severity: Severity::Error,
message: "Type mismatch: SerializedPageReaderState fields are now u64, not usize".to_string(),
file_path: file_path.to_path_buf(),
line: line_num + 1,
column: line.find(": usize").unwrap_or(0) + 1,
suggestion: Some("Change type annotation from 'usize' to 'u64'. These fields were changed for WebAssembly compatibility.".to_string()),
auto_fixable: true,
deprecated_since: Some("56.0.0".to_string()),
changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()),
migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()),
});
}
}
Ok(issues)
}
/// Manifest hook: this rule inspects Rust sources only, so it always
/// returns an empty issue list for `Cargo.toml` content.
fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
    let no_findings: Vec<Issue> = Vec::new();
    Ok(no_findings)
}
}
Tests Required
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
/// Each explicit `usize` annotation on a page-reader field must produce one
/// auto-fixable error.
#[test]
fn test_detects_usize_type_annotation() {
    let rule = Arrow008Rule;
    let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;
fn process_state(state: &SerializedPageReaderState) {
let offset: usize = state.offset; // Should trigger error
let remaining: usize = state.remaining_bytes; // Should trigger error
}
"#;
    let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
    assert_eq!(issues.len(), 2);
    assert!(issues.iter().any(|i| i.severity == Severity::Error));
    assert!(issues.iter().any(|i| i.auto_fixable));
}
/// A plain read of the fields through a `page_reader` binding must produce
/// at least one warning.
#[test]
fn test_detects_field_access() {
    let rule = Arrow008Rule;
    let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;
fn get_position(page_reader: &SerializedPageReaderState) -> (usize, usize) {
(page_reader.offset, page_reader.remaining_bytes) // Should warn about access
}
"#;
    let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
    assert!(issues.len() >= 1);
    assert!(issues.iter().any(|i| i.severity == Severity::Warning));
}
/// Code already migrated to `u64` annotations must not be reported.
#[test]
fn test_no_issues_for_u64_usage() {
    let rule = Arrow008Rule;
    let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;
fn process_state(state: &SerializedPageReaderState) {
let offset: u64 = state.offset;
let remaining: u64 = state.remaining_bytes;
}
"#;
    let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
    assert_eq!(issues.len(), 0);
}
}
Integration Steps
-
Add to src/rules/mod.rs:
pub mod arrow_008_parquet_page_reader_types;
// In RuleEngine::new():
rules.push(Box::new(arrow_008_parquet_page_reader_types::Arrow008Rule));
-
Test with these patterns:
let offset: usize = state.offset; // Error - type mismatch
let pos = page_reader.remaining_bytes; // Warning - verify compatibility
let offset: u64 = state.offset; // OK
Acceptance Criteria
- ✅ Detects explicit usize type annotations (Error severity)
- ✅ Warns about direct field access patterns
- ✅ Provides WebAssembly context in messages
- ✅ Auto-fixable for type annotations only
- ✅ References correct PR ("[Parquet] Use u64 for SerializedPageReaderState.offset & remaining_bytes, instead of usize", #7918) and commit
Migration Notes for Users
- Breaking Change: Fields changed from usize to u64
- Reason: WebAssembly compatibility (usize = 32-bit on wasm32)
- Fix: Update type annotations, add explicit casting where needed
- Impact: Affects code directly accessing these struct fields
Metadata
Metadata
Assignees
Labels
No labels