Skip to content

ARROW-008: Add linter rule for SerializedPageReaderState usize→u64 breaking change #8117

@ryanrussell

Description

@ryanrussell

ARROW-008: SerializedPageReaderState Type Changes (Parquet)

Problem

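Arrow 56.0 changed the types of SerializedPageReaderState.offset and SerializedPageReaderState.remaining_bytes from usize to u64. This breaks code that reads these fields directly and assumes the values are usize, for example by binding them to a usize variable or returning them from a function declared to return usize.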

API Change Details

Changed in commit 5555d30:

// OLD (55.2.0):
// Both position fields are `usize`, which is 32-bit on wasm32.
// NOTE(review): shown here as a struct with public fields; confirm against the
// parquet crate source that this matches the real definition.
pub struct SerializedPageReaderState {
    pub offset: usize,
    pub remaining_bytes: usize,
    // ... (remaining fields omitted from this excerpt)
}

// NEW (56.0.0):
// Both position fields are now `u64`, independent of the target's pointer width.
pub struct SerializedPageReaderState {
    pub offset: u64,
    pub remaining_bytes: u64, 
    // ... (remaining fields omitted from this excerpt)
}

Rationale: WebAssembly compatibility - usize is 32-bit on wasm32, limiting file sizes to 4GB.

Commit: 5555d30b0 - [Parquet] Use u64 for SerializedPageReaderState.offset & remaining_bytes, instead of usize

Implementation Task

Create src/rules/arrow_008_parquet_page_reader_types.rs with these specifications:

Rule Implementation

use regex::Regex;
use std::path::Path;
use crate::output::{Issue, Severity};
use crate::rules::Rule;

/// ARROW-008: detects code that assumes `SerializedPageReaderState` fields are `usize`.
///
/// Arrow 56.0 changed `offset` and `remaining_bytes` in
/// `SerializedPageReaderState` from `usize` to `u64` for WebAssembly
/// compatibility.
///
/// The rule carries no state, so it is a unit struct and can be freely copied
/// or created with `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct Arrow008Rule;

impl Rule for Arrow008Rule {
    fn rule_id(&self) -> &'static str {
        "ARROW-008"
    }

    fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
        let mut issues = Vec::new();
        
        // Pattern 1: Direct field access
        let field_access = Regex::new(r"(\w+)\.(?:offset|remaining_bytes)\b")?;
        
        // Pattern 2: Type annotations expecting usize
        let usize_pattern = Regex::new(r":\s*usize\s*=.*\.(?:offset|remaining_bytes)")?;
        
        for (line_num, line) in content.lines().enumerate() {
            // Check for direct field access that might assume usize
            if let Some(mat) = field_access.find(line) {
                // Only flag if it looks like SerializedPageReaderState
                if line.contains("SerializedPageReaderState") || 
                   line.contains("page_reader") ||
                   line.contains("PageReader") {
                    
                    issues.push(Issue {
                        rule_id: self.rule_id().to_string(),
                        severity: Severity::Warning,
                        message: "SerializedPageReaderState.offset and .remaining_bytes changed from usize to u64 in Arrow 56.0".to_string(),
                        file_path: file_path.to_path_buf(),
                        line: line_num + 1,
                        column: mat.start() + 1,
                        suggestion: Some("Verify type compatibility: these fields are now u64. Use explicit casting if needed: field as usize or u64::from(field).".to_string()),
                        auto_fixable: false,
                        deprecated_since: Some("56.0.0".to_string()),
                        changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()),
                        migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()),
                    });
                }
            }
            
            // Check for explicit usize type annotations
            if usize_pattern.is_match(line) {
                issues.push(Issue {
                    rule_id: self.rule_id().to_string(),
                    severity: Severity::Error,
                    message: "Type mismatch: SerializedPageReaderState fields are now u64, not usize".to_string(),
                    file_path: file_path.to_path_buf(),
                    line: line_num + 1,
                    column: line.find(": usize").unwrap_or(0) + 1,
                    suggestion: Some("Change type annotation from 'usize' to 'u64'. These fields were changed for WebAssembly compatibility.".to_string()),
                    auto_fixable: true,
                    deprecated_since: Some("56.0.0".to_string()),
                    changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()),
                    migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()),
                });
            }
        }
        
        Ok(issues)
    }

    fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> {
        Ok(Vec::new())
    }
}

Tests Required

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A `usize`-annotated binding initialised from either field is an
    /// auto-fixable error, one per offending line.
    #[test]
    fn test_detects_usize_type_annotation() {
        let rule = Arrow008Rule;
        let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;

fn process_state(state: &SerializedPageReaderState) {
    let offset: usize = state.offset; // Should trigger error
    let remaining: usize = state.remaining_bytes; // Should trigger error
}
"#;

        let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
        assert_eq!(issues.len(), 2);
        assert!(issues.iter().any(|i| i.severity == Severity::Error));
        assert!(issues.iter().any(|i| i.auto_fixable));
    }

    /// Plain field access on a page-reader-looking line produces a warning.
    #[test]
    fn test_detects_field_access() {
        let rule = Arrow008Rule;
        let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;

fn get_position(page_reader: &SerializedPageReaderState) -> (usize, usize) {
    (page_reader.offset, page_reader.remaining_bytes) // Should warn about access
}
"#;

        let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
        assert!(issues.len() >= 1);
        assert!(issues.iter().any(|i| i.severity == Severity::Warning));
    }

    /// Code already using `u64` for the fields must not be flagged.
    #[test]
    fn test_no_issues_for_u64_usage() {
        let rule = Arrow008Rule;
        let content = r#"
use parquet::file::serialized_reader::SerializedPageReaderState;

fn process_state(state: &SerializedPageReaderState) {
    let offset: u64 = state.offset;
    let remaining: u64 = state.remaining_bytes;
}
"#;

        let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap();
        assert_eq!(issues.len(), 0);
    }
}

Integration Steps

  1. Add to src/rules/mod.rs:

    pub mod arrow_008_parquet_page_reader_types;
    // In RuleEngine::new():
    rules.push(Box::new(arrow_008_parquet_page_reader_types::Arrow008Rule));
  2. Test with these patterns:

    let offset: usize = state.offset;         // Error - type mismatch
    let pos = page_reader.remaining_bytes;    // Warning - verify compatibility
    let offset: u64 = state.offset;          // OK

Acceptance Criteria

Migration Notes for Users

  • Breaking Change: Fields changed from usize to u64
  • Reason: WebAssembly compatibility (usize = 32-bit on wasm32)
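  • Fix: Update type annotations from usize to u64; where a usize is still required, convert explicitly (prefer the checked usize::try_from(value) over "value as usize", which truncates on 32-bit targets)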
  • Impact: Affects code directly accessing these struct fields

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions