Skip to content

Commit 861818b

Browse files
jdxclaude
and committed
fix(step): only auto-batch when rendered command exceeds ARG_MAX
Auto-batching previously split jobs based on the size of the file-list expansion alone — even when the run command never interpolated `{{files}}`. On Windows (ARG_MAX falls back to 128KB), a hook step like `check = "echo static-message"` running on a large repo (~20K files) would be split into ~29 jobs and the message would print 29 times. Move auto-batching out of `build_step_jobs` and into a new `auto_batch_jobs` invoked from execution time, where the full tera context is available. The new implementation renders the actual run command for each job and only splits jobs whose rendered command would exceed the safe ARG_MAX limit, falling back to the previous byte estimation if rendering fails. Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
1 parent df83654 commit 861818b

5 files changed

Lines changed: 167 additions & 88 deletions

File tree

src/step/batching.rs

Lines changed: 115 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -3,114 +3,148 @@
33
//! When passing large file lists to shell commands, the total argument length can exceed
44
//! the operating system's `ARG_MAX` limit, causing "Argument list too long" errors.
55
//!
6-
//! This module provides automatic batching that:
7-
//! 1. Estimates the shell-quoted size of file lists
8-
//! 2. Uses binary search to find optimal batch sizes
9-
//! 3. Splits jobs to stay within safe limits
6+
//! This module renders the actual run command for each job and only splits jobs whose
7+
//! rendered command exceeds the safe limit. Steps whose commands don't reference
8+
//! `{{files}}` (or render to a small string for any other reason) are left as a
9+
//! single job, even when the underlying file list is large.
1010
1111
use crate::env;
1212
use crate::step_job::StepJob;
13+
use crate::tera;
1314
use std::path::PathBuf;
15+
use std::sync::Arc;
1416

1517
use super::types::Step;
1618

1719
impl Step {
1820
/// Estimates the size of the `{{files}}` template variable expansion.
1921
///
20-
/// This includes shell quoting overhead and spaces between files.
21-
/// Uses a conservative estimate (2x + 2 for quotes, +1 for space) to ensure
22-
/// we don't underestimate.
23-
///
24-
/// # Arguments
25-
///
26-
/// * `files` - The list of files to estimate
27-
///
28-
/// # Returns
29-
///
30-
/// Estimated byte size of the quoted file list string
22+
/// Used as a fallback when rendering the run command fails (e.g. the
23+
/// template references a variable not yet present in the context).
3124
pub(crate) fn estimate_files_string_size(&self, files: &[PathBuf]) -> usize {
3225
files
3326
.iter()
3427
.map(|f| {
3528
let path_str = f.to_str().unwrap_or("");
36-
// Estimate quoted size: conservative estimate assuming worst-case quoting
37-
// For shell quoting, worst case is roughly 2x + 2 (quotes)
38-
path_str.len() * 2 + 2 + 1 // +1 for space separator
29+
// Worst-case quoted size: 2x + 2 (quotes), +1 for space separator
30+
path_str.len() * 2 + 2 + 1
3931
})
4032
.sum()
4133
}
4234

43-
/// Automatically batch jobs if the file list would exceed safe ARG_MAX limits.
44-
///
45-
/// This prevents "Argument list too long" errors when passing large file lists
46-
/// to commands. Uses binary search to find the largest batch size that fits
47-
/// within a safe limit (50% of ARG_MAX to account for environment variables
48-
/// and the command itself).
49-
///
50-
/// # Arguments
51-
///
52-
/// * `jobs` - The jobs to potentially batch
35+
/// Render the run command for a hypothetical job containing `files` and return
36+
/// its byte length. Returns `None` if rendering fails (e.g. the template
37+
/// references a variable not in the context).
38+
fn render_run_command_size(
39+
&self,
40+
original_job: &StepJob,
41+
files: &[PathBuf],
42+
base_tctx: &tera::Context,
43+
) -> Option<usize> {
44+
let run_cmd = if original_job.check_first {
45+
self.check_first_cmd()
46+
} else {
47+
self.run_cmd(original_job.run_type)
48+
}?;
49+
let run = run_cmd.to_string();
50+
if run.trim().is_empty() {
51+
return None;
52+
}
53+
let run = if let Some(prefix) = &self.prefix {
54+
format!("{prefix} {run}")
55+
} else {
56+
run
57+
};
58+
59+
let mut temp = StepJob::new(
60+
Arc::clone(&original_job.step),
61+
files.to_vec(),
62+
original_job.run_type,
63+
);
64+
temp.check_first = original_job.check_first;
65+
if let Some(wi) = original_job.workspace_indicator() {
66+
temp = temp.with_workspace_indicator(wi.clone());
67+
}
68+
let tctx = temp.tctx(base_tctx);
69+
tera::render(&run, &tctx).ok().map(|s| s.len())
70+
}
71+
72+
/// Automatically batch jobs whose rendered run command would exceed the safe ARG_MAX limit.
5373
///
54-
/// # Returns
74+
/// Uses 50% of `ARG_MAX` as the safety margin (accounts for env vars and the command itself).
75+
/// Renders the actual run command with each candidate file subset; if the rendered command
76+
/// fits, no batching is performed. Otherwise binary-searches the largest batch size whose
77+
/// rendered command still fits.
5578
///
56-
/// A new list of jobs, potentially with large jobs split into multiple smaller ones
57-
pub(crate) fn auto_batch_jobs_if_needed(&self, jobs: Vec<StepJob>) -> Vec<StepJob> {
58-
// Use 50% of ARG_MAX as a safety margin, accounting for environment variables
59-
// and the command itself
60-
let safe_limit = *env::ARG_MAX / 2;
79+
/// If rendering fails for any reason, falls back to estimating the size of the quoted
80+
/// file-list expansion — preserves the previous (purely size-based) behavior as a safety net.
81+
pub(crate) fn auto_batch_jobs(
82+
&self,
83+
jobs: Vec<StepJob>,
84+
base_tctx: &tera::Context,
85+
) -> Vec<StepJob> {
86+
if self.stdin.is_some() {
87+
// stdin path doesn't pass files via argv; never auto-batch
88+
return jobs;
89+
}
6190

62-
let mut batched_jobs = Vec::new();
91+
let safe_limit = *env::ARG_MAX / 2;
92+
let mut batched_jobs = Vec::with_capacity(jobs.len());
6393

6494
for job in jobs {
65-
let estimated_size = self.estimate_files_string_size(&job.files);
66-
67-
if estimated_size > safe_limit && job.files.len() > 1 {
68-
// Need to batch this job
69-
debug!(
70-
"{}: auto-batching {} files (estimated size: {} bytes, limit: {} bytes)",
71-
self.name,
72-
job.files.len(),
73-
estimated_size,
74-
safe_limit
75-
);
76-
77-
// Binary search to find the largest batch_size where files fit within safe_limit
78-
let mut low = 1;
79-
let mut high = job.files.len();
80-
81-
while low < high {
82-
let mid = (low + high).div_ceil(2);
83-
let test_size = self.estimate_files_string_size(&job.files[..mid]);
84-
85-
if test_size <= safe_limit {
86-
// mid files fit, try larger
87-
low = mid;
88-
} else {
89-
// mid files don't fit, try smaller
90-
high = mid - 1;
91-
}
95+
if job.skip_reason.is_some() || job.files.len() <= 1 {
96+
batched_jobs.push(job);
97+
continue;
98+
}
99+
100+
// Try render-based sizing first; fall back to byte estimation on render failure.
101+
let full_size = self
102+
.render_run_command_size(&job, &job.files, base_tctx)
103+
.unwrap_or_else(|| self.estimate_files_string_size(&job.files));
104+
105+
if full_size <= safe_limit {
106+
batched_jobs.push(job);
107+
continue;
108+
}
109+
110+
debug!(
111+
"{}: auto-batching {} files (rendered size: {} bytes, limit: {} bytes)",
112+
self.name,
113+
job.files.len(),
114+
full_size,
115+
safe_limit
116+
);
117+
118+
// Binary search the largest batch size whose rendered command fits.
119+
let mut low = 1;
120+
let mut high = job.files.len();
121+
while low < high {
122+
let mid = (low + high).div_ceil(2);
123+
let test_size = self
124+
.render_run_command_size(&job, &job.files[..mid], base_tctx)
125+
.unwrap_or_else(|| self.estimate_files_string_size(&job.files[..mid]));
126+
if test_size <= safe_limit {
127+
low = mid;
128+
} else {
129+
high = mid - 1;
92130
}
131+
}
132+
let batch_size = low.max(1);
133+
134+
debug!(
135+
"{}: using batch size of {} files per batch",
136+
self.name, batch_size
137+
);
93138

94-
// After binary search, low contains the largest batch size that fits
95-
let batch_size = low.max(1); // Ensure at least 1 file per batch
96-
97-
debug!(
98-
"{}: using batch size of {} files per batch",
99-
self.name, batch_size
100-
);
101-
102-
// Create batched jobs - use the StepJob constructor to properly handle private fields
103-
for chunk in job.files.chunks(batch_size) {
104-
let new_job = StepJob::new(job.step.clone(), chunk.to_vec(), job.run_type);
105-
// Note: we can't preserve workspace_indicator or other private fields
106-
// without adding public methods to StepJob. For now, batching will
107-
// break workspace_indicator jobs, but that's acceptable since those
108-
// are typically small workspaces.
109-
batched_jobs.push(new_job);
139+
for chunk in job.files.chunks(batch_size) {
140+
let mut new_job = StepJob::new(Arc::clone(&job.step), chunk.to_vec(), job.run_type);
141+
// Preserve job-level state that isn't reconstructed by StepJob::new.
142+
new_job.check_first = job.check_first;
143+
new_job.skip_reason = job.skip_reason.clone();
144+
if let Some(wi) = job.workspace_indicator() {
145+
new_job = new_job.with_workspace_indicator(wi.clone());
110146
}
111-
} else {
112-
// No batching needed
113-
batched_jobs.push(job);
147+
batched_jobs.push(new_job);
114148
}
115149
}
116150

src/step/execution.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,16 @@ impl Step {
6666
}
6767

6868
let files = ctx.hook_ctx.files();
69-
let mut jobs = self.build_step_jobs(
69+
let jobs = self.build_step_jobs(
7070
&files,
7171
ctx.hook_ctx.run_type,
7272
&ctx.hook_ctx.files_in_contention.lock().unwrap(),
7373
&ctx.hook_ctx.skip_steps,
7474
)?;
75+
// Apply ARG_MAX-safe auto-batching now that the full tera context is
76+
// available — only split jobs whose rendered run command would actually
77+
// exceed the limit.
78+
let mut jobs = self.auto_batch_jobs(jobs, &ctx.hook_ctx.tctx);
7579
if let Some(job) = jobs.first_mut() {
7680
job.semaphore = Some(semaphore);
7781
}

src/step/job_builder.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@ impl Step {
3333
/// 2. Check if step has a command for the run type
3434
/// 3. Filter files based on step configuration
3535
/// 4. Split into workspace jobs (if workspace_indicator set) or batch jobs
36-
/// 5. Apply auto-batching for ARG_MAX safety
37-
/// 6. Configure check_first based on file contention
36+
/// 5. Configure check_first based on file contention
37+
///
38+
/// Auto-batching for ARG_MAX safety is applied later by [`Step::auto_batch_jobs`]
39+
/// (called from execution time) so the full tera context is available to render
40+
/// the actual command.
3841
///
3942
/// # Arguments
4043
///
@@ -143,10 +146,10 @@ impl Step {
143146
)]
144147
};
145148

146-
if self.stdin.is_none() {
147-
// Auto-batch any jobs where the file list would exceed safe limits
148-
jobs = self.auto_batch_jobs_if_needed(jobs);
149-
}
149+
// Note: auto-batching for ARG_MAX safety happens at execution time
150+
// (see `Step::auto_batch_jobs`) where the full tera context is available
151+
// to render the actual run command — so steps whose commands don't
152+
// reference `{{files}}` are not split into many jobs unnecessarily.
150153

151154
// Apply profile skip only after determining files/no-files, so NoFilesToProcess wins
152155
// Also, if a condition is present, defer profile checks to run() so ConditionFalse wins

src/step_job.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ impl StepJob {
6060
self
6161
}
6262

63+
pub fn workspace_indicator(&self) -> Option<&PathBuf> {
64+
self.workspace_indicator.as_ref()
65+
}
66+
6367
pub fn tctx(&self, base: &tera::Context) -> tera::Context {
6468
let mut tctx = base.clone();
6569

test/auto_batch_large_files.bats

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,40 @@ EOF
6666
assert_output --partial "file10.txt"
6767
}
6868

69+
@test "auto-batch does not split steps that don't reference {{files}}" {
70+
# Regression test: previously hk would auto-batch any step on a large file
71+
# list based on the *file list* size, even if the run command never
72+
# interpolated `{{files}}`. With render-based sizing, a static command
73+
# should run as a single job regardless of how many files match.
74+
cat <<EOF > hk.pkl
75+
amends "$PKL_PATH/Config.pkl"
76+
hooks {
77+
["check"] {
78+
steps {
79+
["static-message"] {
80+
check = "echo hello world"
81+
}
82+
}
83+
}
84+
}
85+
EOF
86+
87+
# Create enough files (with long paths) that the *file-list* expansion
88+
# would blow past ARG_MAX/2 and force batching under the old logic.
89+
for i in {1..1000}; do
90+
dir="very/long/directory/path/number/$i/with/many/levels/to/increase/path/length"
91+
mkdir -p "$dir"
92+
echo "test" > "$dir/file_with_very_long_name_to_increase_arg_size_$i.txt"
93+
done
94+
95+
run hk check --all
96+
assert_success
97+
# Each batch produces its own "N files – ... – <command>" progress line;
98+
# one such line means the step ran as a single job.
99+
run bash -c "hk check --all 2>&1 | grep -Ec 'files –.*– echo hello world' || true"
100+
assert_output "1"
101+
}
102+
69103
@test "auto-batch with fix command" {
70104
cat <<EOF > hk.pkl
71105
amends "$PKL_PATH/Config.pkl"

0 commit comments

Comments (0)