Skip to content

Commit 356f2d6

Browse files
committed
count output-line length in characters (not bytes)
1 parent 9c62c2e commit 356f2d6

File tree

7 files changed

+68
-80
lines changed

7 files changed

+68
-80
lines changed

doc/source/stringsext--man.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,13 @@ as *GNU strings* replacement.
194194

195195
**-q** *NUM*, *\--output-line-len*=*NUM*
196196

197-
: Set output-line-length in UTF-8 bytes. Length of the printed output line
198-
in UTF-8 bytes (string-findings only, metadata excluded). The line-length
199-
is limited by some internal buffer size value (see "`OUTPUT_BUF_LEN`" in
200-
source code). A value "`NUM`" bigger than "`OUTPUT_BUF_LEN`" is set to
201-
"`OUT_PUT_LEN`". The longer the line-length is, the fewer strings will be
202-
wrapped to the next line. The downside with long output lines is, that
203-
the scanner loses precision in locating the findings.
197+
: Set the printed output-line-length in UTF-8 characters (string-findings
198+
only, metadata excluded). The line-length is limited by some internal
199+
buffer size value (see "`OUTPUT_BUF_LEN`" in source code). A value
200+
"`NUM`" bigger than "`OUTPUT_BUF_LEN/2`" is set to "`OUTPUT_BUF_LEN/2`".
201+
The longer the line-length is, the fewer strings will be wrapped to the
202+
next line. The downside with long output lines is, that the scanner loses
203+
precision in locating the findings.
204204

205205
**-s** *NUM*, **\--counter-offset**=*NUM*
206206

src/finding.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,13 @@ impl<'a> Deref for FindingCollection<'a> {
142142
/// not determine its exact position.
143143
pub enum Precision {
144144
/// The finding is located somewhere before `Finding::position`. It is
145-
/// guarantied, that the finding is not farer than `--output-line-len -1`
145+
/// guaranteed that the finding is not farther than 2*`--output-line-len`
146146
/// bytes (or the previous finding from the same scanner) away.
147147
Before,
148148
/// The algorithm could determine the exact position of the `Finding` at
149149
/// `Finding::position`.
150150
Exact,
151-
/// The finding is located some `[1..output_line_len]` bytes after
151+
/// The finding is located some `[1..2* --output_line_len]` bytes after
152152
/// `Finding::position` or - in any case - always before the next
153153
/// `Finding::position`.
154154
After,

src/helper.rs

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,11 @@ pub struct SplitStr<'a> {
6060
inp: &'a str,
6161

6262
/// Initially points to the first byte of the `inp`-buffer. In case `ok_s` is
63-
/// very long and has `>=ok_s_len_max`, the iterator stops and sends out
64-
/// `ok_s`. Then `inp_start_p` is moved to the first byte after `ok_s` so that
65-
/// the next `next()` deals with the rest of the string. This way the second
66-
/// half will be identified to be the continuation of the first part.
63+
/// very long and has `>=ok_char_nb_max` characters, the iterator stops and
64+
/// sends out `ok_s`. Then `inp_start_p` is moved to the first byte after
65+
/// `ok_s` so that the next `next()` deals with the rest of the string. This
66+
/// way the second half will be identified to be the continuation of the
67+
/// first part.
6768
inp_start_p: *const u8,
6869

6970
/// Points to the first byte after the end of `inp` buffer.
@@ -75,7 +76,7 @@ pub struct SplitStr<'a> {
7576

7677
/// Criteria that influences the search performed by `next()`. Normally only
7778
/// substrings larger than `>=chars_min_nb` will be returned by `next()`.
78-
/// This rule concerning only substrings touching one o fthe `inp` buffer
79+
/// This rule concerning only substrings touching one of the `inp` buffer
7980
/// boundaries has 2 exceptions:
8081
///
8182
/// 1. When `last_s_was_maybe_cut` is set and
@@ -109,12 +110,8 @@ pub struct SplitStr<'a> {
109110
utf8f: Utf8Filter,
110111

111112
/// This imposes an additional constraint to the iterator and instructs him
112-
/// to never return substrings longer than `s_len_max`. Usually this is equal
113-
/// the `inp`-buffer's length, but there can be exceptions of longer
114-
/// `inp`-buffers. For example when the previous run has left some
115-
/// non-treated `left_over` bytes which are then prepended to the
116-
/// `inp`-buffer. In the worst case, such an `inp` is then twice as large.
117-
s_len_max: usize,
113+
/// to never return substrings longer than `s_char_nb_max`.
114+
s_char_nb_max: usize,
118115
}
119116

120117
/// This enum describes result variants of the `SplitStr::next()` output.
@@ -169,7 +166,7 @@ impl<'a> SplitStr<'a> {
169166
last_s_was_maybe_cut: bool,
170167
invalid_bytes_after_inp: bool,
171168
utf8f: Utf8Filter,
172-
s_len_max: usize,
169+
s_char_nb_max: usize,
173170
) -> SplitStr {
174171
unsafe {
175172
SplitStr {
@@ -187,7 +184,7 @@ impl<'a> SplitStr<'a> {
187184
// We will set this to false later, if `utf8f.grep_char` requires some
188185
// additional checking.
189186
utf8f,
190-
s_len_max,
187+
s_char_nb_max,
191188
}
192189
}
193190
}
@@ -209,24 +206,21 @@ impl<'a> Iterator for SplitStr<'a> {
209206
let mut ok_s_len = 0usize;
210207
let mut ok_char_nb = 0usize;
211208
// The longest `ok_s` we want to return in one `next()` iteration is
212-
// of length `ok_s_len_max`, which the usual `inp`-buffer size
213-
// when no extra bytes are prepended.
209+
// of length `ok_char_nb_max`.
214210
// When we return such a maximum length string, we
215-
// keep the rest in `inp` for `next()`. Such a long string can only
216-
// appear, when some bytes form the last run had been prepended to
217-
// 'inp'.
218-
let ok_s_len_max = self.s_len_max;
211+
// keep the rest in `inp` for `next()`.
212+
let ok_char_nb_max = self.s_char_nb_max;
219213

220214
// The following loop has 4 exits:
221215
// 1. We finished the whole buffer: `self.p >= self.inp_end_p`
222-
// 2. A long string was found: `ok_s_len > ok_s_len_max`,
216+
// 2. A long string was found: `ok_char_nb >= ok_char_nb_max`,
223217
// `p` points to the first of the remaining bytes, left
224218
// for the next `next()` run.
225219
// 3. We found a substring at the beginning of the buffer;
226220
// 4. We found a substring somewhere in the middle of the buffer;
227221

228222
// Exit 1. and 2.
229-
while self.p < self.inp_end_p && ok_s_len < ok_s_len_max {
223+
while self.p < self.inp_end_p && ok_char_nb < ok_char_nb_max {
230224
// We do not need an additional boundary check, because we
231225
// know from above that there is at least one character in
232226
// `inp` and there are only valid UTF-8 in here.
@@ -314,18 +308,14 @@ impl<'a> Iterator for SplitStr<'a> {
314308
// Exit 2 or 3:
315309
let s_touches_right_boundary = unsafe { ok_s_p.add(ok_s_len) } >= self.inp_end_p;
316310

317-
let s_is_maybe_cut =
318-
ok_s_len >= ok_s_len_max || (s_touches_right_boundary && !self.invalid_bytes_after_inp);
311+
let s_is_maybe_cut = ok_char_nb >= ok_char_nb_max
312+
|| (s_touches_right_boundary && !self.invalid_bytes_after_inp);
319313
let s_completes_previous_s = s_touches_left_boundary && self.last_s_was_maybe_cut;
320314

321315
// With this flag we tell the caller, that he should not immediately
322316
// print the returned string, but rather insert it at the beginning
323317
// of the next input buffer and decode and run `SplitStr` again.
324318
//
325-
// Note, we require, that `ok_s_len` is at least 1 byte SMALLER then
326-
// `self.s_len_max` (`ok_s_len < self.s_len_max`). This way
327-
// we print strings that fill the whole output line directly.
328-
//
329319
// Note, `&& !s_completes_previous_s` guarantees, that
330320
// `s_is_to_be_filtered_again` is only set out for the first part
331321
// of a longer cut string. We only want the first part of string to be
@@ -341,23 +331,23 @@ impl<'a> Iterator for SplitStr<'a> {
341331
// 2. When the first part (==`!not_completes_previous`) of a longer
342332
// string who touches the right buffer boundary
343333
// (`==s_touches_right_boundary`) did start somewhere in the middle of
344-
// the buffer (==`ok_s_len < self.s_len_max`). We actually could
334+
// the buffer (==`ok_char_nb < self.s_char_nb_max`). We actually could
345335
// print it out now, because it has the minimum length, but we want to
346336
// print the beginning of every string as long as possible (approx
347-
// `output_line_length`). Instead, we rather set
337+
// `output_line_char_nb_max`). Instead, we rather set
348338
// `s_is_to_be_filtered_again` instructing the caller to insert
349339
// this string at the beginning of the next buffer. Doing so, we
350340
// guarantee, that string beginnings are always assembled, even if they
351341
// crossed buffer boundaries. Thus, the user can pipe the output of
352342
// `stringsext` through additional filters, e.g. searching for
353343
// particular patterns.
354344
//
355-
// As `ok_char_nb < chars_min_nb` is part of `ok_s_len < self.s_len_max`
345+
// As `ok_char_nb < chars_min_nb` is part of `ok_char_nb < self.s_char_nb_max`
356346
// we do not need to add this condition explicitly below.
357347
let s_is_to_be_filtered_again = !s_completes_previous_s
358348
&& s_touches_right_boundary
359349
&& !self.invalid_bytes_after_inp
360-
&& (ok_s_len < self.s_len_max || !grep_char_ok);
350+
&& (ok_char_nb < self.s_char_nb_max || !grep_char_ok);
361351

362352
let s_satisfies_min_char_rule = ok_char_nb >= self.chars_min_nb as usize;
363353
let s_satisfies_grep_char_rule = grep_char_ok;
@@ -383,7 +373,7 @@ impl<'a> Iterator for SplitStr<'a> {
383373
};
384374

385375
// Exit was 2: prepare the inner state for the next `next()` run.
386-
if ok_s_len >= ok_s_len_max {
376+
if ok_char_nb >= ok_char_nb_max {
387377
self.inp_start_p = self.p;
388378
};
389379
self.last_s_was_maybe_cut = s_is_maybe_cut;

src/main.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
//!
1919
//! 3. Each thread runs a search in `main::slice` == `scanner::input_buffer`. The
2020
//! search is performed by `scanner::scan()`, which cuts the `scanner::input_buffer`
21-
//! into smaller chunks of size `output_line_len` hereafter called `input_window`.
21+
//! into smaller chunks of size 2*`output_line_char_nb_max` bytes hereafter called
22+
//! `input_window`.
2223
//!
2324
//! 4. The `Decoder` runs through the `input_window`, searches for valid strings and
2425
//! decodes them into UTF-8-chunks.
@@ -27,7 +28,7 @@
2728
//! analyzed if parts of it satisfy certain filter conditions.
2829
//!
2930
//! 6. Doing so, the `helper::SplitStr` cuts the UTF-8-chunk into even smaller
30-
//! `SplitStr`-chunks not longer than `output_line_len` and sends them back to the
31+
//! `SplitStr`-chunks not longer than `output_line_char_nb_max` and sends them back to the
3132
//! `scanner::scan()` loop.
3233
//!
3334
//! 7. There the `SplitStr`-chunk is packed into a `finding::Finding` object and
@@ -105,7 +106,8 @@ fn run() -> Result<(), anyhow::Error> {
105106
Some(ref fname) => {
106107
let f = File::create(&Path::new(fname.as_str()))?;
107108
// There is at least one `Mission` in `MISSIONS`.
108-
let output_line_len = MISSIONS[0].output_line_len + OUTPUT_LINE_METADATA_LEN;
109+
let output_line_len =
110+
2 * MISSIONS[0].output_line_char_nb_max + OUTPUT_LINE_METADATA_LEN;
109111
let f = LineWriter::with_capacity(output_line_len, f);
110112
Box::new(f) as Box<dyn Write>
111113
}

src/mission.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ use crate::counter_offset_default;
99
use crate::encoding_default;
1010
use crate::input::ByteCounter;
1111
use crate::options::ARGS;
12-
use crate::options::OUTPUT_LINE_LEN_MIN;
13-
use crate::output_line_len_default;
12+
use crate::options::OUTPUT_LINE_CHAR_NB_MIN;
13+
use crate::output_line_char_nb_max_default;
1414
use anyhow::{anyhow, Context, Result};
1515
use encoding_rs::*;
1616
use lazy_static::lazy_static;
@@ -421,10 +421,10 @@ pub struct Mission {
421421
/// A filter, defining additional criteria for a finding to be printed.
422422
pub filter: Utf8Filter,
423423

424-
/// Maximum length of output-lines in UTF-8 bytes. Findings that do not fit,
425-
/// will be wrapped to two or more lines. The label `+` indicates that this
426-
/// line is the continuation of the previous line.
427-
pub output_line_len: usize,
424+
/// Maximum length of output-lines in UTF-8 characters. Findings that do not
425+
/// fit, will be wrapped to two or more lines. The label `+` indicates that
426+
/// this line is the continuation of the previous line.
427+
pub output_line_char_nb_max: usize,
428428

429429
/// The `encoding_rs` decoder has no direct support for ASCII. As a
430430
/// workaround, we simulate the missing ASCII-decoder with the
@@ -570,11 +570,11 @@ impl Missions {
570570
let flag_output_line_len =
571571
parse_integer!(flag_output_line_len, usize::from_str_radix, usize::from_str);
572572
if let Some(m) = flag_output_line_len {
573-
if m < OUTPUT_LINE_LEN_MIN {
573+
if m < OUTPUT_LINE_CHAR_NB_MIN {
574574
return Err(anyhow!(
575575
"minimum for `--output-line-len` is `{}`, \
576576
you tried: `{}`.",
577-
OUTPUT_LINE_LEN_MIN,
577+
OUTPUT_LINE_CHAR_NB_MIN,
578578
m
579579
));
580580
}
@@ -615,19 +615,19 @@ impl Missions {
615615
},
616616
};
617617

618-
let output_line_len = match flag_output_line_len {
618+
let output_line_char_nb_max = match flag_output_line_len {
619619
Some(n) => n,
620-
None => output_line_len_default!(),
620+
None => output_line_char_nb_max_default!(),
621621
};
622622

623-
if output_line_len < OUTPUT_LINE_LEN_MIN {
623+
if output_line_char_nb_max < OUTPUT_LINE_CHAR_NB_MIN {
624624
return Err(anyhow!(
625625
"Scanner {}: \
626626
minimum for `--output-line-len` is `{}`, \
627627
you tried: `{}`.",
628628
char::from((mission_id + 97) as u8),
629-
OUTPUT_LINE_LEN_MIN,
630-
output_line_len,
629+
OUTPUT_LINE_CHAR_NB_MIN,
630+
output_line_char_nb_max,
631631
));
632632
}
633633

@@ -703,7 +703,7 @@ impl Missions {
703703
encoding,
704704
chars_min_nb,
705705
filter,
706-
output_line_len,
706+
output_line_char_nb_max,
707707
mission_id: mission_id as u8,
708708
print_encoding_as_ascii,
709709
});

src/options.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,15 @@ macro_rules! counter_offset_default {
4646
/// Default value when no `--output-line-len`
4747
/// command-line-argument is given. Must be `usize`.
4848
#[macro_export]
49-
macro_rules! output_line_len_default {
49+
macro_rules! output_line_char_nb_max_default {
5050
() => {
5151
60usize
5252
};
5353
}
5454

5555
/// There must be space for at least 3 long Unicode characters,
5656
/// to guarantee progress in streaming. You want much longer lines.
57-
pub const OUTPUT_LINE_LEN_MIN: usize = 12;
57+
pub const OUTPUT_LINE_CHAR_NB_MIN: usize = 6;
5858

5959
/// Message printed for command-line `--help`.
6060
const USAGE: &str = concat!(
@@ -84,8 +84,8 @@ Options:
8484
chars_min_default!(),
8585
").
8686
-p FILE, --output=FILE Print not to stdout but in file.
87-
-q NUM, --output-line-len=NUM Output line length in UTF-8 bytes (default: ",
88-
output_line_len_default!(),
87+
-q NUM, --output-line-len=NUM Output line length in UTF-8 characters (default: ",
88+
output_line_char_nb_max_default!(),
8989
").
9090
-s NUM, --counter-offset=NUM Start counting input bytes with NUM (default: ",
9191
counter_offset_default!(),

0 commit comments

Comments
 (0)