count output-line length in characters (not bytes)

getreu · getreu · commit 356f2d67d216 · 2020-01-08T13:38:24.000+02:00
diff --git a/doc/source/stringsext--man.md b/doc/source/stringsext--man.md
@@ -194,13 +194,13 @@ as *GNU strings* replacement.
 
 **-q** *NUM*, *\--output-line-len*=*NUM*  
 
-:   Set output-line-length in UTF-8 bytes. Length of the printed output line
-    in UTF-8 bytes (string-findings only, metadata excluded). The line-length
-    is limited by some internal buffer size value (see "`OUTPUT_BUF_LEN`" in
-    source code). A value "`NUM`" bigger than "`OUTPUT_BUF_LEN`" is set to
-    "`OUT_PUT_LEN`". The longer the line-length is, the fewer strings will be
-    wrapped to the next line. The downside with long output lines is, that
-    the scanner loses precision in locating the findings.
+:   Set the printed output-line-length in UTF-8 characters (string-findings
+    only, metadata excluded). The line-length is limited by some internal
+    buffer size value (see "`OUTPUT_BUF_LEN`" in source code). A value
+    "`NUM`" bigger than "`OUTPUT_BUF_LEN/2`" is set to "`OUTPUT_BUF_LEN/2`".
+    The longer the line-length is, the fewer strings will be wrapped to the
+    next line. The downside with long output lines is, that the scanner loses
+    precision in locating the findings.
 
 **-s** *NUM*, **\--counter-offset**=*NUM*
 
diff --git a/src/finding.rs b/src/finding.rs
@@ -142,13 +142,13 @@ impl<'a> Deref for FindingCollection<'a> {
 /// not determine its exact position.
 pub enum Precision {
     /// The finding is located somewhere before `Finding::position`. It is
-    /// guarantied, that the finding is not farer than `--output-line-len -1`
+    /// guaranteed, that the finding is not farther than 2*`--output-line-len`
     /// bytes (or the previous finding from the same scanner) away.
     Before,
     /// The algorithm could determine the exact position of the `Finding` at
     /// `Finding::position`.
     Exact,
-    /// The finding is located some `[1..output_line_len]` bytes after
+    /// The finding is located some `[1..2*--output-line-len]` bytes after
     /// `Finding::position` or - in any case - always before the next
     /// `Finding::position`.
     After,
diff --git a/src/helper.rs b/src/helper.rs
@@ -60,10 +60,11 @@ pub struct SplitStr<'a> {
     inp: &'a str,
 
     /// Initially points to the first byte of the `inp`-buffer. In case `ok_s` is
-    /// very long and has `>=ok_s_len_max`, the iterator stops and sends out
-    /// `ok_s`. Then `inp_start_p` is moved to the first byte after `ok_s` so that
-    /// the next `next()` deals with the rest of the string. This way the second
-    /// half will be identified to be the continuation of the first part.
+    /// very long and has `>=ok_char_nb_max` characters, the iterator stops and
+    /// sends out `ok_s`. Then `inp_start_p` is moved to the first byte after
+    /// `ok_s` so that the next `next()` deals with the rest of the string. This
+    /// way the second half will be identified to be the continuation of the
+    /// first part.
     inp_start_p: *const u8,
 
     /// Points to the first byte after the end of `inp` buffer.
@@ -75,7 +76,7 @@ pub struct SplitStr<'a> {
 
     /// Criteria that influences the search performed by `next()`. Normally only
     /// substrings larger than `>=chars_min_nb` will be returned by `next()`.
-    /// This rule concerning only substrings touching one o fthe `inp` buffer
+    /// This rule concerning only substrings touching one of the `inp` buffer
     /// boundaries has 2 exceptions:
     ///   
     /// 1. When `last_s_was_maybe_cut` is set and
@@ -109,12 +110,8 @@ pub struct SplitStr<'a> {
     utf8f: Utf8Filter,
 
     /// This imposes an additional constraint to the iterator and instructs him
-    /// to never return substrings longer than `s_len_max`. Usually this is equal
-    /// the `inp`-buffer's length, but there can be exceptions of longer
-    /// `inp`-buffers. For example when the previous run has left some
-    /// non-treated `left_over` bytes which are then prepended to the
-    /// `inp`-buffer. In the worst case, such an `inp` is then twice as large.
-    s_len_max: usize,
+    /// to never return substrings longer than `s_char_nb_max`.
+    s_char_nb_max: usize,
 }
 
 /// This enum describes result variants of the `SplitStr::next()` output.
@@ -169,7 +166,7 @@ impl<'a> SplitStr<'a> {
         last_s_was_maybe_cut: bool,
         invalid_bytes_after_inp: bool,
         utf8f: Utf8Filter,
-        s_len_max: usize,
+        s_char_nb_max: usize,
     ) -> SplitStr {
         unsafe {
             SplitStr {
@@ -187,7 +184,7 @@ impl<'a> SplitStr<'a> {
                 // We will set this to false later, if `utf8f.grep_char` requires some
                 // additional checking.
                 utf8f,
-                s_len_max,
+                s_char_nb_max,
             }
         }
     }
@@ -209,24 +206,21 @@ impl<'a> Iterator for SplitStr<'a> {
         let mut ok_s_len = 0usize;
         let mut ok_char_nb = 0usize;
         // The longest `ok_s` we want to return in one `next()` iteration is
-        // of length `ok_s_len_max`, which the usual `inp`-buffer size
-        // when no extra bytes are prepended.
+        // of length `ok_char_nb_max`.
         // When we return such a maximum length string, we
-        // keep the rest in `inp` for `next()`. Such a long string can only
-        // appear, when some bytes form the last run had been prepended to
-        // 'inp'.
-        let ok_s_len_max = self.s_len_max;
+        // keep the rest in `inp` for `next()`.
+        let ok_char_nb_max = self.s_char_nb_max;
 
         // The following loop has 4 exits:
         // 1. We finished the whole buffer: `self.p >= self.inp`
-        // 2. A long string was found: `ok_s_len > ok_s_len_max`,
+        // 2. A long string was found: `ok_char_nb >= ok_char_nb_max`,
         //   `p` points to the first of the remaining bytes, left
         //    for the next `next()` run.
         // 3. We found a substring at the beginning of the buffer;
         // 4. We found a substring in somewhere in middle of the buffer;
 
         // Exit 1. and 2.
-        while self.p < self.inp_end_p && ok_s_len < ok_s_len_max {
+        while self.p < self.inp_end_p && ok_char_nb < ok_char_nb_max {
             // We do not need an additional boundary check, because we
             // know from above that there is at least one character in
             // `inp` and there are only valid UTF-8 in here.
@@ -314,18 +308,14 @@ impl<'a> Iterator for SplitStr<'a> {
         // Exit 2 or 3:
         let s_touches_right_boundary = unsafe { ok_s_p.add(ok_s_len) } >= self.inp_end_p;
 
-        let s_is_maybe_cut =
-            ok_s_len >= ok_s_len_max || (s_touches_right_boundary && !self.invalid_bytes_after_inp);
+        let s_is_maybe_cut = ok_char_nb >= ok_char_nb_max
+            || (s_touches_right_boundary && !self.invalid_bytes_after_inp);
         let s_completes_previous_s = s_touches_left_boundary && self.last_s_was_maybe_cut;
 
         // With this flag we tell the caller, that he should not immediately
         // print the returned string, but rather insert it at the the beginning
         // of the next input buffer and decode and run `SplitStr` again.
         //
-        // Note, we require, that `ok_s_len` is at least 1 byte SMALLER then
-        // `self.s_len_max` (`ok_s_len < self.s_len_max`). This way
-        // we print strings that fill the whole output line directly.
-        //
         // Note, `&& !s_completes_previous_s` guarantees, that
         // `s_is_to_be_filtered_again` is only set out for the first part
         // of a longer cut string. We only want the first part of string to be
@@ -341,23 +331,23 @@ impl<'a> Iterator for SplitStr<'a> {
         // 2. When the first part (==`!not_completes_previous`) of a longer
         // string who touches the right buffer boundary
         // (`==s_touches_right_boundary`) did start somewhere in the middle of
-        // the buffer (==`ok_s_len < self.s_len_max`). We actually could
+        // the buffer (==`ok_char_nb < self.s_char_nb_max`). We actually could
         // print it out now, because it has the minimum length, but we want to
         // print the beginning of a every string as long as possible (approx
-        // `output_line_length`). Instead, we rather set
+        // `output_line_char_nb_max`). Instead, we rather set
         // `s_is_to_be_filtered_again` instruction the caller to insert
         // this string at the beginning of the next buffer. Doing so, we
         // guarantee, that string beginnings are always assembled, even if they
         // crossed buffer boundaries. Thus, the user can pipe the output of
         // `stringsext` through additional filters, e.g. searching for
         // particular patterns.
         //
-        // As `ok_char_nb < chars_min_nb` is part of `ok_s_len < self.s_len_max`
+        // As `ok_char_nb < chars_min_nb` is part of `ok_char_nb < self.s_char_nb_max`
         // we do not need to add this condition explicitly below.
         let s_is_to_be_filtered_again = !s_completes_previous_s
             && s_touches_right_boundary
             && !self.invalid_bytes_after_inp
-            && (ok_s_len < self.s_len_max || !grep_char_ok);
+            && (ok_char_nb < self.s_char_nb_max || !grep_char_ok);
 
         let s_satisfies_min_char_rule = ok_char_nb >= self.chars_min_nb as usize;
         let s_satisfies_grep_char_rule = grep_char_ok;
@@ -383,7 +373,7 @@ impl<'a> Iterator for SplitStr<'a> {
         };
 
         // Exit was 2: prepare the inner state for the next `next()` run.
-        if ok_s_len >= ok_s_len_max {
+        if ok_char_nb >= ok_char_nb_max {
             self.inp_start_p = self.p;
         };
         self.last_s_was_maybe_cut = s_is_maybe_cut;
diff --git a/src/main.rs b/src/main.rs
@@ -18,7 +18,8 @@
 //!
 //!  3. Each thread runs a search in `main::slice` == `scanner::input_buffer`. The
 //!  search is performed by `scanner::scan()`, which cuts the `scanner::input_buffer`
-//!  into smaller chunks of size `output_line_len` hereafter called `input_window`.
+//!  into smaller chunks of size 2*`output_line_char_nb_max` bytes hereafter called
+//!  `input_window`.
 //!
 //!  4. The `Decoder` runs through the `input_window`, searches for valid strings and
 //!  decodes them into UTF-8-chunks.
@@ -27,7 +28,7 @@
 //!  analyzed if parts of it satisfy certain filter conditions.
 //!
 //!  6. Doing so, the `helper::SplitStr` cuts the UTF-8-chunk into even smaller
-//!  `SplitStr`-chunks not longer than `output_line_len` and sends them back to the
+//!  `SplitStr`-chunks not longer than `output_line_char_nb_max` and sends them back to the
 //!  `scanner::scan()` loop.
 //!
 //!  7. There the `SplitStr`-chunk is packed into a `finding::Finding` object and
@@ -105,7 +106,8 @@ fn run() -> Result<(), anyhow::Error> {
                 Some(ref fname) => {
                     let f = File::create(&Path::new(fname.as_str()))?;
                     // There is at least one `Mission` in `MISSIONS`.
-                    let output_line_len = MISSIONS[0].output_line_len + OUTPUT_LINE_METADATA_LEN;
+                    let output_line_len =
+                        2 * MISSIONS[0].output_line_char_nb_max + OUTPUT_LINE_METADATA_LEN;
                     let f = LineWriter::with_capacity(output_line_len, f);
                     Box::new(f) as Box<dyn Write>
                 }
diff --git a/src/mission.rs b/src/mission.rs
@@ -9,8 +9,8 @@ use crate::counter_offset_default;
 use crate::encoding_default;
 use crate::input::ByteCounter;
 use crate::options::ARGS;
-use crate::options::OUTPUT_LINE_LEN_MIN;
-use crate::output_line_len_default;
+use crate::options::OUTPUT_LINE_CHAR_NB_MIN;
+use crate::output_line_char_nb_max_default;
 use anyhow::{anyhow, Context, Result};
 use encoding_rs::*;
 use lazy_static::lazy_static;
@@ -421,10 +421,10 @@ pub struct Mission {
     /// A filter, defining additional criteria for a finding to be printed.
     pub filter: Utf8Filter,
 
-    /// Maximum length of output-lines in UTF-8 bytes. Findings that do not fit,
-    /// will be wrapped to two or more lines. The label `+` indicates that this
-    /// line is the continuation of the previous line.
-    pub output_line_len: usize,
+    /// Maximum length of output-lines in UTF-8 characters. Findings that do not
+    /// fit, will be wrapped to two or more lines. The label `+` indicates that
+    /// this line is the continuation of the previous line.
+    pub output_line_char_nb_max: usize,
 
     /// The `encoding_rs` decoder has no direct support for ASCII. As a
     /// workaround, we simulate the missing ASCII-decoder with the
@@ -570,11 +570,11 @@ impl Missions {
         let flag_output_line_len =
             parse_integer!(flag_output_line_len, usize::from_str_radix, usize::from_str);
         if let Some(m) = flag_output_line_len {
-            if m < OUTPUT_LINE_LEN_MIN {
+            if m < OUTPUT_LINE_CHAR_NB_MIN {
                 return Err(anyhow!(
                     "minimum for `--output-line-len` is `{}`, \
                      you tried: `{}`.",
-                    OUTPUT_LINE_LEN_MIN,
+                    OUTPUT_LINE_CHAR_NB_MIN,
                     m
                 ));
             }
@@ -615,19 +615,19 @@ impl Missions {
                 },
             };
 
-            let output_line_len = match flag_output_line_len {
+            let output_line_char_nb_max = match flag_output_line_len {
                 Some(n) => n,
-                None => output_line_len_default!(),
+                None => output_line_char_nb_max_default!(),
             };
 
-            if output_line_len < OUTPUT_LINE_LEN_MIN {
+            if output_line_char_nb_max < OUTPUT_LINE_CHAR_NB_MIN {
                 return Err(anyhow!(
                     "Scanner {}: \
                      minimum for `--output-line-len` is `{}`, \
                      you tried: `{}`.",
                     char::from((mission_id + 97) as u8),
-                    OUTPUT_LINE_LEN_MIN,
-                    output_line_len,
+                    OUTPUT_LINE_CHAR_NB_MIN,
+                    output_line_char_nb_max,
                 ));
             }
 
@@ -703,7 +703,7 @@ impl Missions {
                 encoding,
                 chars_min_nb,
                 filter,
-                output_line_len,
+                output_line_char_nb_max,
                 mission_id: mission_id as u8,
                 print_encoding_as_ascii,
             });
diff --git a/src/options.rs b/src/options.rs
@@ -46,15 +46,15 @@ macro_rules! counter_offset_default {
 /// Default value when no `--output-line-len`
 /// command-line-argument is given. Must be `usize`.
 #[macro_export]
-macro_rules! output_line_len_default {
+macro_rules! output_line_char_nb_max_default {
     () => {
         60usize
     };
 }
 
 /// There must be space for at least 3 long Unicode characters,
 /// to guarantee progress in streaming. You want much longer lines.
-pub const OUTPUT_LINE_LEN_MIN: usize = 12;
+pub const OUTPUT_LINE_CHAR_NB_MIN: usize = 6;
 
 /// Message printed for command-line `--help`.
 const USAGE: &str = concat!(
@@ -84,8 +84,8 @@ Options:
     chars_min_default!(),
     ").
  -p FILE, --output=FILE         Print not to stdout but in file.
- -q NUM, --output-line-len=NUM  Output line length in UTF-8 bytes (default: ",
-    output_line_len_default!(),
+ -q NUM, --output-line-len=NUM  Output line length in UTF-8 characters (default: ",
+    output_line_char_nb_max_default!(),
     ").
  -s NUM, --counter-offset=NUM   Start counting input bytes with NUM (default: ",
     counter_offset_default!(),
diff --git a/src/scanner.rs b/src/scanner.rs