Skip to content

Commit f9d96c4

Browse files
naitohkou and Sutou Kouhei authored
Accept String as a pattern at non head (#106)
It supports non-head match cases such as StringScanner#scan_until.

If we use a String as a pattern, we can improve match performance. Here is the result of the included benchmark.

## CRuby

It shows that a String pattern (string_var) is 1.18x faster than a Regexp pattern (regexp).

```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
regexp 9.403M i/s - 9.548M times in 1.015459s (106.35ns/i)
regexp_var 9.162M i/s - 9.248M times in 1.009479s (109.15ns/i)
string 8.966M i/s - 9.274M times in 1.034343s (111.54ns/i)
string_var 11.051M i/s - 11.190M times in 1.012538s (90.49ns/i)
Calculating -------------------------------------
regexp 10.319M i/s - 28.209M times in 2.733707s (96.91ns/i)
regexp_var 10.032M i/s - 27.485M times in 2.739807s (99.68ns/i)
string 9.681M i/s - 26.897M times in 2.778397s (103.30ns/i)
string_var 12.162M i/s - 33.154M times in 2.726046s (82.22ns/i)

Comparison:
string_var: 12161920.6 i/s
regexp: 10318949.7 i/s - 1.18x slower
regexp_var: 10031617.6 i/s - 1.21x slower
string: 9680843.7 i/s - 1.26x slower
```

## JRuby

It shows that a String pattern (string) is 2.11x faster than a Regexp pattern (regexp_var).

```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
regexp 7.591M i/s - 7.544M times in 0.993780s (131.74ns/i)
regexp_var 6.143M i/s - 6.125M times in 0.997038s (162.77ns/i)
string 14.135M i/s - 14.079M times in 0.996067s (70.75ns/i)
string_var 14.079M i/s - 14.057M times in 0.998420s (71.03ns/i)
Calculating -------------------------------------
regexp 9.409M i/s - 22.773M times in 2.420268s (106.28ns/i)
regexp_var 10.116M i/s - 18.430M times in 1.821820s (98.85ns/i)
string 21.389M i/s - 42.404M times in 1.982519s (46.75ns/i)
string_var 20.897M i/s - 42.237M times in 2.021187s (47.85ns/i)

Comparison:
string: 21389191.1 i/s
string_var: 20897327.5 i/s - 1.02x slower
regexp_var: 10116464.7 i/s - 2.11x slower
regexp: 9409222.3 i/s - 2.27x slower
```

See: https://github.com/jruby/jruby/blob/be7815ec02356a58891c8727bb448f0c6a826d96/core/src/main/java/org/jruby/util/StringSupport.java#L1706-L1736

---------

Co-authored-by: Sutou Kouhei <[email protected]>
1 parent badf6db commit f9d96c4

File tree

4 files changed

+117
-33
lines changed

4 files changed

+117
-33
lines changed

benchmark/check_until.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
prelude: |-
2+
$LOAD_PATH.unshift(File.expand_path("lib"))
3+
require "strscan"
4+
scanner = StringScanner.new("test string")
5+
str = "string"
6+
reg = /string/
7+
benchmark:
8+
regexp: |
9+
scanner.check_until(/string/)
10+
regexp_var: |
11+
scanner.check_until(reg)
12+
string: |
13+
scanner.check_until("string")
14+
string_var: |
15+
scanner.check_until(str)

ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -262,17 +262,6 @@ private IRubyObject extractBegLen(Ruby runtime, int beg, int len) {
262262
// MRI: strscan_do_scan
263263
private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succptr, boolean getstr, boolean headonly) {
264264
final Ruby runtime = context.runtime;
265-
266-
if (headonly) {
267-
if (!(regex instanceof RubyRegexp)) {
268-
regex = regex.convertToString();
269-
}
270-
} else {
271-
if (!(regex instanceof RubyRegexp)) {
272-
throw runtime.newTypeError("wrong argument type " + regex.getMetaClass() + " (expected Regexp)");
273-
}
274-
}
275-
276265
check(context);
277266

278267
ByteList strBL = str.getByteList();
@@ -310,9 +299,9 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
310299
}
311300
if (ret < 0) return context.nil;
312301
} else {
313-
RubyString pattern = (RubyString) regex;
302+
RubyString pattern = regex.convertToString();
314303

315-
str.checkEncoding(pattern);
304+
Encoding patternEnc = str.checkEncoding(pattern);
316305

317306
if (restLen() < pattern.size()) {
318307
return context.nil;
@@ -321,11 +310,18 @@ private IRubyObject scan(ThreadContext context, IRubyObject regex, boolean succp
321310
ByteList patternBL = pattern.getByteList();
322311
int patternSize = patternBL.realSize();
323312

324-
if (ByteList.memcmp(strBL.unsafeBytes(), strBeg + curr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) {
325-
return context.nil;
313+
if (headonly) {
314+
if (ByteList.memcmp(strBL.unsafeBytes(), strBeg + curr, patternBL.unsafeBytes(), patternBL.begin(), patternSize) != 0) {
315+
return context.nil;
316+
}
317+
setRegisters(patternSize);
318+
} else {
319+
int pos = StringSupport.index(strBL, patternBL, strBeg + curr, patternEnc);
320+
if (pos == -1) {
321+
return context.nil;
322+
}
323+
setRegisters(patternSize + pos - curr);
326324
}
327-
328-
setRegisters(patternSize);
329325
}
330326

331327
setMatched();

ext/strscan/strscan.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -686,14 +686,6 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
686686
{
687687
struct strscanner *p;
688688

689-
if (headonly) {
690-
if (!RB_TYPE_P(pattern, T_REGEXP)) {
691-
StringValue(pattern);
692-
}
693-
}
694-
else {
695-
Check_Type(pattern, T_REGEXP);
696-
}
697689
GET_SCANNER(self, p);
698690

699691
CLEAR_MATCH_STATUS(p);
@@ -714,14 +706,25 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
714706
}
715707
}
716708
else {
709+
StringValue(pattern);
717710
rb_enc_check(p->str, pattern);
718711
if (S_RESTLEN(p) < RSTRING_LEN(pattern)) {
719712
return Qnil;
720713
}
721-
if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
722-
return Qnil;
714+
715+
if (headonly) {
716+
if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
717+
return Qnil;
718+
}
719+
set_registers(p, RSTRING_LEN(pattern));
720+
} else {
721+
long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern),
722+
CURPTR(p), S_RESTLEN(p), rb_enc_get(pattern));
723+
if (pos == -1) {
724+
return Qnil;
725+
}
726+
set_registers(p, RSTRING_LEN(pattern) + pos);
723727
}
724-
set_registers(p, RSTRING_LEN(pattern));
725728
}
726729

727730
MATCHED(p);

test/strscan/test_stringscanner.rb

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,15 @@ def test_concat
262262
end
263263

264264
def test_scan
265-
s = create_string_scanner('stra strb strc', true)
265+
s = create_string_scanner("stra strb\0strc", true)
266266
tmp = s.scan(/\w+/)
267267
assert_equal 'stra', tmp
268268

269269
tmp = s.scan(/\s+/)
270270
assert_equal ' ', tmp
271271

272272
assert_equal 'strb', s.scan(/\w+/)
273-
assert_equal ' ', s.scan(/\s+/)
273+
assert_equal "\u0000", s.scan(/\0/)
274274

275275
tmp = s.scan(/\w+/)
276276
assert_equal 'strc', tmp
@@ -312,11 +312,14 @@ def test_scan
312312
end
313313

314314
def test_scan_string
315-
s = create_string_scanner('stra strb strc')
315+
s = create_string_scanner("stra strb\0strc")
316316
assert_equal 'str', s.scan('str')
317317
assert_equal 'str', s[0]
318318
assert_equal 3, s.pos
319319
assert_equal 'a ', s.scan('a ')
320+
assert_equal 'strb', s.scan('strb')
321+
assert_equal "\u0000", s.scan("\0")
322+
assert_equal 'strc', s.scan('strc')
320323

321324
str = 'stra strb strc'.dup
322325
s = create_string_scanner(str, false)
@@ -668,13 +671,47 @@ def test_exist_p
668671
assert_equal(nil, s.exist?(/e/))
669672
end
670673

671-
def test_exist_p_string
674+
def test_exist_p_invalid_argument
672675
s = create_string_scanner("test string")
673676
assert_raise(TypeError) do
674-
s.exist?(" ")
677+
s.exist?(1)
675678
end
676679
end
677680

681+
def test_exist_p_string
682+
omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
683+
s = create_string_scanner("test string")
684+
assert_equal(3, s.exist?("s"))
685+
assert_equal(0, s.pos)
686+
s.scan("test")
687+
assert_equal(2, s.exist?("s"))
688+
assert_equal(4, s.pos)
689+
assert_equal(nil, s.exist?("e"))
690+
end
691+
692+
def test_scan_until
693+
s = create_string_scanner("Foo Bar\0Baz")
694+
assert_equal("Foo", s.scan_until(/Foo/))
695+
assert_equal(3, s.pos)
696+
assert_equal(" Bar", s.scan_until(/Bar/))
697+
assert_equal(7, s.pos)
698+
assert_equal(nil, s.skip_until(/Qux/))
699+
assert_equal("\u0000Baz", s.scan_until(/Baz/))
700+
assert_equal(11, s.pos)
701+
end
702+
703+
def test_scan_until_string
704+
omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
705+
s = create_string_scanner("Foo Bar\0Baz")
706+
assert_equal("Foo", s.scan_until("Foo"))
707+
assert_equal(3, s.pos)
708+
assert_equal(" Bar", s.scan_until("Bar"))
709+
assert_equal(7, s.pos)
710+
assert_equal(nil, s.skip_until("Qux"))
711+
assert_equal("\u0000Baz", s.scan_until("Baz"))
712+
assert_equal(11, s.pos)
713+
end
714+
678715
def test_skip_until
679716
s = create_string_scanner("Foo Bar Baz")
680717
assert_equal(3, s.skip_until(/Foo/))
@@ -684,6 +721,16 @@ def test_skip_until
684721
assert_equal(nil, s.skip_until(/Qux/))
685722
end
686723

724+
def test_skip_until_string
725+
omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
726+
s = create_string_scanner("Foo Bar Baz")
727+
assert_equal(3, s.skip_until("Foo"))
728+
assert_equal(3, s.pos)
729+
assert_equal(4, s.skip_until("Bar"))
730+
assert_equal(7, s.pos)
731+
assert_equal(nil, s.skip_until("Qux"))
732+
end
733+
687734
def test_check_until
688735
s = create_string_scanner("Foo Bar Baz")
689736
assert_equal("Foo", s.check_until(/Foo/))
@@ -693,6 +740,16 @@ def test_check_until
693740
assert_equal(nil, s.check_until(/Qux/))
694741
end
695742

743+
def test_check_until_string
744+
omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
745+
s = create_string_scanner("Foo Bar Baz")
746+
assert_equal("Foo", s.check_until("Foo"))
747+
assert_equal(0, s.pos)
748+
assert_equal("Foo Bar", s.check_until("Bar"))
749+
assert_equal(0, s.pos)
750+
assert_equal(nil, s.check_until("Qux"))
751+
end
752+
696753
def test_search_full
697754
s = create_string_scanner("Foo Bar Baz")
698755
assert_equal(8, s.search_full(/Bar /, false, false))
@@ -705,6 +762,19 @@ def test_search_full
705762
assert_equal(11, s.pos)
706763
end
707764

765+
def test_search_full_string
766+
omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby"
767+
s = create_string_scanner("Foo Bar Baz")
768+
assert_equal(8, s.search_full("Bar ", false, false))
769+
assert_equal(0, s.pos)
770+
assert_equal("Foo Bar ", s.search_full("Bar ", false, true))
771+
assert_equal(0, s.pos)
772+
assert_equal(8, s.search_full("Bar ", true, false))
773+
assert_equal(8, s.pos)
774+
assert_equal("Baz", s.search_full("az", true, true))
775+
assert_equal(11, s.pos)
776+
end
777+
708778
def test_peek
709779
s = create_string_scanner("test string")
710780
assert_equal("test st", s.peek(7))

0 commit comments

Comments
 (0)