8000 Improve comment parse performance (#245) · ruby/rexml@4349091 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4349091

Browse files
authored
Improve comment parse performance (#245)
## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) top_level 999.440 5.058k 922.416 3.340k i/s - 100.000 times in 0.100056s 0.019770s 0.108411s 0.029936s in_doctype 1.063k 4.890k 980.498 3.341k i/s - 100.000 times in 0.094116s 0.020449s 0.101989s 0.029927s after_doctype 638.321 1.304k 603.952 1.153k i/s - 100.000 times in 0.156661s 0.076710s 0.165576s 0.086748s Comparison: top_level master: 5058.2 i/s master(YJIT): 3340.5 i/s - 1.51x slower rexml 3.4.1: 999.4 i/s - 5.06x slower 3.4.1(YJIT): 922.4 i/s - 5.48x slower in_doctype master: 4890.2 i/s master(YJIT): 3341.5 i/s - 1.46x slower rexml 3.4.1: 1062.5 i/s - 4.60x slower 3.4.1(YJIT): 980.5 i/s - 4.99x slower after_doctype master: 1303.6 i/s master(YJIT): 1152.8 i/s - 1.13x slower rexml 3.4.1: 638.3 i/s - 2.04x slower 3.4.1(YJIT): 604.0 i/s - 2.16x slower ``` - YJIT=ON : 1.90x - 3.62x faster - YJIT=OFF : 2.04x - 5.06x faster
1 parent 64a709e commit 4349091

File tree

3 files changed

+69
-27
lines changed

3 files changed

+69
-27
lines changed

benchmark/parse_comment.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
loop_count: 100
2+
contexts:
3+
- gems:
4+
rexml: 3.2.6
5+
require: false
6+
prelude: require 'rexml'
7+
- name: master
8+
prelude: |
9+
$LOAD_PATH.unshift(File.expand_path("lib"))
10+
require 'rexml'
11+
- name: 3.2.6(YJIT)
12+
gems:
13+
rexml: 3.2.6
14+
require: false
15+
prelude: |
16+
require 'rexml'
17+
RubyVM::YJIT.enable
18+
- name: master(YJIT)
19+
prelude: |
20+
$LOAD_PATH.unshift(File.expand_path("lib"))
21+
require 'rexml'
22+
RubyVM::YJIT.enable
23+
24+
prelude: |
25+
require 'rexml/document'
26+
27+
SIZE = 100000
28+
29+
top_level_xml = "<!--" + "a" * SIZE + "-->\n"
30+
in_doctype_xml = "<!DOCTYPE foo [<!--" + "a" * SIZE + "-->]>"
31+
after_doctype_xml = "<root/><!--" + "a" * SIZE + "-->"
32+
33+
benchmark:
34+
'top_level' : REXML::Document.new(top_level_xml)
35+
'in_doctype' : REXML::Document.new(in_doctype_xml)
36+
'after_doctype' : REXML::Document.new(after_doctype_xml)

lib/rexml/parsers/baseparser.rb

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -277,14 +277,7 @@ def pull_event
277277
return process_instruction
278278
elsif @source.match?("<!", true)
279279
if @source.match?("--", true)
280-
md = @source.match(/(.*?)-->/um, true)
281-
if md.nil?
282-
raise REXML::ParseException.new("Unclosed comment", @source)
283-
end
284-
if /--|-\z/.match?(md[1])
285-
raise REXML::ParseException.new("Malformed comment", @source)
286-
end
287-
return [ :comment, md[1] ]
280+
return [ :comment, process_comment ]
288281
elsif @source.match?("DOCTYPE", true)
289282
base_error_message = "Malformed DOCTYPE"
290283
unless @source.match?(/\s+/um, true)
@@ -417,12 +410,8 @@ def pull_event
417410
raise REXML::ParseException.new(message, @source)
418411
end
419412
return [:notationdecl, name, *id]
420-
elsif md = @source.match(/--(.*?)-->/um, true)
421-
case md[1]
422-
when /--/, /-\z/
423-
raise REXML::ParseException.new("Malformed comment", @source)
424-
end
425-
return [ :comment, md[1] ] if md
413+
elsif @source.match?("--", true)
414+
return [ :comment, process_comment ]
426415
end
427416
elsif match = @source.match(/(%.*?;)\s*/um, true)
428417
return [ :externalentity, match[1] ]
@@ -463,14 +452,8 @@ def pull_event
463452
md = @source.match(/([^>]*>)/um)
464453
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
465454
raise REXML::ParseException.new("Malformed node", @source) unless md
466-
if md[0][0] == ?-
467-
md = @source.match(/--(.*?)-->/um, true)
468-
469-
if md.nil? || /--|-\z/.match?(md[1])
470-
raise REXML::ParseException.new("Malformed comment", @source)
471-
end
472-
473-
return [ :comment, md[1] ]
455+
if @source.match?("--", true)
456+
return [ :comment, process_comment ]
474457
elsif @source.match?("[CDATA[", true)
475458
text = @source.read_until("]]>")
476459
if text.chomp!("]]>")
@@ -738,6 +721,18 @@ def parse_id_invalid_details(accept_external_id:,
738721
end
739722
end
740723

724+
def process_comment
725+
text = @source.read_until("-->")
726+
unless text.chomp!("-->")
727+
raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source)
728+
end
729+
730+
if text.include? "--" or text.end_with?("-")
731+
raise REXML::ParseException.new("Malformed comment", @source)
732+
end
733+
text
734+
end
735+
741736
def process_instruction
742737
name = parse_name("Malformed XML: Invalid processing instruction node")
743738
if @source.match?(/\s+/um, true)

test/parse/test_comment.rb

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def test_toplevel_unclosed_comment
1717
parse("<!--")
1818
end
1919
assert_equal(<<~DETAIL, exception.to_s)
20-
Unclosed comment
20+
Unclosed comment: Missing end '-->'
2121
Line: 1
2222
Position: 4
2323
Last 80 unconsumed characters:
@@ -48,6 +48,18 @@ def test_toplevel_malformed_comment_end
4848
DETAIL
4949
end
5050

51+
def test_doctype_unclosed_comment
52+
exception = assert_raise(REXML::ParseException) do
53+
parse("<!DOCTYPE foo [<!--")
54+
end
55+
assert_equal(<<~DETAIL, exception.to_s)
56+
Unclosed comment: Missing end '-->'
57+
Line: 1
58+
Position: 19
59+
Last 80 unconsumed characters:
60+
DETAIL
61+
end
62+
5163
def test_doctype_malformed_comment_inner
5264
exception = assert_raise(REXML::ParseException) do
5365
parse("<!DOCTYPE foo [<!-- -- -->")
@@ -72,16 +84,15 @@ def test_doctype_malformed_comment_end
7284
DETAIL
7385
end
7486

75-
def test_after_doctype_malformed_comment_short
87+
def test_after_doctype_unclosed_comment
7688
exception = assert_raise(REXML::ParseException) do
7789
parse("<a><!-->")
7890
end
79-
assert_equal(<<~DETAIL.chomp, exception.to_s)
80-
Malformed comment
91+
assert_equal(<<~DETAIL, exception.to_s)
92+
Unclosed comment: Missing end '-->'
8193
Line: 1
8294
Position: 8
8395
Last 80 unconsumed characters:
84-
-->
8596
DETAIL
8697
end
8798

0 commit comments

Comments
 (0)
0