From b97e454ceb2e1719a487bfebaae3da4a706a854b Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 16 Feb 2025 16:48:06 +0900 Subject: [PATCH 01/12] Bump version --- lib/rexml/rexml.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index a653f028..bf3c0d32 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.4.1" + VERSION = "3.4.2" REVISION = "" Copyright = COPYRIGHT From 64a709e74551d5968f2241a772876f4b0c8dea22 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 2 Mar 2025 11:38:54 +0900 Subject: [PATCH 02/12] Improve CDATA parse performance (#244) ## Why? GitHub: fix #243 ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) dom 648.361 1.178k 591.590 1.046k i/s - 100.000 times in 0.154235s 0.084913s 0.169036s 0.095627s sax 699.061 1.378k 651.148 1.196k i/s - 100.000 times in 0.143049s 0.072549s 0.153575s 0.083611s pull 699.271 1.379k 660.275 1.210k i/s - 100.000 times in 0.143006s 0.072527s 0.151452s 0.082622s stream 701.725 1.383k 659.483 1.228k i/s - 100.000 times in 0.142506s 0.072307s 0.151634s 0.081455s Comparison: dom master: 1177.7 i/s master(YJIT): 1045.7 i/s - 1.13x slower rexml 3.4.1: 648.4 i/s - 1.82x slower 3.4.1(YJIT): 591.6 i/s - 1.99x slower sax master: 1378.4 i/s master(YJIT): 1196.0 i/s - 1.15x slower rexml 3.4.1: 699.1 i/s - 1.97x slower 3.4.1(YJIT): 651.1 i/s - 2.12x slower pull master: 1378.8 i/s master(YJIT): 1210.3 i/s - 1.14x slower rexml 3.4.1: 699.3 i/s - 1.97x slower 3.4.1(YJIT): 660.3 i/s - 2.09x slower stream master: 1383.0 i/s master(YJIT): 1227.7 i/s - 1.13x slower rexml 3.4.1: 701.7 i/s - 1.97x slower 3.4.1(YJIT): 659.5 i/s - 2.10x slower ``` - YJIT=ON : 1.76x - 1.83x faster - YJIT=OFF : 1.82x - 1.97x faster Reported by 
Masamune. Thanks!!! Co-authored-by: Sutou Kouhei --- benchmark/parse_cdata.yaml | 50 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 +++++-- lib/rexml/source.rb | 2 +- test/parse/test_cdata.rb | 20 ++++++++++++- 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 benchmark/parse_cdata.yaml diff --git a/benchmark/parse_cdata.yaml b/benchmark/parse_cdata.yaml new file mode 100644 index 00000000..cde04306 --- /dev/null +++ b/benchmark/parse_cdata.yaml @@ -0,0 +1,50 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + def build_xml(size) + xml = "\n" + + "Test\n" + + "\n" + end + xml = build_xml(100000) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? 
+ parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 44aacfa2..e666c2af 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -471,9 +471,13 @@ def pull_event end return [ :comment, md[1] ] - else - md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) - return [ :cdata, md[1] ] if md + elsif @source.match?("[CDATA[", true) + text = @source.read_until("]]>") + if text.chomp!("]]>") + return [ :cdata, text ] + else + raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) + end end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 5ba5ab12..3ec1141e 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -67,7 +67,7 @@ class Source module Private SCANNER_RESET_SIZE = 100000 PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"', "<"] + pre_defined_terms = ["'", '"', "<", "]]>"] if StringScanner::Version < "3.1.1" pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb index b5f1a3bc..c742d6a1 100644 --- a/test/parse/test_cdata.rb +++ b/test/parse/test_cdata.rb @@ -7,10 +7,28 @@ module REXMLTests class TestParseCData < Test::Unit::TestCase include Test::Unit::CoreAssertions + def parse(xml) + REXML::Document.new(xml) + end + def test_linear_performance_gt seq = [10000, 50000, 100000, 150000, 200000] assert_linear_performance(seq, rehearsal: 10) do |n| - REXML::Document.new('" * n + ' ]]>') + parse('" * n + ' ]]>') + end + end + + class TestInvalid < self + def test_unclosed_cdata + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed CDATA: Missing end ']]>' + Line: 1 + Position: 25 + Last 80 
unconsumed characters: + DETAIL end end end From 434909171ef3756c1ca2b84f5c90923a72c6a591 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 3 Mar 2025 13:47:31 +0900 Subject: [PATCH 03/12] Improve comment parse performance (#245) ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) top_level 999.440 5.058k 922.416 3.340k i/s - 100.000 times in 0.100056s 0.019770s 0.108411s 0.029936s in_doctype 1.063k 4.890k 980.498 3.341k i/s - 100.000 times in 0.094116s 0.020449s 0.101989s 0.029927s after_doctype 638.321 1.304k 603.952 1.153k i/s - 100.000 times in 0.156661s 0.076710s 0.165576s 0.086748s Comparison: top_level master: 5058.2 i/s master(YJIT): 3340.5 i/s - 1.51x slower rexml 3.4.1: 999.4 i/s - 5.06x slower 3.4.1(YJIT): 922.4 i/s - 5.48x slower in_doctype master: 4890.2 i/s master(YJIT): 3341.5 i/s - 1.46x slower rexml 3.4.1: 1062.5 i/s - 4.60x slower 3.4.1(YJIT): 980.5 i/s - 4.99x slower after_doctype master: 1303.6 i/s master(YJIT): 1152.8 i/s - 1.13x slower rexml 3.4.1: 638.3 i/s - 2.04x slower 3.4.1(YJIT): 604.0 i/s - 2.16x slower ``` - YJIT=ON : 1.90x - 3.62x faster - YJIT=OFF : 2.04x - 5.06x faster --- benchmark/parse_comment.yaml | 36 ++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 39 ++++++++++++++------------------- test/parse/test_comment.rb | 21 +++++++++++++----- 3 files changed, 69 insertions(+), 27 deletions(-) create mode 100644 benchmark/parse_comment.yaml diff --git a/benchmark/parse_comment.yaml b/benchmark/parse_comment.yaml new file mode 100644 index 00000000..a0a3a771 --- /dev/null +++ b/benchmark/parse_comment.yaml @@ -0,0 +1,36 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | 
+ require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + SIZE = 100000 + + top_level_xml = "\n" + in_doctype_xml = "]>" + after_doctype_xml = "" + +benchmark: + 'top_level' : REXML::Document.new(top_level_xml) + 'in_doctype' : REXML::Document.new(in_doctype_xml) + 'after_doctype' : REXML::Document.new(after_doctype_xml) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e666c2af..61d38ae2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -277,14 +277,7 @@ def pull_event return process_instruction elsif @source.match?("/um, true) - if md.nil? - raise REXML::ParseException.new("Unclosed comment", @source) - end - if /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] + return [ :comment, process_comment ] elsif @source.match?("DOCTYPE", true) base_error_message = "Malformed DOCTYPE" unless @source.match?(/\s+/um, true) @@ -417,12 +410,8 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - elsif md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ - raise REXML::ParseException.new("Malformed comment", @source) - end - return [ :comment, md[1] ] if md + elsif @source.match?("--", true) + return [ :comment, process_comment ] end elsif match = @source.match(/(%.*?;)\s*/um, true) return [ :externalentity, match[1] ] @@ -463,14 +452,8 @@ def pull_event md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][0] == ?- - md = @source.match(/--(.*?)-->/um, true) - - if md.nil? 
|| /--|-\z/.match?(md[1]) - raise REXML::ParseException.new("Malformed comment", @source) - end - - return [ :comment, md[1] ] + if @source.match?("--", true) + return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) text = @source.read_until("]]>") if text.chomp!("]]>") @@ -738,6 +721,18 @@ def parse_id_invalid_details(accept_external_id:, end end + def process_comment + text = @source.read_until("-->") + unless text.chomp!("-->") + raise REXML::ParseException.new("Unclosed comment: Missing end '-->'", @source) + end + + if text.include? "--" or text.end_with?("-") + raise REXML::ParseException.new("Malformed comment", @source) + end + text + end + def process_instruction name = parse_name("Malformed XML: Invalid processing instruction node") if @source.match?(/\s+/um, true) diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 4475dca7..c573e711 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -17,7 +17,7 @@ def test_toplevel_unclosed_comment parse("' Line: 1 Position: 4 Last 80 unconsumed characters: @@ -48,6 +48,18 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("' + Line: 1 + Position: 19 + Last 80 unconsumed characters: + DETAIL + end + def test_doctype_malformed_comment_inner exception = assert_raise(REXML::ParseException) do parse("") @@ -72,16 +84,15 @@ def test_doctype_malformed_comment_end DETAIL end - def test_after_doctype_malformed_comment_short + def test_after_doctype_unclosed_comment exception = assert_raise(REXML::ParseException) do parse("") end - assert_equal(<<~DETAIL.chomp, exception.to_s) - Malformed comment + assert_equal(<<~DETAIL, exception.to_s) + Unclosed comment: Missing end '-->' Line: 1 Position: 8 Last 80 unconsumed characters: - --> DETAIL end From a5f31c49be106011c4d96cb0e308ebbba118d192 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 06:20:42 +0900 
Subject: [PATCH 04/12] Improve CDATA and comment parse performance (#246) ## Why? Since `` are malformed node, they do not need to be checked before comments and CDATA. ## Benchmark : comment (after_doctype) ``` $ benchmark-driver benchmark/parse_comment.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) after_doctype 1.306k 5.586k 1.152k 3.569k i/s - 100.000 times in 0.076563s 0.017903s 0.086822s 0.028020s Comparison: after_doctype after: 5585.7 i/s after(YJIT): 3568.9 i/s - 1.57x slower before: 1306.1 i/s - 4.28x slower before(YJIT): 1151.8 i/s - 4.85x slower ``` - YJIT=ON : 3.09x faster - YJIT=OFF : 4.28x faster ## Benchmark : CDATA ``` $ benchmark-driver benchmark/parse_cdata.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 1.269k 5.548k 1.053k 3.072k i/s - 100.000 times in 0.078808s 0.018026s 0.094976s 0.032553s sax 1.399k 8.244k 1.220k 4.460k i/s - 100.000 times in 0.071458s 0.012130s 0.081958s 0.022422s pull 1.411k 8.319k 1.260k 4.806k i/s - 100.000 times in 0.070883s 0.012021s 0.079335s 0.020809s stream 1.420k 8.320k 1.254k 4.728k i/s - 100.000 times in 0.070406s 0.012019s 0.079738s 0.021149s Comparison: dom after: 5547.5 i/s after(YJIT): 3071.9 i/s - 1.81x slower before: 1268.9 i/s - 4.37x slower before(YJIT): 1052.9 i/s - 5.27x slower sax after: 8244.0 i/s after(YJIT): 4459.9 i/s - 1.85x slower before: 1399.4 i/s - 5.89x slower before(YJIT): 1220.1 i/s - 6.76x slower pull after: 8318.8 i/s after(YJIT): 4805.6 i/s - 1.73x slower before: 1410.8 i/s - 5.90x slower before(YJIT): 1260.5 i/s - 6.60x slower stream after: 8320.2 i/s after(YJIT): 4728.4 i/s - 1.76x slower before: 1420.3 i/s - 5.86x slower before(YJIT): 1254.1 i/s - 6.63x slower ``` - YJIT=ON : 2.91x - 3.80x faster - YJIT=OFF : 4.37x - 5.90x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 6 ++---- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 
deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 61d38ae2..de85aebd 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -449,9 +449,7 @@ def pull_event end return [ :end_element, last_tag ] elsif @source.match?("!", true) - md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" - raise REXML::ParseException.new("Malformed node", @source) unless md if @source.match?("--", true) return [ :comment, process_comment ] elsif @source.match?("[CDATA[", true) @@ -461,9 +459,9 @@ def pull_event else raise REXML::ParseException.new("Malformed CDATA: Missing end ']]>'", @source) end + else + raise REXML::ParseException.new("Malformed node: Started with '") From a85203e88c8f50f64140fb50492cf9dbe3d79301 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 5 Mar 2025 09:45:19 +0900 Subject: [PATCH 05/12] Raise appropriate exception when failing to match start tag in DOCTYPE (#247) ## Why? Added exception to make the process easier to understand. 
--- lib/rexml/parsers/baseparser.rb | 5 +++-- test/parse/test_comment.rb | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index de85aebd..750b1697 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -412,14 +412,15 @@ def pull_event return [:notationdecl, name, *id] elsif @source.match?("--", true) return [ :comment, process_comment ] + else + raise REXML::ParseException.new("Malformed node: Started with '/um, true) @document_status = :after_doctype return [ :end_doctype ] - end - if @document_status == :in_doctype + else raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) end end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb index 5349c18e..6339835d 100644 --- a/test/parse/test_comment.rb +++ b/test/parse/test_comment.rb @@ -48,6 +48,19 @@ def test_toplevel_malformed_comment_end DETAIL end + def test_doctype_malformed_node + exception = assert_raise(REXML::ParseException) do + parse(" Date: Thu, 3 Apr 2025 03:45:35 -0400 Subject: [PATCH 06/12] Fix docs typo in code example (#248) --- lib/rexml/document.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index d1747dd4..1960012c 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -309,8 +309,8 @@ def stand_alone? end # :call-seq: - # doc.write(output=$stdout, indent=-1, transtive=false, ie_hack=false, encoding=nil) - # doc.write(options={:output => $stdout, :indent => -1, :transtive => false, :ie_hack => false, :encoding => nil}) + # doc.write(output=$stdout, indent=-1, transitive=false, ie_hack=false, encoding=nil) + # doc.write(options={:output => $stdout, :indent => -1, :transitive => false, :ie_hack => false, :encoding => nil}) # # Write the XML tree out, optionally with indent. 
This writes out the # entire XML document, including XML declarations, doctype declarations, From d944fa478a972febe9c3ad2cf35232223d391597 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 3 May 2025 09:03:12 +0900 Subject: [PATCH 07/12] NEWS.md : Fix the mentioned of the PR in CVE-2024-35176. (#253) I think the mention of CVE-2024-35176 in NEWS.md is incorrect. ``` - Improved parse performance when an attribute has many ' characters. --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 51a45cab..7f95d829 100644 --- a/NEWS.md +++ b/NEWS.md @@ -386,7 +386,7 @@ * Patch by NAITOH Jun. - * Improved parse performance when an attribute has many `<`s. + * Improved parse performance when an attribute has many `>`s. * GH-126 From de6f40ed8749dd6ab4b7c4b80494a824f7f9027a Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Sat, 3 May 2025 09:21:27 +0900 Subject: [PATCH 08/12] Fix reverse sort in xpath_parser (#251) The code below was failing with `REXML::XPathParser#sort': undefined method '-@' for an instance of Array` ```ruby d = REXML::Document.new("") matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") # Before: error # After: [, , ] ``` This pull request will fix it. 
--- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 5eb1e5a9..f86a87e6 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -671,7 +671,7 @@ def sort(array_of_nodes, order) if order == :forward index else - -index + index.map(&:-@) end end ordered.collect do |_index, node| diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 1dacd69d..53264a9e 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,6 +416,12 @@ def test_preceding assert_equal( 4, cs.length ) end + def test_preceding_sibling + d = REXML::Document.new("") + matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + assert_equal(["e", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From 249d770b4ead129abf475708e84e3f1f7908962a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 6 May 2025 21:33:00 +0900 Subject: [PATCH 09/12] Fix duplicate responses in XPath following, following-sibling, preceding, preceding-sibling (#255) ## Why? 
See: https://github.com/ruby/rexml/pull/251#issuecomment-2845103143 ## Expected values - XPath : a/d/preceding::* => ["d", "c", "b"] ```xml ``` - XPath : a/d/following::* => ["d", "e", "f"] ```xml ``` - XPath : a/b/x/following-sibling::* => ["c", "d", "e"] ```xml ``` - XPath : a/b/x/following-sibling::* => ["c", "d", "x", "e"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "d", "c"] ```xml ``` - XPath : a/b/x/preceding-sibling::* => ["e", "x", "d", "c"] ```xml ``` - XPath : //a/following-sibling::*[1] => ["w", "x", "y", "z"] ```xml ``` --- lib/rexml/xpath_parser.rb | 2 +- test/xpath/test_base.rb | 97 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index f86a87e6..cde2e5d5 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -144,7 +144,7 @@ def match(path_stack, nodeset) result = expr(path_stack, nodeset) case result when Array # nodeset - unnode(result) + unnode(result).uniq else [result] end diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 53264a9e..b923eed2 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -416,12 +416,103 @@ def test_preceding assert_equal( 4, cs.length ) end - def test_preceding_sibling - d = REXML::Document.new("") - matches = REXML::XPath.match(d, "a/b/x/preceding-sibling::node()") + def test_preceding_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/preceding::*") + assert_equal(["d", "c", "b"], matches.map(&:name)) + end + + def test_following_multiple + source = <<-XML + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/d/following::*") + assert_equal(["d", "e", "f"], matches.map(&:name)) + end + + def test_following_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, 
"a/b/x/following-sibling::*") + assert_equal(["c", "d", "e"], matches.map(&:name)) + end + + def test_following_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/following-sibling::*") + assert_equal(["c", "d", "x", "e"], matches.map(&:name)) + end + + def test_following_sibling_predicates + source = <<-XML + + XML + doc = REXML::Document.new(source) + # Finds a node flowing + matches = REXML::XPath.match(doc, "//a/following-sibling::*[1]") + assert_equal(["w", "x", "y", "z"], matches.map(&:name)) + end + + def test_preceding_sibling_across_multiple_nodes + source = <<-XML + + + + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") assert_equal(["e", "d", "c"], matches.map(&:name)) end + def test_preceding_sibling_within_single_node + source = <<-XML + + + + + + XML + doc = REXML::Document.new(source) + matches = REXML::XPath.match(doc, "a/b/x/preceding-sibling::*") + assert_equal(["e", "x", "d", "c"], matches.map(&:name)) + end + def test_following d = Document.new "" start = XPath.first( d, "/a/b[@id='0']" ) From cd575a10cac58eb47f235ed186060ac65ffb5284 Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Wed, 7 May 2025 21:02:31 +0900 Subject: [PATCH 10/12] Deprecate accepting array as an element in XPath.match, first and each (#252) `XPath.match`, `XPath.first`, `XPath.each`, `XPathParser#parse` and `XPathParser#match` accepted nodeset as element. This pull request changes the first parameter of these method to be an element instead of nodeset. Passing nodeset will be deprecated. ```ruby # Documented usage. OK REXML::XPath.match(element, xpath) # Undocumented usage. Deprecate in this pull request nodeset = [element] REXML::XPath.match(nodeset, xpath) ``` ### Background #249 will introduce a temporary cache. 
```ruby def parse path, nodeset path_stack = @parser.parse( path ) nodeset.first.document.send(:enable_cache) do match( path_stack, nodeset ) end end ``` But the signature `XPathParser#match(path, nodeset)` does not guarantee that all nodes in the nodeset have the same root document. So cache does not work in the code below. It's still slow. ```ruby REXML::XPath.match(2.times.map { REXML::Document.new(''*400+''*400) }, 'a//a') ``` The interface is holding us back, so I propose to drop accepting array as element. This change is a backward incompatibility, but it just drops an undocumented feature. I think only the test code was unintentionally using this feature. ### XPath.match with array XPath.match only traverses the first element of the array for some selectors. ```ruby nodeset = [REXML::Document.new(""), REXML::Document.new("")] REXML::XPath.match(nodeset, "a/*") #=> [, ] REXML::XPath.match(nodeset, "//a/*") #=> [] # I expect [, ] but the second document is ignored ``` It indicates that `XPath.match` is not designed to search inside multiple nodes/documents. --------- Co-authored-by: Sutou Kouhei --- lib/rexml/xpath.rb | 3 --- lib/rexml/xpath_parser.rb | 22 ++++++++++++---------- test/test_jaxen.rb | 16 ++++++++++------ test/xpath/test_base.rb | 17 ++++++++++++++--- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index a0921bd8..666d764f 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -35,7 +35,6 @@ def XPath::first(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path, element).flatten[0] end @@ -64,7 +63,6 @@ def XPath::each(element, path=nil, namespaces=nil, variables={}, options={}, &bl parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? 
Array parser.parse(path, element).each( &block ) end @@ -74,7 +72,6 @@ def XPath::match(element, path=nil, namespaces=nil, variables={}, options={}) parser.namespaces = namespaces parser.variables = variables path = "*" unless path - element = [element] unless element.kind_of? Array parser.parse(path,element) end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index cde2e5d5..8440015b 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -76,19 +76,19 @@ def variables=( vars={} ) @variables = vars end - def parse path, nodeset + def parse path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end - def get_first path, nodeset + def get_first path, node path_stack = @parser.parse( path ) - first( path_stack, nodeset ) + first( path_stack, node ) end - def predicate path, nodeset + def predicate path, node path_stack = @parser.parse( path ) - match( path_stack, nodeset ) + match( path_stack, node ) end def []=( variable_name, value ) @@ -136,11 +136,13 @@ def first( path_stack, node ) end - def match(path_stack, nodeset) - nodeset = nodeset.collect.with_index do |node, i| - position = i + 1 - XPathNode.new(node, position: position) + def match(path_stack, node) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? 
+ node = node.first end + nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result when Array # nodeset diff --git a/test/test_jaxen.rb b/test/test_jaxen.rb index 6038e88e..548120d6 100644 --- a/test/test_jaxen.rb +++ b/test/test_jaxen.rb @@ -56,7 +56,9 @@ def process_test_case(name) # processes a tests/document/context node def process_context(doc, context) - test_context = XPath.match(doc, context.attributes["select"]) + matched = XPath.match(doc, context.attributes["select"]) + assert_equal(1, matched.size) + test_context = matched.first namespaces = context.namespaces namespaces.delete("var") namespaces = nil if namespaces.empty? @@ -101,10 +103,14 @@ def process_nominal_test(context, variables, namespaces, test) assert_equal(Integer(expected, 10), matched.size, user_message(context, xpath, matched)) + else + assert_operator(matched.size, :>, 0, user_message(context, xpath, matched)) end XPath.each(test, "valueOf") do |value_of| - process_value_of(matched, variables, namespaces, value_of) + matched.each do |subcontext| + process_value_of(subcontext, variables, namespaces, value_of) + end end end @@ -118,10 +124,8 @@ def process_exceptional_test(context, variables, namespaces, test) def user_message(context, xpath, matched) message = "" - context.each_with_index do |node, i| - message << "Node#{i}:\n" - message << "#{node}\n" - end + message << "Node:\n" + message << "#{context}\n" message << "XPath: <#{xpath}>\n" message << "Matched <#{matched}>" message diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index b923eed2..ab22f6f9 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -411,9 +411,10 @@ def test_preceding s = "" d = REXML::Document.new(s) - c = REXML::XPath.match( d, "//c[@id = '5']") - cs = REXML::XPath.match( c, "preceding::c" ) - assert_equal( 4, cs.length ) + c = REXML::XPath.match(d, "//c[@id = '5']") + assert_equal(1, c.length) + cs = REXML::XPath.match(c.first, "preceding::c") + 
assert_equal(4, cs.length) end def test_preceding_multiple @@ -1255,5 +1256,15 @@ def test_or_and end assert_equal(["/"], hrefs, "Bug #3842 [ruby-core:32447]") end + + def test_match_with_deprecated_usage + verbose, $VERBOSE = $VERBOSE, nil + doc = Document.new("") + assert_equal(['b'], XPath.match([doc, doc], '//b').map(&:name)) + assert_equal(['b'], XPath.match([doc], '//b').map(&:name)) + assert_equal([], XPath.match([], '//b').map(&:name)) + ensure + $VERBOSE = verbose + end end end From e80ffdd12713cd138dbe33f26968452dc33d20df Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 12 May 2025 10:22:11 +0900 Subject: [PATCH 11/12] Improve using `//` in XPath performance (#249) When using `//` in XPath, the deeper the tag hierarchy, the slower it becomes due to the namespace acquisition process. Caching namespace information improves performance when using `//` with XPath. ## Benchmark (Comparison with rexml 3.4.1) ``` $ benchmark-driver benchmark/xpath.yaml Calculating ------------------------------------- rexml 3.4.1 master 3.4.1(YJIT) master(YJIT) REXML::XPath.match(REXML::Document.new(xml), 'a//a') 29.215 234.909 108.945 492.410 i/s - 100.000 times in 3.422925s 0.425697s 0.917898s 0.203083s Comparison: REXML::XPath.match(REXML::Document.new(xml), 'a//a') master(YJIT): 492.4 i/s master: 234.9 i/s - 2.10x slower 3.4.1(YJIT): 108.9 i/s - 4.52x slower rexml 3.4.1: 29.2 i/s - 16.85x slower ``` - YJIT=ON : 4.52x faster - YJIT=OFF : 8.04x faster --------- Co-authored-by: tomoya ishida Co-authored-by: Sutou Kouhei --- benchmark/xpath.yaml | 32 ++++++++++++++++++++++++++++++++ lib/rexml/attribute.rb | 4 ++++ lib/rexml/document.rb | 14 ++++++++++++++ lib/rexml/element.rb | 33 +++++++++++++++++---------------- lib/rexml/xpath_parser.rb | 27 ++++++++++++--------------- test/test_core.rb | 23 +++++++++++++++++------ test/xpath/test_base.rb | 10 ++++++++++ 7 files changed, 106 insertions(+), 37 deletions(-) create mode 100644 benchmark/xpath.yaml diff --git 
a/benchmark/xpath.yaml b/benchmark/xpath.yaml new file mode 100644 index 00000000..d6e970eb --- /dev/null +++ b/benchmark/xpath.yaml @@ -0,0 +1,32 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + DEPTH = 100 + xml = '' * DEPTH + '' * DEPTH + doc = REXML::Document.new(xml) + +benchmark: + "REXML::XPath.match(REXML::Document.new(xml), 'a//a')" : REXML::XPath.match(doc, "a//a") diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index fe48745c..7a190225 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -206,6 +206,10 @@ def xpath path += "/@#{self.expanded_name}" return path end + + def document + @element&.document + end end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 1960012c..1c678bef 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -448,6 +448,20 @@ def document end private + + attr_accessor :namespaces_cache + + # New document level cache is created and available in this block. + # This API is thread unsafe. Users can't change this document in this block. 
+ def enable_cache + @namespaces_cache = {} + begin + yield + ensure + @namespaces_cache = nil + end + end + def build( source ) Parsers::TreeParser.new( source, self ).parse end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4e3a60b9..b62b6cc2 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -589,10 +589,12 @@ def prefixes # d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} # def namespaces - namespaces = {} - namespaces = parent.namespaces if parent - namespaces = namespaces.merge( attributes.namespaces ) - return namespaces + namespaces_cache = document&.__send__(:namespaces_cache) + if namespaces_cache + namespaces_cache[self] ||= calculate_namespaces + else + calculate_namespaces + end end # :call-seq: @@ -619,17 +621,9 @@ def namespace(prefix=nil) if prefix.nil? prefix = prefix() end - if prefix == '' - prefix = "xmlns" - else - prefix = "xmlns:#{prefix}" unless prefix[0,5] == 'xmlns' - end - ns = nil - target = self - while ns.nil? and target - ns = target.attributes[prefix] - target = target.parent - end + prefix = (prefix == '') ? 'xmlns' : prefix.delete_prefix("xmlns:") + ns = namespaces[prefix] + ns = '' if ns.nil? 
and prefix == 'xmlns' return ns end @@ -1516,8 +1510,15 @@ def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) formatter.write( self, output ) end - private + def calculate_namespaces + if parent + parent.namespaces.merge(attributes.namespaces) + else + attributes.namespaces + end + end + def __to_xpath_helper node rv = node.expanded_name.clone if node.parent diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 8440015b..70ae8919 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -78,7 +78,15 @@ def variables=( vars={} ) def parse path, node path_stack = @parser.parse( path ) - match( path_stack, node ) + if node.is_a?(Array) + Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) + return [] if node.empty? + node = node.first + end + + node.document.__send__(:enable_cache) do + match( path_stack, node ) + end end def get_first path, node @@ -137,11 +145,6 @@ def first( path_stack, node ) def match(path_stack, node) - if node.is_a?(Array) - Kernel.warn("REXML::XPath.each, REXML::XPath.first, REXML::XPath.match dropped support for nodeset...", uplevel: 1) - return [] if node.empty? - node = node.first - end nodeset = [XPathNode.new(node, position: 1)] result = expr(path_stack, nodeset) case result @@ -494,14 +497,10 @@ def node_test(path_stack, nodesets, any_type: :element) if strict? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node, prefix) end when :attribute if prefix.nil? 
@@ -509,9 +508,7 @@ def node_test(path_stack, nodesets, any_type: :element) elsif prefix.empty? raw_node.name == name and raw_node.namespace == "" else - # FIXME: This DOUBLES the time XPath searches take - ns = get_namespace(raw_node.element, prefix) - raw_node.name == name and raw_node.namespace == ns + raw_node.name == name and raw_node.namespace == get_namespace(raw_node.element, prefix) end else false diff --git a/test/test_core.rb b/test/test_core.rb index 34fe9e07..651056f2 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -653,18 +653,23 @@ def test_namespace assert_equal "Some text", out end - def test_add_namespace e = Element.new 'a' + assert_equal("", e.namespace) + assert_nil(e.namespace('foo')) e.add_namespace 'someuri' e.add_namespace 'foo', 'otheruri' e.add_namespace 'xmlns:bar', 'thirduri' - assert_equal 'someuri', e.attributes['xmlns'] - assert_equal 'otheruri', e.attributes['xmlns:foo'] - assert_equal 'thirduri', e.attributes['xmlns:bar'] + assert_equal("someuri", e.namespace) + assert_equal("otheruri", e.namespace('foo')) + assert_equal("otheruri", e.namespace('xmlns:foo')) + assert_equal("thirduri", e.namespace('bar')) + assert_equal("thirduri", e.namespace('xmlns:bar')) + assert_equal('someuri', e.attributes['xmlns']) + assert_equal('otheruri', e.attributes['xmlns:foo']) + assert_equal('thirduri', e.attributes['xmlns:bar']) end - def test_big_documentation d = File.open(fixture_path("documentation.xml")) {|f| Document.new f } assert_equal "Sean Russell", d.elements["documentation/head/author"].text.tr("\n\t", " ").squeeze(" ") @@ -764,9 +769,15 @@ def test_attributes_each def test_delete_namespace doc = Document.new "" + assert_equal("1", doc.root.namespace) + assert_equal("2", doc.root.namespace('x')) + assert_equal("2", doc.root.namespace('xmlns:x')) doc.root.delete_namespace doc.root.delete_namespace 'x' - assert_equal "", doc.to_s + assert_equal("", doc.to_s) + assert_equal("", doc.root.namespace) + 
assert_nil(doc.root.namespace('x')) + assert_nil(doc.root.namespace('xmlns:x')) end def test_each_element_with_attribute diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index ab22f6f9..764171ab 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -1193,6 +1193,16 @@ def test_namespaces_0 assert_equal( 1, XPath.match( d, "//x:*" ).size ) end + def test_namespaces_cache + doc = Document.new("") + assert_equal("", XPath.first(doc, "//b[namespace-uri()='1']").to_s) + assert_nil(XPath.first(doc, "//b[namespace-uri()='']")) + + doc.root.delete_namespace + assert_nil(XPath.first(doc, "//b[namespace-uri()='1']")) + assert_equal("", XPath.first(doc, "//b[namespace-uri()='']").to_s) + end + def test_ticket_71 doc = Document.new(%Q{}) el = doc.root.elements[1] From 3dc9eca877f8444b7ac1d6008feb724cbfdc239a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Thu, 29 May 2025 10:14:32 +0900 Subject: [PATCH 12/12] Improve `Text.check` performance (#256) The doctype parameter of Text.check is not being used. Changing the doctype parameter to an optional parameter improves the parsing speed of the DOM. 
## Benchmark ``` before after before(YJIT) after(YJIT) dom 19.854 23.805 33.969 37.712 i/s - 100.000 times in 5.036779s 4.200839s 2.943877s 2.651709s sax 29.436 30.494 54.070 55.089 i/s - 100.000 times in 3.397155s 3.279348s 1.849463s 1.815255s pull 34.908 34.857 62.969 64.895 i/s - 100.000 times in 2.864651s 2.868842s 1.588082s 1.540939s stream 34.570 34.281 60.616 60.355 i/s - 100.000 times in 2.892656s 2.917080s 1.649737s 1.656866s Comparison: dom after(YJIT): 37.7 i/s before(YJIT): 34.0 i/s - 1.11x slower after: 23.8 i/s - 1.58x slower before: 19.9 i/s - 1.90x slower sax after(YJIT): 55.1 i/s before(YJIT): 54.1 i/s - 1.02x slower after: 30.5 i/s - 1.81x slower before: 29.4 i/s - 1.87x slower pull after(YJIT): 64.9 i/s before(YJIT): 63.0 i/s - 1.03x slower before: 34.9 i/s - 1.86x slower after: 34.9 i/s - 1.86x slower stream before(YJIT): 60.6 i/s after(YJIT): 60.4 i/s - 1.00x slower before: 34.6 i/s - 1.75x slower after: 34.3 i/s - 1.77x slower ``` - YJIT=ON : 1.00x - 1.11x faster (dom: 1.11x faster) - YJIT=OFF : 1.00x - 1.20x faster (dom: 1.20x faster) --- lib/rexml/attribute.rb | 2 +- lib/rexml/text.rb | 6 +++--- test/test_text_check.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 7a190225..ba49207c 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -173,7 +173,7 @@ def element=( element ) @element = element if @normalized - Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + Text.check( @normalized, NEEDS_A_SECOND_CHECK ) end self diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2bf480fb..6f821472 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -104,16 +104,16 @@ def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, @entity_filter = entity_filter if entity_filter clear_cache - Text.check(@string, illegal, doctype) if @raw + Text.check(@string, illegal) if @raw end def parent= parent super(parent) - Text.check(@string, 
NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + Text.check(@string, NEEDS_A_SECOND_CHECK) if @raw and @parent end # check for illegal characters - def Text.check string, pattern, doctype + def Text.check string, pattern, doctype = nil # illegal anywhere if !string.match?(VALID_XML_CHARS) diff --git a/test/test_text_check.rb b/test/test_text_check.rb index 11cf65a3..3f2f7864 100644 --- a/test/test_text_check.rb +++ b/test/test_text_check.rb @@ -4,7 +4,7 @@ module REXMLTests class TextCheckTester < Test::Unit::TestCase def check(string) - REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK) end def assert_check(string)