diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..52349b44 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,29 @@ +name: Benchmark + +on: + - push + - pull_request + +jobs: + benchmark: + name: "Benchmark: Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}" + strategy: + fail-fast: false + matrix: + ruby-version: + - '3.3' + runs-on: + - ubuntu-latest + runs-on: ${{ matrix.runs-on }} + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby-version }} + - name: Install dependencies + run: | + bundle install + gem install rexml -v 3.2.6 + - name: Benchmark + run: | + rake benchmark diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2755192a..20ff87e7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Extract release note run: | ruby \ diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0e7df009..fd26b9ab 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,7 +3,14 @@ on: - push - pull_request jobs: + ruby-versions: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.5 + inplace: + needs: ruby-versions name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -13,19 +20,14 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "2.5" - - "2.6" - - "2.7" - - "3.0" - - "3.1" - - "3.2" - - jruby + ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + exclude: + - {runs-on: macos-latest, ruby-version: 2.5} # include: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -33,6 +35,18 @@ jobs: - name: Test run: bundle exec rake test + frozen-string-literal: + name: frozen-string-literal + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler-cache: true + - name: Test + run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + gem: name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} @@ -47,13 +61,17 @@ jobs: - "3.0" - head steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Install as gem + env: + BUNDLE_PATH__SYSTEM: "true" + BUNDLE_WITHOUT: "benchmark:development" run: | rake install + bundle install - name: Test run: | ruby -run -e mkdir -- tmp @@ -65,7 +83,7 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: 2.7 @@ -75,7 +93,7 @@ jobs: - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: | github.event_name == 'push' with: diff --git a/Gemfile b/Gemfile index 54da2c0c..67f21dfb 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in rexml.gemspec gemspec + +group :development do + gem "bundler" + gem "rake" +end + +group :benchmark do + gem "benchmark_driver" +end + +group :test do + gem "test-unit" + gem "test-unit-ruby-core" +end diff --git a/NEWS.md b/NEWS.md index 271c303b..013409e6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,71 @@ # News +## 3.2.8 - 2024-05-16 {#version-3-2-8} + +### Fixes + + * Suppressed a warning + +## 3.2.7 - 2024-05-16 {#version-3-2-7} + +### Improvements + + * Improve parse performance by using `StringScanner`. + + * GH-106 + * GH-107 + * GH-108 + * GH-109 + * GH-112 + * GH-113 + * GH-114 + * GH-115 + * GH-116 + * GH-117 + * GH-118 + * GH-119 + * GH-121 + + * Patch by NAITOH Jun. + + * Improved parse performance when an attribute has many `<`s. + + * GH-124 + +### Fixes + + * XPath: Fixed a bug of `normalize_space(array)`. + + * GH-110 + * GH-111 + + * Patch by flatisland. + + * XPath: Fixed a bug that wrong position is used with nested path. + + * GH-110 + * GH-122 + + * Reported by jcavalieri. + * Patch by NAITOH Jun. + + * Fixed a bug that an exception message can't be generated for + invalid encoding XML. + + * GH-29 + * GH-123 + + * Reported by DuKewu. + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + * flatisland + * jcavalieri + * DuKewu + + ## 3.2.6 - 2023-07-27 {#version-3-2-6} ### Improvements diff --git a/Rakefile b/Rakefile index 7143e754..76a56296 100644 --- a/Rakefile +++ b/Rakefile @@ -28,3 +28,42 @@ RDoc::Task.new do |rdoc| end load "#{__dir__}/tasks/tocs.rake" + +benchmark_tasks = [] +namespace :benchmark do + Dir.glob("benchmark/*.yaml").sort.each do |yaml| + name = File.basename(yaml, ".*") + env = { + "RUBYLIB" => nil, + "BUNDLER_ORIG_RUBYLIB" => nil, + } + command_line = [ + RbConfig.ruby, "-v", "-S", "benchmark-driver", File.expand_path(yaml), + ] + + desc "Run #{name} benchmark" + task name do + puts("```") + sh(env, *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}" + + case name + when /\Aparse/ + namespace name do + desc "Run #{name} benchmark: small" + task :small do + puts("```") + sh(env.merge("N_ELEMENTS" => "500", "N_ATTRIBUTES" => "1"), + *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}:small" + end + end + end +end + +desc "Run all benchmarks" +task :benchmark => benchmark_tasks diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml new file mode 100644 index 00000000..e7066fcb --- /dev/null +++ b/benchmark/parse.yaml @@ -0,0 +1,57 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + n_elements = Integer(ENV.fetch("N_ELEMENTS", "5000"), 10) + n_attributes = Integer(ENV.fetch("N_ATTRIBUTES", "2"), 10) + + def build_xml(n_elements, n_attributes) + xml = '' + n_elements.times do |i| + xml << '' + end + xml << '' + end + xml = build_xml(n_elements, n_attributes) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml).elements.each("root/child") {|_|} + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? + parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 77926bf2..4c114616 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -262,11 +262,10 @@ def Functions::string_length( string ) string(string).length end - # UNTESTED def Functions::normalize_space( string=nil ) string = string(@@context[:node]) if string.nil? if string.kind_of? Array - string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string} + string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x} else string.to_s.strip.gsub(/\s+/um, ' ') end diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index 7b16cd1a..e57d05fd 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -29,6 +29,7 @@ def to_s err << "\nLine: #{line}\n" err << "Position: #{position}\n" err << "Last 80 unconsumed characters:\n" + err.force_encoding("ASCII-8BIT") err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..d09237c5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' require_relative '../source' @@ -96,7 +96,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s* [/'/, "'", "'", /'/] } + module Private + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /\s*#{NAME}/um + GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + end + private_constant :Private + include Private + def initialize( source ) self.stream = source @listeners = [] @@ -196,181 +209,180 @@ def pull_event return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + + @source.ensure_buffer if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START - return process_instruction - when DOCTYPE_START - base_error_message = "Malformed DOCTYPE" - @source.match(DOCTYPE_START, true) - @nsstack.unshift(curr_ns=Set.new) - name = parse_name(base_error_message) - if @source.match(/\A\s*\[/um, true) - id = [nil, nil, nil] - @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + start_position = @source.position + if @source.match("/um, true)[1] ] + elsif @source.match("DOCTYPE", true) + base_error_message = "Malformed DOCTYPE" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) end - if @source.match(/\A\s*\[/um, true) + @nsstack.unshift(curr_ns=Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype + @source.ensure_buffer else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + @source.ensure_buffer + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end - end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ - else - @document_status = :after_doctype - if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - match.delete_at(5) if match.size > 5 # Chop out NDATA decl - # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] - - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 - end + @source.match(/\s*/um, true) # skip spaces + start_position = @source.position + if @source.match("/um, true) + raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? + return [ :elementdecl, "/um) - message = "#{base_error_message}: name is missing" + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else - message = "#{base_error_message}: invalid declaration name" + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] end - raise REXML::ParseException.new(message, @source) - end - name = parse_name(base_error_message) - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: true) - unless @source.match(/\A\s*>/um, true) - message = "#{base_error_message}: garbage before end >" - raise REXML::ParseException.new(message, @source) + match << '%' if ref + return match + elsif @source.match("ATTLIST", true) + md = @source.match(ATTLISTDECL_END, true) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + elsif @source.match("NOTATION", true) + base_error_message = "Malformed notation declaration" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) + end + name = parse_name(base_error_message) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: true) + unless @source.match(/\s*>/um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + elsif md = @source.match(/--(.*?)-->/um, true) + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md end - return [:notationdecl, name, *id] - when DOCTYPE_END + elsif match = @source.match(/(%.*?;)\s*/um, true) + return [ :externalentity, match[1] ] + elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin - @source.read if @source.buffer.size<2 - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ + start_position = @source.position + if @source.match("<", true) + if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - md = @source.match( CLOSE_MATCH, true ) + md = @source.match(CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) end if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" - message << " (got '#{md[1]}')" if md + message += " (got '#{md[1]}')" if md + @source.position = start_position if md.nil? raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! - md = @source.match(/\A(\s*[^>]*>)/um) + elsif @source.match("!", true) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match(/--(.*?)-->/um, true) case md[1] when /--/, /-\z/ @@ -379,19 +391,21 @@ def pull_event return [ :comment, md[1] ] if md else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? - return process_instruction + elsif @source.match("?", true) + return process_instruction(start_position) else # Get the next tag - md = @source.match(TAG_MATCH, true) + md = @source.match(TAG_PATTERN, true) unless md + @source.position = start_position raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + tag = md[1] @document_status = :in_element prefixes = Set.new prefixes << md[2] if md[2] @@ -405,23 +419,17 @@ def pull_event end if closed - @closed = md[1] + @closed = tag @nsstack.shift else - @tags.push( md[1] ) + @tags.push( tag ) end - return [ :start_element, md[1], attributes ] + return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) - if md[0].length == 0 - @source.match( /(\s+)/, true ) - end - #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 - #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] + md = @source.match(/([^<]*)/um, true) + text = md[1] + return [ :text, text ] end rescue REXML::UndefinedNamespaceException raise @@ -463,8 +471,7 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) + rv = string.gsub( /\r\n?/, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { @@ -499,9 +506,9 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(/\A\s*#{NAME}/um, true) + md = @source.match(NAME_PATTERN, true) unless md - if @source.match(/\A\s*\S/um) + if @source.match(/\s*\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" @@ -577,97 +584,89 @@ def parse_id_invalid_details(accept_external_id:, end end - def process_instruction - match_data = @source.match(INSTRUCTION_PATTERN, true) + def process_instruction(start_position) + match_data = @source.match(INSTRUCTION_END, true) unless match_data message = "Invalid processing instruction node" + @source.position = start_position raise REXML::ParseException.new(message, @source) end + if @document_status.nil? and match_data[1] == "xml" + content = match_data[2] + version = VERSION.match(content) + version = version[1] unless version.nil? + encoding = ENCODING.match(content) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(content) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + end [:processing_instruction, match_data[1], match_data[2]] end def parse_attributes(prefixes, curr_ns) attributes = {} closed = false - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data.nil? - message = "Start tag isn't ended" - raise REXML::ParseException.new(message, @source) - end - - raw_attributes = match_data[1] - closed = !match_data[2].nil? - return attributes, closed if raw_attributes.nil? - return attributes, closed if raw_attributes.empty? - - scanner = StringScanner.new(raw_attributes) - until scanner.eos? - if scanner.scan(/\s+/) - break if scanner.eos? - end - - pos = scanner.pos - loop do - break if scanner.scan(ATTRIBUTE_PATTERN) - unless scanner.scan(QNAME) - message = "Invalid attribute name: <#{scanner.rest}>" - raise REXML::ParseException.new(message, @source) - end - name = scanner[0] - unless scanner.scan(/\s*=\s*/um) + while true + if @source.match(">", true) + return attributes, closed + elsif @source.match("/>", true) + closed = true + return attributes, closed + elsif match = @source.match(QNAME, true) + name = match[1] + prefix = match[2] + local_part = match[3] + + unless @source.match(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = scanner.scan(/['"]/) - unless quote + unless match = @source.match(/(['"])/, true) message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless scanner.scan(/.*#{Regexp.escape(quote)}/um) - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data - scanner << "/" if closed - scanner << ">" - scanner << match_data[1] - scanner.pos = pos - closed = !match_data[2].nil? - next - end - message = - "Missing attribute value end quote: <#{name}>: <#{quote}>" + quote = match[1] + value = @source.read_until(quote) + unless value.chomp!(quote) + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end - end - name = scanner[1] - prefix = scanner[2] - local_part = scanner[3] - # quote = scanner[4] - value = scanner[5] - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ + @source.match(/\s*/um, true) + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) + raise REXML::ParseException.new( msg, @source, self) end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - if attributes.has_key?(name) - msg = "Duplicate attribute #{name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end + if attributes[name] + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end - attributes[name] = value + attributes[name] = value + else + message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" + raise REXML::ParseException.new(message, @source) + end end - return attributes, closed end end end diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 0d18559a..191932b8 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -31,7 +31,7 @@ module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.6" + VERSION = "3.2.8" REVISION = "" Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 90b370b9..0f3c5011 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -30,8 +30,6 @@ def SourceFactory::create_from(arg) # objects and provides consumption of text class Source include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer # The line number of the last consumed text attr_reader :line attr_reader :encoding @@ -41,7 +39,8 @@ class Source # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -50,6 +49,14 @@ def initialize(arg, encoding=nil) @line = 0 end + # The current buffer (what we're going to read next) + def buffer + @scanner.rest + end + + def buffer_encoding=(encoding) + @scanner.string.force_encoding(encoding) + end # Inherited from Encoding # Overridden to support optimized en/decoding @@ -58,98 +65,72 @@ def encoding=(enc) encoding_updated end - # Scans the source for a given pattern. Note, that this is not your - # usual scan() method. For one thing, the pattern argument has some - # requirements; for another, the source can be consumed. You can easily - # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexps on the fly; however, this was - # computationally expensive and slowed down the entire REXML package - # considerably, since this is by far the most commonly called method. - # @param pattern must be a Regexp, and must be in the form of - # /^\s*(#{your pattern, with no groups})(.*)/. The first group - # will be returned; the second group is used if the consume flag is - # set. - # @param consume if true, the pattern returned will be consumed, leaving - # everything after it in the Source. - # @return the pattern, if found, or nil if the Source is empty or the - # pattern is not found. - def scan(pattern, cons=false) - return nil if @buffer.nil? - rv = @buffer.scan(pattern) - @buffer = $' if cons and rv.size>0 - rv + def read(term = nil) end - def read + def read_until(term) + @scanner.scan_until(Regexp.union(term)) or @scanner.rest end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) + def ensure_buffer end - def match_to( char, pattern ) - return pattern.match(@buffer) + def match(pattern, cons=false) + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? nil : @scanner + end end - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md + def position + @scanner.pos end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + def position=(pos) + @scanner.pos = pos end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +153,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +162,57 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil - end - end - rv = super + def read(term = nil) + begin + @scanner << readline(term) + true + rescue Exception, NameError + @source = nil + false end - rv.taint if RUBY_VERSION < '2.7' - rv end - def read + def read_until(term) + pattern = Regexp.union(term) begin - @buffer << readline - rescue Exception, NameError - @source = nil + until str = @scanner.scan_until(pattern) + @scanner << readline(term) + end + rescue EOFError + @scanner.rest + else + read if @scanner.eos? and !@source.eof? + str end end - def consume( pattern ) - match( pattern, true ) + def ensure_buffer + read if @scanner.eos? && @source end + # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats: + # - ">" + # - "XXX>" (X is any string excluding '>') def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source - begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - rescue - @source = nil + while true + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) end + break if md + return nil if pattern.is_a?(String) + return nil if @source.nil? + return nil unless read end - rv.taint if RUBY_VERSION < '2.7' - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -263,8 +236,8 @@ def current_line end private - def readline - str = @source.readline(@line_break) + def readline(term = nil) + str = @source.readline(term || @line_break) if @pending_buffer if str.nil? str = @pending_buffer @@ -290,7 +263,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index d8b88e7a..5eb1e5a9 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -590,6 +590,7 @@ def filter_nodeset(nodeset) def evaluate_predicate(expression, nodesets) enter(:predicate, expression, nodesets) if @debug + new_nodeset_count = 0 new_nodesets = nodesets.collect do |nodeset| new_nodeset = [] subcontext = { :size => nodeset.size } @@ -606,17 +607,20 @@ def evaluate_predicate(expression, nodesets) result = result[0] if result.kind_of? Array and result.length == 1 if result.kind_of? Numeric if result == node.position - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end elsif result.instance_of? Array if result.size > 0 and result.inject(false) {|k,s| s or k} if result.size > 0 - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end else if result - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end end diff --git a/rexml.gemspec b/rexml.gemspec index ceb77047..97eac657 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -55,7 +55,5 @@ Gem::Specification.new do |spec| spec.required_ruby_version = '>= 2.5.0' - spec.add_development_dependency "bundler" - spec.add_development_dependency "rake" - spec.add_development_dependency "test-unit" + spec.add_runtime_dependency("strscan", ">= 3.0.9") end diff --git a/test/formatter/test_default.rb b/test/formatter/test_default.rb index 321d8180..aa403dbe 100644 --- a/test/formatter/test_default.rb +++ b/test/formatter/test_default.rb @@ -2,7 +2,7 @@ module REXMLTests class DefaultFormatterTest < Test::Unit::TestCase def format(node) formatter = REXML::Formatters::Default.new - output = "" + output = +"" formatter.write(node, output) output end diff --git a/test/functions/test_base.rb b/test/functions/test_base.rb index 74dc1a31..daa38156 100644 --- a/test/functions/test_base.rb +++ b/test/functions/test_base.rb @@ -229,8 +229,30 @@ def test_normalize_space assert_equal( [REXML::Comment.new("COMMENT A")], m ) end + def test_normalize_space_strings + source = <<-XML +breakfast boosts\t\t + +concentration +Coffee beans + aroma + + + + Dessert + \t\t after dinner + XML + normalized_texts = REXML::XPath.each(REXML::Document.new(source), "normalize-space(//text())").to_a + assert_equal([ + "breakfast boosts concentration", + "Coffee beans aroma", + "Dessert after dinner", + ], + normalized_texts) + end + def test_string_nil_without_context - doc = REXML::Document.new(<<-XML) + doc = REXML::Document.new(<<~XML) diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 55713909..8faa0b78 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start + r SYSTEM "urn:x-rexml:test" [ ]> DETAIL end + + def test_no_name + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: name is missing +Line: 3 +Position: 17 +Last 80 unconsumed characters: + + DETAIL + end end class TestExternalID < self diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 9f172a28..14d0703a 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -41,9 +41,22 @@ def test_empty_namespace_attribute_name assert_equal(<<-DETAIL.chomp, exception.to_s) Invalid attribute name: <:a=""> Line: 1 -Position: 9 +Position: 13 Last 80 unconsumed characters: +:a=""> + DETAIL + end + def test_empty_namespace_attribute_name_with_utf8_character + exception = assert_raise(REXML::ParseException) do + parse("") # U+200B ZERO WIDTH SPACE + end + assert_equal(<<-DETAIL.chomp.force_encoding("ASCII-8BIT"), exception.to_s) +Invalid attribute name: <:\xE2\x80\x8B> +Line: 1 +Position: 8 +Last 80 unconsumed characters: +:\xE2\x80\x8B> DETAIL end diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..e15deec6 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: false +require 'test/unit' +require 'rexml/document' + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed notation declaration: name is missing +Line: 5 +Position: 72 +Last 80 unconsumed characters: + ]> + DETAIL + end + end +end diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index 19a0536d..9e81b6a4 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -35,7 +35,7 @@ def test_no_name Line: 5 Position: 72 Last 80 unconsumed characters: - ]> + ]> DETAIL end diff --git a/test/test_contrib.rb b/test/test_contrib.rb index f3ad0b6c..23ee35b1 100644 --- a/test/test_contrib.rb +++ b/test/test_contrib.rb @@ -80,7 +80,7 @@ def test_bad_doctype_Tobias # Peter Verhage def test_namespace_Peter - source = <<-EOF + source = <<~EOF @@ -377,7 +377,7 @@ def test_various_xpath end def test_entities_Holden_Glova - document = <<-EOL + document = <<~EOL diff --git a/test/test_core.rb b/test/test_core.rb index 7c18c03f..44e2e7ea 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -15,7 +15,7 @@ class Tester < Test::Unit::TestCase include Helper::Fixture include REXML def setup - @xsa_source = <<-EOL + @xsa_source = <<~EOL