diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 00000000..52349b44
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,29 @@
+name: Benchmark
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ benchmark:
+ name: "Benchmark: Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}"
+ strategy:
+ fail-fast: false
+ matrix:
+ ruby-version:
+ - '3.3'
+ runs-on:
+ - ubuntu-latest
+ runs-on: ${{ matrix.runs-on }}
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ruby/setup-ruby@v1
+ with:
+ ruby-version: ${{ matrix.ruby-version }}
+ - name: Install dependencies
+ run: |
+ bundle install
+ gem install rexml -v 3.2.6
+ - name: Benchmark
+ run: |
+ rake benchmark
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2755192a..20ff87e7 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Extract release note
run: |
ruby \
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0e7df009..fd26b9ab 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -3,7 +3,14 @@ on:
- push
- pull_request
jobs:
+ ruby-versions:
+ uses: ruby/actions/.github/workflows/ruby_versions.yml@master
+ with:
+ engine: cruby-jruby
+ min_version: 2.5
+
inplace:
+ needs: ruby-versions
name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
runs-on: ${{ matrix.runs-on }}
strategy:
@@ -13,19 +20,14 @@ jobs:
- ubuntu-latest
- macos-latest
- windows-latest
- ruby-version:
- - "2.5"
- - "2.6"
- - "2.7"
- - "3.0"
- - "3.1"
- - "3.2"
- - jruby
+ ruby-version: ${{ fromJson(needs.ruby-versions.outputs.versions) }}
+ exclude:
+ - {runs-on: macos-latest, ruby-version: 2.5}
# include:
# - runs-on: ubuntu-latest
# ruby-version: truffleruby
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby-version }}
@@ -33,6 +35,18 @@ jobs:
- name: Test
run: bundle exec rake test
+ frozen-string-literal:
+ name: frozen-string-literal
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ruby/setup-ruby@v1
+ with:
+ ruby-version: ruby
+ bundler-cache: true
+ - name: Test
+ run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal"
+
gem:
name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}"
runs-on: ${{ matrix.runs-on }}
@@ -47,13 +61,17 @@ jobs:
- "3.0"
- head
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby-version }}
- name: Install as gem
+ env:
+ BUNDLE_PATH__SYSTEM: "true"
+ BUNDLE_WITHOUT: "benchmark:development"
run: |
rake install
+ bundle install
- name: Test
run: |
ruby -run -e mkdir -- tmp
@@ -65,7 +83,7 @@ jobs:
name: "Document"
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- uses: ruby/setup-ruby@v1
with:
ruby-version: 2.7
@@ -75,7 +93,7 @@ jobs:
- name: Build document
run: |
bundle exec rake warning:error rdoc
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
if: |
github.event_name == 'push'
with:
diff --git a/Gemfile b/Gemfile
index 54da2c0c..67f21dfb 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
# Specify your gem's dependencies in rexml.gemspec
gemspec
+
+group :development do
+ gem "bundler"
+ gem "rake"
+end
+
+group :benchmark do
+ gem "benchmark_driver"
+end
+
+group :test do
+ gem "test-unit"
+ gem "test-unit-ruby-core"
+end
diff --git a/NEWS.md b/NEWS.md
index 271c303b..013409e6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,71 @@
# News
+## 3.2.8 - 2024-05-16 {#version-3-2-8}
+
+### Fixes
+
+ * Suppressed a warning
+
+## 3.2.7 - 2024-05-16 {#version-3-2-7}
+
+### Improvements
+
+ * Improve parse performance by using `StringScanner`.
+
+ * GH-106
+ * GH-107
+ * GH-108
+ * GH-109
+ * GH-112
+ * GH-113
+ * GH-114
+ * GH-115
+ * GH-116
+ * GH-117
+ * GH-118
+ * GH-119
+ * GH-121
+
+ * Patch by NAITOH Jun.
+
+ * Improved parse performance when an attribute has many `<`s.
+
+ * GH-124
+
+### Fixes
+
+ * XPath: Fixed a bug of `normalize_space(array)`.
+
+ * GH-110
+ * GH-111
+
+ * Patch by flatisland.
+
+ * XPath: Fixed a bug that wrong position is used with nested path.
+
+ * GH-110
+ * GH-122
+
+ * Reported by jcavalieri.
+ * Patch by NAITOH Jun.
+
+ * Fixed a bug that an exception message can't be generated for
+ invalid encoding XML.
+
+ * GH-29
+ * GH-123
+
+ * Reported by DuKewu.
+ * Patch by NAITOH Jun.
+
+### Thanks
+
+ * NAITOH Jun
+ * flatisland
+ * jcavalieri
+ * DuKewu
+
+
## 3.2.6 - 2023-07-27 {#version-3-2-6}
### Improvements
diff --git a/Rakefile b/Rakefile
index 7143e754..76a56296 100644
--- a/Rakefile
+++ b/Rakefile
@@ -28,3 +28,42 @@ RDoc::Task.new do |rdoc|
end
load "#{__dir__}/tasks/tocs.rake"
+
+benchmark_tasks = []
+namespace :benchmark do
+ Dir.glob("benchmark/*.yaml").sort.each do |yaml|
+ name = File.basename(yaml, ".*")
+ env = {
+ "RUBYLIB" => nil,
+ "BUNDLER_ORIG_RUBYLIB" => nil,
+ }
+ command_line = [
+ RbConfig.ruby, "-v", "-S", "benchmark-driver", File.expand_path(yaml),
+ ]
+
+ desc "Run #{name} benchmark"
+ task name do
+ puts("```")
+ sh(env, *command_line)
+ puts("```")
+ end
+ benchmark_tasks << "benchmark:#{name}"
+
+ case name
+ when /\Aparse/
+ namespace name do
+ desc "Run #{name} benchmark: small"
+ task :small do
+ puts("```")
+ sh(env.merge("N_ELEMENTS" => "500", "N_ATTRIBUTES" => "1"),
+ *command_line)
+ puts("```")
+ end
+ benchmark_tasks << "benchmark:#{name}:small"
+ end
+ end
+ end
+end
+
+desc "Run all benchmarks"
+task :benchmark => benchmark_tasks
diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml
new file mode 100644
index 00000000..e7066fcb
--- /dev/null
+++ b/benchmark/parse.yaml
@@ -0,0 +1,57 @@
+loop_count: 100
+contexts:
+ - gems:
+ rexml: 3.2.6
+ require: false
+ prelude: require 'rexml'
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+ require 'rexml'
+ - name: 3.2.6(YJIT)
+ gems:
+ rexml: 3.2.6
+ require: false
+ prelude: |
+ require 'rexml'
+ RubyVM::YJIT.enable
+ - name: master(YJIT)
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+ require 'rexml'
+ RubyVM::YJIT.enable
+
+prelude: |
+ require 'rexml/document'
+ require 'rexml/parsers/sax2parser'
+ require 'rexml/parsers/pullparser'
+ require 'rexml/parsers/streamparser'
+ require 'rexml/streamlistener'
+
+ n_elements = Integer(ENV.fetch("N_ELEMENTS", "5000"), 10)
+ n_attributes = Integer(ENV.fetch("N_ATTRIBUTES", "2"), 10)
+
+ def build_xml(n_elements, n_attributes)
+ xml = ''
+ n_elements.times do |i|
+ xml << ''
+ end
+ xml << ''
+ end
+ xml = build_xml(n_elements, n_attributes)
+
+ class Listener
+ include REXML::StreamListener
+ end
+
+benchmark:
+ 'dom' : REXML::Document.new(xml).elements.each("root/child") {|_|}
+ 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse
+ 'pull' : |
+ parser = REXML::Parsers::PullParser.new(xml)
+ while parser.has_next?
+ parser.pull
+ end
+ 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse
diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb
index 77926bf2..4c114616 100644
--- a/lib/rexml/functions.rb
+++ b/lib/rexml/functions.rb
@@ -262,11 +262,10 @@ def Functions::string_length( string )
string(string).length
end
- # UNTESTED
def Functions::normalize_space( string=nil )
string = string(@@context[:node]) if string.nil?
if string.kind_of? Array
- string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string}
+ string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x}
else
string.to_s.strip.gsub(/\s+/um, ' ')
end
diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb
index 7b16cd1a..e57d05fd 100644
--- a/lib/rexml/parseexception.rb
+++ b/lib/rexml/parseexception.rb
@@ -29,6 +29,7 @@ def to_s
err << "\nLine: #{line}\n"
err << "Position: #{position}\n"
err << "Last 80 unconsumed characters:\n"
+ err.force_encoding("ASCII-8BIT")
err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
end
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 305b1207..d09237c5 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -1,4 +1,4 @@
-# frozen_string_literal: false
+# frozen_string_literal: true
require_relative '../parseexception'
require_relative '../undefinednamespaceexception'
require_relative '../source'
@@ -96,7 +96,7 @@ class BaseParser
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = ""
GEDECL = ""
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
NOTATIONDECL_START = /\A\s* [/'/, "'", "'", /'/]
}
+ module Private
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
+ NAME_PATTERN = /\s*#{NAME}/um
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
+ end
+ private_constant :Private
+ include Private
+
def initialize( source )
self.stream = source
@listeners = []
@@ -196,181 +209,180 @@ def pull_event
return @stack.shift if @stack.size > 0
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
+
+ @source.ensure_buffer
if @document_status == nil
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
- word = word[1] unless word.nil?
- #STDERR.puts "WORD = #{word.inspect}"
- case word
- when COMMENT_START
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
- when XMLDECL_START
- #STDERR.puts "XMLDECL"
- results = @source.match( XMLDECL_PATTERN, true )[1]
- version = VERSION.match( results )
- version = version[1] unless version.nil?
- encoding = ENCODING.match(results)
- encoding = encoding[1] unless encoding.nil?
- if need_source_encoding_update?(encoding)
- @source.encoding = encoding
- end
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
- encoding = "UTF-16"
- end
- standalone = STANDALONE.match(results)
- standalone = standalone[1] unless standalone.nil?
- return [ :xmldecl, version, encoding, standalone ]
- when INSTRUCTION_START
- return process_instruction
- when DOCTYPE_START
- base_error_message = "Malformed DOCTYPE"
- @source.match(DOCTYPE_START, true)
- @nsstack.unshift(curr_ns=Set.new)
- name = parse_name(base_error_message)
- if @source.match(/\A\s*\[/um, true)
- id = [nil, nil, nil]
- @document_status = :in_doctype
- elsif @source.match(/\A\s*>/um, true)
- id = [nil, nil, nil]
- @document_status = :after_doctype
- else
- id = parse_id(base_error_message,
- accept_external_id: true,
- accept_public_id: false)
- if id[0] == "SYSTEM"
- # For backward compatibility
- id[1], id[2] = id[2], nil
+ start_position = @source.position
+ if @source.match("", true)
+ return process_instruction(start_position)
+ elsif @source.match("/um, true)[1] ]
+ elsif @source.match("DOCTYPE", true)
+ base_error_message = "Malformed DOCTYPE"
+ unless @source.match(/\s+/um, true)
+ if @source.match(">")
+ message = "#{base_error_message}: name is missing"
+ else
+ message = "#{base_error_message}: invalid name"
+ end
+ @source.position = start_position
+ raise REXML::ParseException.new(message, @source)
end
- if @source.match(/\A\s*\[/um, true)
+ @nsstack.unshift(curr_ns=Set.new)
+ name = parse_name(base_error_message)
+ if @source.match(/\s*\[/um, true)
+ id = [nil, nil, nil]
@document_status = :in_doctype
- elsif @source.match(/\A\s*>/um, true)
+ elsif @source.match(/\s*>/um, true)
+ id = [nil, nil, nil]
@document_status = :after_doctype
+ @source.ensure_buffer
else
- message = "#{base_error_message}: garbage after external ID"
- raise REXML::ParseException.new(message, @source)
+ id = parse_id(base_error_message,
+ accept_external_id: true,
+ accept_public_id: false)
+ if id[0] == "SYSTEM"
+ # For backward compatibility
+ id[1], id[2] = id[2], nil
+ end
+ if @source.match(/\s*\[/um, true)
+ @document_status = :in_doctype
+ elsif @source.match(/\s*>/um, true)
+ @document_status = :after_doctype
+ @source.ensure_buffer
+ else
+ message = "#{base_error_message}: garbage after external ID"
+ raise REXML::ParseException.new(message, @source)
+ end
end
- end
- args = [:start_doctype, name, *id]
- if @document_status == :after_doctype
- @source.match(/\A\s*/um, true)
- @stack << [ :end_doctype ]
- end
- return args
- when /\A\s+/
- else
- @document_status = :after_doctype
- if @source.encoding == "UTF-8"
- @source.buffer.force_encoding(::Encoding::UTF_8)
+ args = [:start_doctype, name, *id]
+ if @document_status == :after_doctype
+ @source.match(/\s*/um, true)
+ @stack << [ :end_doctype ]
+ end
+ return args
+ else
+ message = "Invalid XML"
+ raise REXML::ParseException.new(message, @source)
end
end
end
if @document_status == :in_doctype
- md = @source.match(/\A\s*(.*?>)/um)
- case md[1]
- when SYSTEMENTITY
- match = @source.match( SYSTEMENTITY, true )[1]
- return [ :externalentity, match ]
-
- when ELEMENTDECL_START
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
-
- when ENTITY_START
- match = @source.match( ENTITYDECL, true ).to_a.compact
- match[0] = :entitydecl
- ref = false
- if match[1] == '%'
- ref = true
- match.delete_at 1
- end
- # Now we have to sort out what kind of entity reference this is
- if match[2] == 'SYSTEM'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
- elsif match[2] == 'PUBLIC'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match[4] = match[4][1..-2] # HREF
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
- else
- match[2] = match[2][1..-2]
- match.pop if match.size == 4
- # match is [ :entity, name, value ]
- end
- match << '%' if ref
- return match
- when ATTLISTDECL_START
- md = @source.match( ATTLISTDECL_PATTERN, true )
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
- element = md[1]
- contents = md[0]
-
- pairs = {}
- values = md[0].scan( ATTDEF_RE )
- values.each do |attdef|
- unless attdef[3] == "#IMPLIED"
- attdef.compact!
- val = attdef[3]
- val = attdef[4] if val == "#FIXED "
- pairs[attdef[0]] = val
- if attdef[0] =~ /^xmlns:(.*)/
- @nsstack[0] << $1
- end
+ @source.match(/\s*/um, true) # skip spaces
+ start_position = @source.position
+ if @source.match("/um, true)
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
+ return [ :elementdecl, "/um)
- message = "#{base_error_message}: name is missing"
+ # Now we have to sort out what kind of entity reference this is
+ if match[2] == 'SYSTEM'
+ # External reference
+ match[3] = match[3][1..-2] # PUBID
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
+ elsif match[2] == 'PUBLIC'
+ # External reference
+ match[3] = match[3][1..-2] # PUBID
+ match[4] = match[4][1..-2] # HREF
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
- message = "#{base_error_message}: invalid declaration name"
+ match[2] = match[2][1..-2]
+ match.pop if match.size == 4
+ # match is [ :entity, name, value ]
end
- raise REXML::ParseException.new(message, @source)
- end
- name = parse_name(base_error_message)
- id = parse_id(base_error_message,
- accept_external_id: true,
- accept_public_id: true)
- unless @source.match(/\A\s*>/um, true)
- message = "#{base_error_message}: garbage before end >"
- raise REXML::ParseException.new(message, @source)
+ match << '%' if ref
+ return match
+ elsif @source.match("ATTLIST", true)
+ md = @source.match(ATTLISTDECL_END, true)
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
+ element = md[1]
+ contents = md[0]
+
+ pairs = {}
+ values = md[0].scan( ATTDEF_RE )
+ values.each do |attdef|
+ unless attdef[3] == "#IMPLIED"
+ attdef.compact!
+ val = attdef[3]
+ val = attdef[4] if val == "#FIXED "
+ pairs[attdef[0]] = val
+ if attdef[0] =~ /^xmlns:(.*)/
+ @nsstack[0] << $1
+ end
+ end
+ end
+ return [ :attlistdecl, element, pairs, contents ]
+ elsif @source.match("NOTATION", true)
+ base_error_message = "Malformed notation declaration"
+ unless @source.match(/\s+/um, true)
+ if @source.match(">")
+ message = "#{base_error_message}: name is missing"
+ else
+ message = "#{base_error_message}: invalid name"
+ end
+ @source.position = start_position
+ raise REXML::ParseException.new(message, @source)
+ end
+ name = parse_name(base_error_message)
+ id = parse_id(base_error_message,
+ accept_external_id: true,
+ accept_public_id: true)
+ unless @source.match(/\s*>/um, true)
+ message = "#{base_error_message}: garbage before end >"
+ raise REXML::ParseException.new(message, @source)
+ end
+ return [:notationdecl, name, *id]
+ elsif md = @source.match(/--(.*?)-->/um, true)
+ case md[1]
+ when /--/, /-\z/
+ raise REXML::ParseException.new("Malformed comment", @source)
+ end
+ return [ :comment, md[1] ] if md
end
- return [:notationdecl, name, *id]
- when DOCTYPE_END
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
+ return [ :externalentity, match[1] ]
+ elsif @source.match(/\]\s*>/um, true)
@document_status = :after_doctype
- @source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
if @document_status == :after_doctype
- @source.match(/\A\s*/um, true)
+ @source.match(/\s*/um, true)
end
begin
- @source.read if @source.buffer.size<2
- if @source.buffer[0] == ?<
- if @source.buffer[1] == ?/
+ start_position = @source.position
+ if @source.match("<", true)
+ if @source.match("/", true)
@nsstack.shift
last_tag = @tags.pop
- md = @source.match( CLOSE_MATCH, true )
+ md = @source.match(CLOSE_PATTERN, true)
if md and !last_tag
message = "Unexpected top-level end tag (got '#{md[1]}')"
raise REXML::ParseException.new(message, @source)
end
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
- message << " (got '#{md[1]}')" if md
+ message += " (got '#{md[1]}')" if md
+ @source.position = start_position if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
- elsif @source.buffer[1] == ?!
- md = @source.match(/\A(\s*[^>]*>)/um)
+ elsif @source.match("!", true)
+ md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
- if md[0][2] == ?-
- md = @source.match( COMMENT_PATTERN, true )
+ if md[0][0] == ?-
+ md = @source.match(/--(.*?)-->/um, true)
case md[1]
when /--/, /-\z/
@@ -379,19 +391,21 @@ def pull_event
return [ :comment, md[1] ] if md
else
- md = @source.match( CDATA_PATTERN, true )
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
- elsif @source.buffer[1] == ??
- return process_instruction
+ elsif @source.match("?", true)
+ return process_instruction(start_position)
else
# Get the next tag
- md = @source.match(TAG_MATCH, true)
+ md = @source.match(TAG_PATTERN, true)
unless md
+ @source.position = start_position
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
+ tag = md[1]
@document_status = :in_element
prefixes = Set.new
prefixes << md[2] if md[2]
@@ -405,23 +419,17 @@ def pull_event
end
if closed
- @closed = md[1]
+ @closed = tag
@nsstack.shift
else
- @tags.push( md[1] )
+ @tags.push( tag )
end
- return [ :start_element, md[1], attributes ]
+ return [ :start_element, tag, attributes ]
end
else
- md = @source.match( TEXT_PATTERN, true )
- if md[0].length == 0
- @source.match( /(\s+)/, true )
- end
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
- #return [ :text, "" ] if md[0].length == 0
- # unnormalized = Text::unnormalize( md[1], self )
- # return PullEvent.new( :text, md[1], unnormalized )
- return [ :text, md[1] ]
+ md = @source.match(/([^<]*)/um, true)
+ text = md[1]
+ return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
raise
@@ -463,8 +471,7 @@ def normalize( input, entities=nil, entity_filter=nil )
# Unescapes all possible entities
def unnormalize( string, entities=nil, filter=nil )
- rv = string.clone
- rv.gsub!( /\r\n?/, "\n" )
+ rv = string.gsub( /\r\n?/, "\n" )
matches = rv.scan( REFERENCE_RE )
return rv if matches.size == 0
rv.gsub!( /*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -499,9 +506,9 @@ def need_source_encoding_update?(xml_declaration_encoding)
end
def parse_name(base_error_message)
- md = @source.match(/\A\s*#{NAME}/um, true)
+ md = @source.match(NAME_PATTERN, true)
unless md
- if @source.match(/\A\s*\S/um)
+ if @source.match(/\s*\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
@@ -577,97 +584,89 @@ def parse_id_invalid_details(accept_external_id:,
end
end
- def process_instruction
- match_data = @source.match(INSTRUCTION_PATTERN, true)
+ def process_instruction(start_position)
+ match_data = @source.match(INSTRUCTION_END, true)
unless match_data
message = "Invalid processing instruction node"
+ @source.position = start_position
raise REXML::ParseException.new(message, @source)
end
+ if @document_status.nil? and match_data[1] == "xml"
+ content = match_data[2]
+ version = VERSION.match(content)
+ version = version[1] unless version.nil?
+ encoding = ENCODING.match(content)
+ encoding = encoding[1] unless encoding.nil?
+ if need_source_encoding_update?(encoding)
+ @source.encoding = encoding
+ end
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+ encoding = "UTF-16"
+ end
+ standalone = STANDALONE.match(content)
+ standalone = standalone[1] unless standalone.nil?
+ return [ :xmldecl, version, encoding, standalone ]
+ end
[:processing_instruction, match_data[1], match_data[2]]
end
def parse_attributes(prefixes, curr_ns)
attributes = {}
closed = false
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
- if match_data.nil?
- message = "Start tag isn't ended"
- raise REXML::ParseException.new(message, @source)
- end
-
- raw_attributes = match_data[1]
- closed = !match_data[2].nil?
- return attributes, closed if raw_attributes.nil?
- return attributes, closed if raw_attributes.empty?
-
- scanner = StringScanner.new(raw_attributes)
- until scanner.eos?
- if scanner.scan(/\s+/)
- break if scanner.eos?
- end
-
- pos = scanner.pos
- loop do
- break if scanner.scan(ATTRIBUTE_PATTERN)
- unless scanner.scan(QNAME)
- message = "Invalid attribute name: <#{scanner.rest}>"
- raise REXML::ParseException.new(message, @source)
- end
- name = scanner[0]
- unless scanner.scan(/\s*=\s*/um)
+ while true
+ if @source.match(">", true)
+ return attributes, closed
+ elsif @source.match("/>", true)
+ closed = true
+ return attributes, closed
+ elsif match = @source.match(QNAME, true)
+ name = match[1]
+ prefix = match[2]
+ local_part = match[3]
+
+ unless @source.match(/\s*=\s*/um, true)
message = "Missing attribute equal: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
- quote = scanner.scan(/['"]/)
- unless quote
+ unless match = @source.match(/(['"])/, true)
message = "Missing attribute value start quote: <#{name}>"
raise REXML::ParseException.new(message, @source)
end
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
- if match_data
- scanner << "/" if closed
- scanner << ">"
- scanner << match_data[1]
- scanner.pos = pos
- closed = !match_data[2].nil?
- next
- end
- message =
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
+ quote = match[1]
+ value = @source.read_until(quote)
+ unless value.chomp!(quote)
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
raise REXML::ParseException.new(message, @source)
end
- end
- name = scanner[1]
- prefix = scanner[2]
- local_part = scanner[3]
- # quote = scanner[4]
- value = scanner[5]
- if prefix == "xmlns"
- if local_part == "xml"
- if value != "http://www.w3.org/XML/1998/namespace"
- msg = "The 'xml' prefix must not be bound to any other namespace "+
+ @source.match(/\s*/um, true)
+ if prefix == "xmlns"
+ if local_part == "xml"
+ if value != "http://www.w3.org/XML/1998/namespace"
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+ raise REXML::ParseException.new( msg, @source, self )
+ end
+ elsif local_part == "xmlns"
+ msg = "The 'xmlns' prefix must not be declared "+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
- raise REXML::ParseException.new( msg, @source, self )
+ raise REXML::ParseException.new( msg, @source, self)
end
- elsif local_part == "xmlns"
- msg = "The 'xmlns' prefix must not be declared "+
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
- raise REXML::ParseException.new( msg, @source, self)
+ curr_ns << local_part
+ elsif prefix
+ prefixes << prefix unless prefix == "xml"
end
- curr_ns << local_part
- elsif prefix
- prefixes << prefix unless prefix == "xml"
- end
- if attributes.has_key?(name)
- msg = "Duplicate attribute #{name.inspect}"
- raise REXML::ParseException.new(msg, @source, self)
- end
+ if attributes[name]
+ msg = "Duplicate attribute #{name.inspect}"
+ raise REXML::ParseException.new(msg, @source, self)
+ end
- attributes[name] = value
+ attributes[name] = value
+ else
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
+ raise REXML::ParseException.new(message, @source)
+ end
end
- return attributes, closed
end
end
end
diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 0d18559a..191932b8 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
module REXML
COPYRIGHT = "Copyright © 2001-2008 Sean Russell "
DATE = "2008/019"
- VERSION = "3.2.6"
+ VERSION = "3.2.8"
REVISION = ""
Copyright = COPYRIGHT
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 90b370b9..0f3c5011 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
# objects and provides consumption of text
class Source
include Encoding
- # The current buffer (what we're going to read next)
- attr_reader :buffer
# The line number of the last consumed text
attr_reader :line
attr_reader :encoding
@@ -41,7 +39,8 @@ class Source
# @param encoding if non-null, sets the encoding of the source to this
# value, overriding all encoding detection
def initialize(arg, encoding=nil)
- @orig = @buffer = arg
+ @orig = arg
+ @scanner = StringScanner.new(@orig)
if encoding
self.encoding = encoding
else
@@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
@line = 0
end
+ # The current buffer (what we're going to read next)
+ def buffer
+ @scanner.rest
+ end
+
+ def buffer_encoding=(encoding)
+ @scanner.string.force_encoding(encoding)
+ end
# Inherited from Encoding
# Overridden to support optimized en/decoding
@@ -58,98 +65,72 @@ def encoding=(enc)
encoding_updated
end
- # Scans the source for a given pattern. Note, that this is not your
- # usual scan() method. For one thing, the pattern argument has some
- # requirements; for another, the source can be consumed. You can easily
- # confuse this method. Originally, the patterns were easier
- # to construct and this method more robust, because this method
- # generated search regexps on the fly; however, this was
- # computationally expensive and slowed down the entire REXML package
- # considerably, since this is by far the most commonly called method.
- # @param pattern must be a Regexp, and must be in the form of
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
- # will be returned; the second group is used if the consume flag is
- # set.
- # @param consume if true, the pattern returned will be consumed, leaving
- # everything after it in the Source.
- # @return the pattern, if found, or nil if the Source is empty or the
- # pattern is not found.
- def scan(pattern, cons=false)
- return nil if @buffer.nil?
- rv = @buffer.scan(pattern)
- @buffer = $' if cons and rv.size>0
- rv
+ def read(term = nil)
end
- def read
+ def read_until(term)
+ @scanner.scan_until(Regexp.union(term)) or @scanner.rest
end
- def consume( pattern )
- @buffer = $' if pattern.match( @buffer )
+ def ensure_buffer
end
- def match_to( char, pattern )
- return pattern.match(@buffer)
+ def match(pattern, cons=false)
+ if cons
+ @scanner.scan(pattern).nil? ? nil : @scanner
+ else
+ @scanner.check(pattern).nil? ? nil : @scanner
+ end
end
- def match_to_consume( char, pattern )
- md = pattern.match(@buffer)
- @buffer = $'
- return md
+ def position
+ @scanner.pos
end
- def match(pattern, cons=false)
- md = pattern.match(@buffer)
- @buffer = $' if cons and md
- return md
+ def position=(pos)
+ @scanner.pos = pos
end
# @return true if the Source is exhausted
def empty?
- @buffer == ""
- end
-
- def position
- @orig.index( @buffer )
+ @scanner.eos?
end
# @return the current line in the source
def current_line
lines = @orig.split
- res = lines.grep @buffer[0..30]
+ res = lines.grep @scanner.rest[0..30]
res = res[-1] if res.kind_of? Array
lines.index( res ) if res
end
private
+
def detect_encoding
- buffer_encoding = @buffer.encoding
+ scanner_encoding = @scanner.rest.encoding
detected_encoding = "UTF-8"
begin
- @buffer.force_encoding("ASCII-8BIT")
- if @buffer[0, 2] == "\xfe\xff"
- @buffer[0, 2] = ""
+ @scanner.string.force_encoding("ASCII-8BIT")
+ if @scanner.scan(/\xfe\xff/n)
detected_encoding = "UTF-16BE"
- elsif @buffer[0, 2] == "\xff\xfe"
- @buffer[0, 2] = ""
+ elsif @scanner.scan(/\xff\xfe/n)
detected_encoding = "UTF-16LE"
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
- @buffer[0, 3] = ""
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
detected_encoding = "UTF-8"
end
ensure
- @buffer.force_encoding(buffer_encoding)
+ @scanner.string.force_encoding(scanner_encoding)
end
self.encoding = detected_encoding
end
def encoding_updated
if @encoding != 'UTF-8'
- @buffer = decode(@buffer)
+ @scanner.string = decode(@scanner.rest)
@to_utf = true
else
@to_utf = false
- @buffer.force_encoding ::Encoding::UTF_8
+ @scanner.string.force_encoding(::Encoding::UTF_8)
end
end
end
@@ -172,7 +153,7 @@ def initialize(arg, block_size=500, encoding=nil)
end
if !@to_utf and
- @buffer.respond_to?(:force_encoding) and
+ @orig.respond_to?(:force_encoding) and
@source.respond_to?(:external_encoding) and
@source.external_encoding != ::Encoding::UTF_8
@force_utf8 = true
@@ -181,65 +162,57 @@ def initialize(arg, block_size=500, encoding=nil)
end
end
- def scan(pattern, cons=false)
- rv = super
- # You'll notice that this next section is very similar to the same
- # section in match(), but just a liiittle different. This is
- # because it is a touch faster to do it this way with scan()
- # than the way match() does it; enough faster to warrant duplicating
- # some code
- if rv.size == 0
- until @buffer =~ pattern or @source.nil?
- begin
- @buffer << readline
- rescue Iconv::IllegalSequence
- raise
- rescue
- @source = nil
- end
- end
- rv = super
+ def read(term = nil)
+ begin
+ @scanner << readline(term)
+ true
+ rescue Exception, NameError
+ @source = nil
+ false
end
- rv.taint if RUBY_VERSION < '2.7'
- rv
end
- def read
+ def read_until(term)
+ pattern = Regexp.union(term)
begin
- @buffer << readline
- rescue Exception, NameError
- @source = nil
+ until str = @scanner.scan_until(pattern)
+ @scanner << readline(term)
+ end
+ rescue EOFError
+ @scanner.rest
+ else
+ read if @scanner.eos? and !@source.eof?
+ str
end
end
- def consume( pattern )
- match( pattern, true )
+ def ensure_buffer
+ read if @scanner.eos? && @source
end
+ # Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
+ # - ">"
+ # - "XXX>" (X is any string excluding '>')
def match( pattern, cons=false )
- rv = pattern.match(@buffer)
- @buffer = $' if cons and rv
- while !rv and @source
- begin
- @buffer << readline
- rv = pattern.match(@buffer)
- @buffer = $' if cons and rv
- rescue
- @source = nil
+ while true
+ if cons
+ md = @scanner.scan(pattern)
+ else
+ md = @scanner.check(pattern)
end
+ break if md
+ return nil if pattern.is_a?(String)
+ return nil if @source.nil?
+ return nil unless read
end
- rv.taint if RUBY_VERSION < '2.7'
- rv
+
+ md.nil? ? nil : @scanner
end
def empty?
super and ( @source.nil? || @source.eof? )
end
- def position
- @er_source.pos rescue 0
- end
-
# @return the current line in the source
def current_line
begin
@@ -263,8 +236,8 @@ def current_line
end
private
- def readline
- str = @source.readline(@line_break)
+ def readline(term = nil)
+ str = @source.readline(term || @line_break)
if @pending_buffer
if str.nil?
str = @pending_buffer
@@ -290,7 +263,7 @@ def encoding_updated
@source.set_encoding(@encoding, @encoding)
end
@line_break = encode(">")
- @pending_buffer, @buffer = @buffer, ""
+ @pending_buffer, @scanner.string = @scanner.rest, ""
@pending_buffer.force_encoding(@encoding)
super
end
diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb
index d8b88e7a..5eb1e5a9 100644
--- a/lib/rexml/xpath_parser.rb
+++ b/lib/rexml/xpath_parser.rb
@@ -590,6 +590,7 @@ def filter_nodeset(nodeset)
def evaluate_predicate(expression, nodesets)
enter(:predicate, expression, nodesets) if @debug
+ new_nodeset_count = 0
new_nodesets = nodesets.collect do |nodeset|
new_nodeset = []
subcontext = { :size => nodeset.size }
@@ -606,17 +607,20 @@ def evaluate_predicate(expression, nodesets)
result = result[0] if result.kind_of? Array and result.length == 1
if result.kind_of? Numeric
if result == node.position
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+ new_nodeset_count += 1
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
end
elsif result.instance_of? Array
if result.size > 0 and result.inject(false) {|k,s| s or k}
if result.size > 0
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+ new_nodeset_count += 1
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
end
end
else
if result
- new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1)
+ new_nodeset_count += 1
+ new_nodeset << XPathNode.new(node, position: new_nodeset_count)
end
end
end
diff --git a/rexml.gemspec b/rexml.gemspec
index ceb77047..97eac657 100644
--- a/rexml.gemspec
+++ b/rexml.gemspec
@@ -55,7 +55,5 @@ Gem::Specification.new do |spec|
spec.required_ruby_version = '>= 2.5.0'
- spec.add_development_dependency "bundler"
- spec.add_development_dependency "rake"
- spec.add_development_dependency "test-unit"
+ spec.add_runtime_dependency("strscan", ">= 3.0.9")
end
diff --git a/test/formatter/test_default.rb b/test/formatter/test_default.rb
index 321d8180..aa403dbe 100644
--- a/test/formatter/test_default.rb
+++ b/test/formatter/test_default.rb
@@ -2,7 +2,7 @@ module REXMLTests
class DefaultFormatterTest < Test::Unit::TestCase
def format(node)
formatter = REXML::Formatters::Default.new
- output = ""
+ output = +""
formatter.write(node, output)
output
end
diff --git a/test/functions/test_base.rb b/test/functions/test_base.rb
index 74dc1a31..daa38156 100644
--- a/test/functions/test_base.rb
+++ b/test/functions/test_base.rb
@@ -229,8 +229,30 @@ def test_normalize_space
assert_equal( [REXML::Comment.new("COMMENT A")], m )
end
+ def test_normalize_space_strings
+ source = <<-XML
+breakfast boosts\t\t
+
+concentration
+Coffee beans
+ aroma
+
+
+
+ Dessert
+ \t\t after dinner
+ XML
+ normalized_texts = REXML::XPath.each(REXML::Document.new(source), "normalize-space(//text())").to_a
+ assert_equal([
+ "breakfast boosts concentration",
+ "Coffee beans aroma",
+ "Dessert after dinner",
+ ],
+ normalized_texts)
+ end
+
def test_string_nil_without_context
- doc = REXML::Document.new(<<-XML)
+ doc = REXML::Document.new(<<~XML)
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 55713909..8faa0b78 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start
+ r SYSTEM "urn:x-rexml:test" [ ]>
DETAIL
end
+
+ def test_no_name
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: name is missing
+Line: 3
+Position: 17
+Last 80 unconsumed characters:
+
+ DETAIL
+ end
end
class TestExternalID < self
diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb
index 9f172a28..14d0703a 100644
--- a/test/parse/test_element.rb
+++ b/test/parse/test_element.rb
@@ -41,9 +41,22 @@ def test_empty_namespace_attribute_name
assert_equal(<<-DETAIL.chomp, exception.to_s)
Invalid attribute name: <:a="">
Line: 1
-Position: 9
+Position: 13
Last 80 unconsumed characters:
+:a="">
+ DETAIL
+ end
+ def test_empty_namespace_attribute_name_with_utf8_character
+ exception = assert_raise(REXML::ParseException) do
+ parse("") # U+200B ZERO WIDTH SPACE
+ end
+ assert_equal(<<-DETAIL.chomp.force_encoding("ASCII-8BIT"), exception.to_s)
+Invalid attribute name: <:\xE2\x80\x8B>
+Line: 1
+Position: 8
+Last 80 unconsumed characters:
+:\xE2\x80\x8B>
DETAIL
end
diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb
new file mode 100644
index 00000000..e15deec6
--- /dev/null
+++ b/test/parse/test_entity_declaration.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: false
+require 'test/unit'
+require 'rexml/document'
+
+module REXMLTests
+ class TestParseEntityDeclaration < Test::Unit::TestCase
+ private
+ def xml(internal_subset)
+ <<-XML
+
+
+ XML
+ end
+
+ def parse(internal_subset)
+ REXML::Document.new(xml(internal_subset)).doctype
+ end
+
+ def test_empty
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-INTERNAL_SUBSET)
+
+ INTERNAL_SUBSET
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed notation declaration: name is missing
+Line: 5
+Position: 72
+Last 80 unconsumed characters:
+ ]>
+ DETAIL
+ end
+ end
+end
diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb
index 19a0536d..9e81b6a4 100644
--- a/test/parse/test_notation_declaration.rb
+++ b/test/parse/test_notation_declaration.rb
@@ -35,7 +35,7 @@ def test_no_name
Line: 5
Position: 72
Last 80 unconsumed characters:
- ]>
+ ]>
DETAIL
end
diff --git a/test/test_contrib.rb b/test/test_contrib.rb
index f3ad0b6c..23ee35b1 100644
--- a/test/test_contrib.rb
+++ b/test/test_contrib.rb
@@ -80,7 +80,7 @@ def test_bad_doctype_Tobias
# Peter Verhage
def test_namespace_Peter
- source = <<-EOF
+ source = <<~EOF
@@ -377,7 +377,7 @@ def test_various_xpath
end
def test_entities_Holden_Glova
- document = <<-EOL
+ document = <<~EOL
diff --git a/test/test_core.rb b/test/test_core.rb
index 7c18c03f..44e2e7ea 100644
--- a/test/test_core.rb
+++ b/test/test_core.rb
@@ -15,7 +15,7 @@ class Tester < Test::Unit::TestCase
include Helper::Fixture
include REXML
def setup
- @xsa_source = <<-EOL
+ @xsa_source = <<~EOL