8000 [ruby/csv] Add handling for ambiguous parsing options (https://github… · kou/ruby@9b9877e · GitHub
[go: up one dir, main page]

Skip to content

Commit 9b9877e

Browse files
adamroyjoneskou
authored andcommitted
[ruby/csv] Add handling for ambiguous parsing options (ruby/csv#226)
GitHub: fix rubyGH-225 With Ruby 3.0.2 and csv 3.2.1, the file ```ruby require "csv" File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") } CSV.read("example.tsv", col_sep: "\t", strip: true) ``` produces the error ``` lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful message in line 1. (CSV::MalformedCSVError) ``` However, the CSV in this example is not malformed; instead, ambiguous options were provided to the parser. It is not obvious (to me) whether the string should be parsed as - `["foo\t\tbar"]`, - `["foo", "bar"]`, - `["foo", "", "bar"]`, or - `["foo", nil, "bar"]`. This commit adds code that raises an exception when this situation is encountered. Specifically, it checks if the column separator either ends with or starts with the characters that would be stripped away. This commit also adds unit tests and updates the documentation. ruby/csv@cc317dd42d
1 parent 6b5308e commit 9b9877e

File tree

3 files changed

+59
-6
lines changed

3 files changed

+59
-6
lines changed

lib/csv.rb

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -330,14 +330,14 @@
330330
# liberal_parsing: false,
331331
# nil_value: nil,
332332
# empty_value: "",
333+
# strip: false,
333334
# # For generating.
334335
# write_headers: nil,
335336
# quote_empty: true,
336337
# force_quotes: false,
337338
# write_converters: nil,
338339
# write_nil_value: nil,
339340
# write_empty_value: "",
340-
# strip: false,
341341
# }
342342
#
343343
# ==== Options for Parsing
@@ -355,8 +355,9 @@
355355
# - +header_converters+: Specifies the header converters to be used.
356356
# - +skip_blanks+: Specifies whether blanks lines are to be ignored.
357357
# - +skip_lines+: Specifies how comments lines are to be recognized.
358-
# - +strip+: Specifies whether leading and trailing whitespace are
359-
# to be stripped from fields..
358+
# - +strip+: Specifies whether leading and trailing whitespace are to be
359+
# stripped from fields. This must be compatible with +col_sep+; if it is not,
360+
# then an +ArgumentError+ exception will be raised.
360361
# - +liberal_parsing+: Specifies whether \CSV should attempt to parse
361362
# non-compliant data.
362363
# - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field.
@@ -935,14 +936,14 @@ def initialize(message, line_number)
935936
liberal_parsing: false,
936937
nil_value: nil,
937938
empty_value: "",
939+
strip: false,
938940
# For generating.
939941
write_headers: nil,
940942
quote_empty: true,
941943
force_quotes: false,
942944
write_converters: nil,
943945
write_nil_value: nil,
944946
write_empty_value: "",
945-
strip: false,
946947
}.freeze
947948

948949
class << self
@@ -1760,11 +1761,11 @@ def initialize(data,
17601761
encoding: nil,
17611762
nil_value: nil,
17621763
empty_value: "",
1764+
strip: false,
17631765
quote_empty: true,
17641766
write_converters: nil,
17651767
write_nil_value: nil,
1766-
write_empty_value: "",
1767-
strip: false)
1768+
write_empty_value: "")
17681769
raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
17691770

17701771
if data.is_a?(String)

lib/csv/parser.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ def prepare
361361
prepare_skip_lines
362362
prepare_strip
363363
prepare_separators
364+
validate_strip_and_col_sep_options
364365
prepare_quoted
365366
prepare_unquoted
366367
prepare_line
@@ -531,6 +532,28 @@ def prepare_separators
531532
@not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
532533
end
533534

535+
# This method verifies that there are no (obvious) ambiguities with the
536+
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
537+
# and +strip+ were both equal to +\t+, then there would be no clear way to
538+
# parse the input.
539+
def validate_strip_and_col_sep_options
540+
return unless @strip
541+
542+
if @strip.is_a?(String)
543+
if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
544+
raise ArgumentError,
545+
"The provided strip (#{@escaped_strip}) and " \
546+
"col_sep (#{@escaped_column_separator}) options are incompatible."
547+
end
548+
else
549+
if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
550+
raise ArgumentError,
551+
"The provided strip (true) and " \
552+
"col_sep (#{@escaped_column_separator}) options are incompatible."
553+
end
554+
end
555+
end
556+
534557
def prepare_quoted
535558
if @quote_character
536559
@quotes = Regexp.new(@escaped_quote_character +

test/csv/parse/test_strip.rb

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,33 @@ def test_do_not_strip_crlf
8080
%Q{"a" ,"b " \r\n},
8181
strip: true))
8282
end
83+
84+
def test_col_sep_incompatible_true
85+
message = "The provided strip (true) and " \
86+
"col_sep (\\t) options are incompatible."
87+
assert_raise_with_message(ArgumentError, message) do
88+
CSV.parse_line(%Q{"a"\t"b"\n},
89+
col_sep: "\t",
90+
strip: true)
91+
end
92+
end
93+
94+
def test_col_sep_incompatible_string
95+
message = "The provided strip (\\t) and " \
96+
"col_sep (\\t) options are incompatible."
97+
assert_raise_with_message(ArgumentError, message) do
98+
CSV.parse_line(%Q{"a"\t"b"\n},
99+
col_sep: "\t",
100+
strip: "\t")
101+
end
102+
end
103+
104+
def test_col_sep_compatible_string
105+
assert_equal(
106+
["a", "b"],
107+
CSV.parse_line(%Q{\va\tb\v\n},
108+
col_sep: "\t",
109+
strip: "\v")
110+
)
111+
end
83112
end

0 commit comments

Comments
 (0)
0