1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

[ruby/csv] Add handling for ambiguous parsing options (https://github.com/ruby/csv/pull/226)

GitHub: fix GH-225

With Ruby 3.0.2 and csv 3.2.1, the file

```ruby
require "csv"
File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") }
CSV.read("example.tsv", col_sep: "\t", strip: true)
```

produces the error

```
lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful
message in line 1. (CSV::MalformedCSVError)
```

However, the CSV in this example is not malformed; instead, ambiguous
options were provided to the parser. It is not obvious (to me) whether
the string should be parsed as

- `["foo\t\tbar"]`,
- `["foo", "bar"]`,
- `["foo", "", "bar"]`, or
- `["foo", nil, "bar"]`.

This commit adds code that raises an exception when this situation is
encountered. Specifically, it checks if the column separator either ends
with or starts with the characters that would be stripped away.

This commit also adds unit tests and updates the documentation.

https://github.com/ruby/csv/commit/cc317dd42d
This commit is contained in:
adamroyjones 2021-11-18 21:20:09 +00:00 committed by Sutou Kouhei
parent 47c53af168
commit c70dc3cafb
Notes: git 2021-12-24 14:35:55 +09:00
3 changed files with 59 additions and 6 deletions

View file

@ -330,6 +330,7 @@ using CSV::MatchP if CSV.const_defined?(:MatchP)
# liberal_parsing: false, # liberal_parsing: false,
# nil_value: nil, # nil_value: nil,
# empty_value: "", # empty_value: "",
# strip: false,
# # For generating. # # For generating.
# write_headers: nil, # write_headers: nil,
# quote_empty: true, # quote_empty: true,
@ -337,7 +338,6 @@ using CSV::MatchP if CSV.const_defined?(:MatchP)
# write_converters: nil, # write_converters: nil,
# write_nil_value: nil, # write_nil_value: nil,
# write_empty_value: "", # write_empty_value: "",
# strip: false,
# } # }
# #
# ==== Options for Parsing # ==== Options for Parsing
@ -355,8 +355,9 @@ using CSV::MatchP if CSV.const_defined?(:MatchP)
# - +header_converters+: Specifies the header converters to be used. # - +header_converters+: Specifies the header converters to be used.
# - +skip_blanks+: Specifies whether blanks lines are to be ignored. # - +skip_blanks+: Specifies whether blanks lines are to be ignored.
# - +skip_lines+: Specifies how comments lines are to be recognized. # - +skip_lines+: Specifies how comments lines are to be recognized.
# - +strip+: Specifies whether leading and trailing whitespace are # - +strip+: Specifies whether leading and trailing whitespace are to be
# to be stripped from fields.. # stripped from fields. This must be compatible with +col_sep+; if it is not,
# then an +ArgumentError+ exception will be raised.
# - +liberal_parsing+: Specifies whether \CSV should attempt to parse # - +liberal_parsing+: Specifies whether \CSV should attempt to parse
# non-compliant data. # non-compliant data.
# - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field. # - +nil_value+: Specifies the object that is to be substituted for each null (no-text) field.
@ -935,6 +936,7 @@ class CSV
liberal_parsing: false, liberal_parsing: false,
nil_value: nil, nil_value: nil,
empty_value: "", empty_value: "",
strip: false,
# For generating. # For generating.
write_headers: nil, write_headers: nil,
quote_empty: true, quote_empty: true,
@ -942,7 +944,6 @@ class CSV
write_converters: nil, write_converters: nil,
write_nil_value: nil, write_nil_value: nil,
write_empty_value: "", write_empty_value: "",
strip: false,
}.freeze }.freeze
class << self class << self
@ -1760,11 +1761,11 @@ class CSV
encoding: nil, encoding: nil,
nil_value: nil, nil_value: nil,
empty_value: "", empty_value: "",
strip: false,
quote_empty: true, quote_empty: true,
write_converters: nil, write_converters: nil,
write_nil_value: nil, write_nil_value: nil,
write_empty_value: "", write_empty_value: "")
strip: false)
raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
if data.is_a?(String) if data.is_a?(String)

View file

@ -361,6 +361,7 @@ class CSV
prepare_skip_lines prepare_skip_lines
prepare_strip prepare_strip
prepare_separators prepare_separators
validate_strip_and_col_sep_options
prepare_quoted prepare_quoted
prepare_unquoted prepare_unquoted
prepare_line prepare_line
@ -531,6 +532,28 @@ class CSV
@not_line_end = Regexp.new("[^\r\n]+".encode(@encoding)) @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end end
# This method verifies that there are no (obvious) ambiguities with the
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
# and +strip+ were both equal to +\t+, then there would be no clear way to
# parse the input.
def validate_strip_and_col_sep_options
return unless @strip
if @strip.is_a?(String)
if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
raise ArgumentError,
"The provided strip (#{@escaped_strip}) and " \
"col_sep (#{@escaped_column_separator}) options are incompatible."
end
else
if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
raise ArgumentError,
"The provided strip (true) and " \
"col_sep (#{@escaped_column_separator}) options are incompatible."
end
end
end
def prepare_quoted def prepare_quoted
if @quote_character if @quote_character
@quotes = Regexp.new(@escaped_quote_character + @quotes = Regexp.new(@escaped_quote_character +

View file

@ -80,4 +80,33 @@ class TestCSVParseStrip < Test::Unit::TestCase
%Q{"a" ,"b " \r\n}, %Q{"a" ,"b " \r\n},
strip: true)) strip: true))
end end
def test_col_sep_incompatible_true
message = "The provided strip (true) and " \
"col_sep (\\t) options are incompatible."
assert_raise_with_message(ArgumentError, message) do
CSV.parse_line(%Q{"a"\t"b"\n},
col_sep: "\t",
strip: true)
end
end
def test_col_sep_incompatible_string
message = "The provided strip (\\t) and " \
"col_sep (\\t) options are incompatible."
assert_raise_with_message(ArgumentError, message) do
CSV.parse_line(%Q{"a"\t"b"\n},
col_sep: "\t",
strip: "\t")
end
end
def test_col_sep_compatible_string
assert_equal(
["a", "b"],
CSV.parse_line(%Q{\va\tb\v\n},
col_sep: "\t",
strip: "\v")
)
end
end end