1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/lib/csv/parser.rb
adamroyjones c70dc3cafb [ruby/csv] Add handling for ambiguous parsing options (https://github.com/ruby/csv/pull/226)
GitHub: fix GH-225

With Ruby 3.0.2 and csv 3.2.1, the file

```ruby
require "csv"
File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") }
CSV.read("example.tsv", col_sep: "\t", strip: true)
```

produces the error

```
lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful
message in line 1. (CSV::MalformedCSVError)
```

However, the CSV in this example is not malformed; instead, ambiguous
options were provided to the parser. It is not obvious (to me) whether
the string should be parsed as

- `["foo\t\tbar"]`,
- `["foo", "bar"]`,
- `["foo", "", "bar"]`, or
- `["foo", nil, "bar"]`.

This commit adds code that raises an exception when this situation is
encountered. Specifically, it checks if the column separator either ends
with or starts with the characters that would be stripped away.

This commit also adds unit tests and updates the documentation.

cc317dd42d
2021-12-24 14:35:33 +09:00

1167 lines
32 KiB
Ruby

# frozen_string_literal: true
require "strscan"
require_relative "delete_suffix"
require_relative "input_record_separator"
require_relative "match_p"
require_relative "row"
require_relative "table"
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
using CSV::MatchP if CSV.const_defined?(:MatchP)
class CSV
# Note: Don't use this class directly. This is an internal class.
class Parser
#
# A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
# or String object being read from or written to. Your data is never transcoded
# (unless you ask Ruby to transcode it for you) and will literally be parsed in
# the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
# Encoding of your data. This is accomplished by transcoding the parser itself
# into your Encoding.
#
# Raised when encoding is invalid.
class InvalidEncoding < StandardError; end
#
# CSV::Scanner receives a CSV output, scans it and returns the content.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
#
# Uses StringScanner (the official strscan gem). Strscan provides lexical
# scanning operations on a String. We inherit from it and take advantage
# of its methods. For more information, please visit:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
#
class Scanner < StringScanner
  # +scan_all+ is just another name for StringScanner#scan here.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of positions remembered by keep_start.
    @keeps = []
  end

  # Yields each +row_separator+ terminated line of the unscanned input.
  # The scan pointer is moved past a line before that line is yielded.
  def each_line(row_separator)
    cursor = pos
    rest.each_line(row_separator) do |chunk|
      cursor += chunk.bytesize
      self.pos = cursor
      yield(chunk)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps << pos
  end

  # Returns the bytes scanned since the most recent keep_start and
  # forgets that position.
  def keep_end
    from = @keeps.pop
    string.byteslice(from, pos - from)
  end

  # Moves the scan pointer back to the most recently remembered position.
  def keep_back
    self.pos = @keeps.pop
  end

  # Forgets the most recently remembered position.
  def keep_drop
    @keeps.pop
  end
end
#
# CSV::InputsScanner receives IO inputs, encoding and the chunk_size.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
#
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
# If there's a match, the scanner advances the “scan pointer” and returns the matched string.
# Otherwise, the scanner returns nil.
#
# CSV::InputsScanner.rest() returns the “rest” of the string (i.e. everything after the scan pointer).
# If there is no more data (eos? = true), it returns "".
#
class InputsScanner
# inputs::     the IO-like objects to read from, in order (copied, so the
#              caller's Array is left untouched)
# encoding::   used when an empty scanner has to be created
# chunk_size:: passed to +gets+ when reading from a non-StringIO input
def initialize(inputs, encoding, chunk_size: 8192)
  @inputs = inputs.dup
  @last_scanner = @inputs.empty?
  @encoding, @chunk_size = encoding, chunk_size
  @keeps = []
  read_chunk
end
# Yields every +row_separator+ terminated line of the remaining input,
# pulling in further chunks through +read_chunk+ until none is left.
# A line that is not terminated within the data seen so far is held in
# +buffer+ and completed with data from the following chunk(s); a trailing
# line without a row separator is yielded last.
def each_line(row_separator)
# Incomplete line carried over from the data processed so far, if any.
buffer = nil
input = @scanner.rest
position = @scanner.pos
# Set to minus the buffer size after a new chunk has been read, so that
# the carried-over bytes are not added to the position in that chunk.
offset = 0
n_row_separator_chars = row_separator.size
while true
input.each_line(row_separator) do |line|
@scanner.pos += line.bytesize
if buffer
# A two character row separator can end up split between the buffered
# text and this line: the buffer then ends with its first character
# and this line starts with its second one.
if n_row_separator_chars == 2 and
buffer.end_with?(row_separator[0]) and
line.start_with?(row_separator[1])
buffer << line[0]
line = line[1..-1]
position += buffer.bytesize + offset
@scanner.pos = position
offset = 0
yield(buffer)
buffer = nil
# Nothing but the second separator character was in this piece.
next if line.empty?
else
# Otherwise the buffered text is simply the beginning of this line.
buffer << line
line = buffer
buffer = nil
end
end
if line.end_with?(row_separator)
position += line.bytesize + offset
@scanner.pos = position
offset = 0
yield(line)
else
# Not terminated yet: hold it back until more data is available.
buffer = line
end
end
break unless read_chunk
input = @scanner.rest
position = @scanner.pos
offset = -buffer.bytesize if buffer
end
# The input ended without a final row separator.
yield(buffer) if buffer
end
# Tries to match +pattern+ at the current position and returns the matched
# String or nil. After a successful match that consumed the whole current
# chunk, the next chunk is loaded (unless this is the last scanner).
def scan(pattern)
  matched = @scanner.scan(pattern)
  return matched if @last_scanner
  return nil unless matched

  read_chunk if @scanner.eos?
  matched
end
# Like +scan+, but when the match reaches the end of the current chunk it
# keeps matching +pattern+ at the start of the following chunks and
# appends what it finds. Returns the combined String, or nil when nothing
# matched at the current position.
def scan_all(pattern)
  collected = @scanner.scan(pattern)
  return collected if @last_scanner || collected.nil?

  while @scanner.eos? && read_chunk
    piece = @scanner.scan(pattern)
    break unless piece
    collected << piece
  end
  collected
end
# True when the current scanner has nothing left to scan.
def eos?
  @scanner.eos?
end

# Remembers the current position. The second slot is filled by read_chunk
# with the kept data if the keep outlives the current chunk.
def keep_start
  @keeps << [@scanner.pos, nil]
end

# Ends the most recent keep and returns everything scanned since it was
# started, including data saved from earlier chunks.
def keep_end
  origin, carried = @keeps.pop
  tail = @scanner.string.byteslice(origin, @scanner.pos - origin)
  carried ? carried << tail : tail
end
# Abandons the most recent keep and rewinds to where it was started.
def keep_back
start, buffer = @keeps.pop
if buffer
# The keep holds data saved from before the current string (read_chunk
# stores it in the keep's second slot). Put the current string from
# +start+ onwards back at the front of the inputs and resume scanning
# from the saved data.
string = @scanner.string
keep = string.byteslice(start, string.bytesize - start)
if keep and not keep.empty?
@inputs.unshift(StringIO.new(keep))
@last_scanner = false
end
@scanner = StringScanner.new(buffer)
else
# The keep started in the current string: moving the pointer is enough.
@scanner.pos = start
end
read_chunk if @scanner.eos?
end
# Forgets the most recent keep without moving the scan pointer.
def keep_drop; @keeps.pop; end

# Returns the not yet scanned part of the current chunk.
def rest; @scanner.rest; end
private
# Replaces @scanner with a scanner over the next piece of input.
# Returns true when a scanner was set up from further data and false when
# there is nothing more to read. Raises InvalidEncoding when the data that
# was read is not valid in its encoding.
def read_chunk
return false if @last_scanner
unless @keeps.empty?
# The current string is about to be replaced, so save the part of it
# that the most recent keep covers so far in that keep's buffer.
keep = @keeps.last
keep_start = keep[0]
string = @scanner.string
keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
if keep_data
keep_buffer = keep[1]
if keep_buffer
keep_buffer << keep_data
else
keep[1] = keep_data.dup
end
end
# From now on the keep continues at the beginning of the new string.
keep[0] = 0
end
input = @inputs.first
case input
when StringIO
# A StringIO is consumed in one go instead of chunk by chunk.
string = input.read
raise InvalidEncoding unless string.valid_encoding?
@scanner = StringScanner.new(string)
@inputs.shift
@last_scanner = @inputs.empty?
true
else
# Any other input is read with gets, limited to @chunk_size.
chunk = input.gets(nil, @chunk_size)
if chunk
raise InvalidEncoding unless chunk.valid_encoding?
@scanner = StringScanner.new(chunk)
# Drop the input right away if it can tell that it is exhausted.
if input.respond_to?(:eof?) and input.eof?
@inputs.shift
@last_scanner = @inputs.empty?
end
true
else
# This input is exhausted: continue with the next one, if any.
@scanner = StringScanner.new("".encode(@encoding))
@inputs.shift
@last_scanner = @inputs.empty?
if @last_scanner
false
else
read_chunk
end
end
end
end
end
# input::   the object CSV data is read from
# options:: the parsing options; they are read by the prepare_* methods
def initialize(input, options)
  @input, @options = input, options
  # Data read from the input while detecting the row separator.
  @samples = []
  prepare
end
# The column separator, encoded in the parser's encoding.
def column_separator; @column_separator; end

# The row separator, encoded in the parser's encoding.
def row_separator; @row_separator; end

# The quote character, or nil when quoting is disabled.
def quote_character; @quote_character; end

# The value of the :field_size_limit option.
def field_size_limit; @field_size_limit; end

# The prepared :skip_lines matcher (String, Regexp, object responding to
# #match, or nil).
def skip_lines; @skip_lines; end

# The value of the :unconverted_fields option.
def unconverted_fields?; @unconverted_fields; end

# The headers in use, or nil when none are known (yet).
def headers; @headers; end

# Truthy while headers are used but have not been determined yet.
def header_row?
  @use_headers && @headers.nil?
end

# The value of the :return_headers option.
def return_headers?; @return_headers; end

# The value of the :skip_blanks option.
def skip_blanks?; @skip_blanks; end

# Whether liberal parsing is enabled.
def liberal_parsing?; @liberal_parsing; end

# The current line number counter.
def lineno; @lineno; end

# The most recently parsed line.
def line
  last_line
end
# Parses the input and yields each row to the block. Without a block an
# Enumerator is returned instead.
#
# Raises MalformedCSVError when the input contains an invalid byte
# sequence.
def parse(&block)
  return to_enum(__method__) unless block_given?

  # Headers that were given through the options are emitted first when
  # :return_headers is set.
  if @return_headers && @headers && @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    # Without a scanner the failure happened before any line was consumed,
    # so the broken line is the one after the current line number.
    if @scanner
      ignore_broken_line
      broken_lineno = @lineno
    else
      broken_lineno = @lineno + 1
    end
    raise MalformedCSVError.new("Invalid byte sequence in #{@encoding}",
                                broken_lineno)
  end
end
# Whether rows are returned with headers (set by prepare_header).
def use_headers?; @use_headers; end
private
# A set of tasks to prepare the file in order to parse it
# Runs every preparation step once, in a fixed order. Later steps read
# instance variables set by earlier ones (for example, prepare_strip and
# prepare_separators run before validate_strip_and_col_sep_options, which
# checks their results), so the order must be kept.
def prepare
prepare_variable
prepare_quote_character
prepare_backslash
prepare_skip_lines
prepare_strip
prepare_separators
validate_strip_and_col_sep_options
prepare_quoted
prepare_unquoted
prepare_line
prepare_header
prepare_parser
end
# Copies the simple options into instance variables and decides whether
# liberal parsing (which needs the robust parser) is enabled.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]

  liberal = @options[:liberal_parsing]
  if !liberal
    @liberal_parsing = false
    @backslash_quote = false
  else
    @liberal_parsing = true
    # A Hash value carries the fine grained liberal parsing switches.
    if liberal.is_a?(Hash)
      @double_quote_outside_quote = liberal[:double_quote_outside_quote]
      @backslash_quote = liberal[:backslash_quote]
    else
      @double_quote_outside_quote = false
      @backslash_quote = false
    end
    @need_robust_parsing = true
  end

  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
# Reads the :quote_character option and derives the values built from it.
# A nil quote character disables quoting.
#
# Raises ArgumentError unless the quote character is exactly one
# character long.
def prepare_quote_character
  quote = @options[:quote_character]
  if quote.nil?
    @quote_character = @escaped_quote_character = @escaped_quote = nil
    return
  end

  @quote_character = quote.to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError, ":quote_char has to be nil or a single character String"
  end
  @double_quote_character = @quote_character * 2
  @escaped_quote_character = Regexp.escape(@quote_character)
  @escaped_quote = Regexp.new(@escaped_quote_character)
end
# Prepares the values needed for the +backslash_quote+ liberal parsing
# option. Does nothing unless that option is enabled.
#
# Sets @backslash_character, @escaped_backslash_character,
# @escaped_backslash (a Regexp) and @backslash_quote_character (nil when
# there is no quote character).
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)
  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  if @quote_character.nil?
    @backslash_quote_character = nil
  else
    # This value is used as a String pattern for gsub!, which matches it
    # literally, so it must be built from the raw quote character. The
    # Regexp-escaped form would add a second backslash for quote characters
    # such as "|" or "*" and never match.
    @backslash_quote_character = @backslash_character + @quote_character
  end
end
# Reads the :skip_lines option. A String is encoded in the parser's
# encoding; a Regexp, nil or any object responding to #match is stored
# unchanged.
#
# Raises ArgumentError for any other value.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  @skip_lines =
    if matcher.is_a?(String)
      matcher.encode(@encoding)
    else
      unless matcher.nil? || matcher.is_a?(Regexp) || matcher.respond_to?(:match)
        raise ArgumentError,
              ":skip_lines has to respond to \#match: #{matcher.inspect}"
      end
      matcher
    end
end
# Reads the :strip option. A String value must be exactly one character
# and names the character to strip; any other truthy value strips
# " \t\f\v". Stripping always requires the robust parser.
#
# Raises ArgumentError for an empty String or one with two or more
# characters.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = @strip_value = @rstrip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    if @strip.empty?
      raise ArgumentError, ":strip must not be an empty String"
    elsif @strip.length > 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip + "+".encode(@encoding))
      @rstrip_value = Regexp.new(@escaped_strip + "+\\z".encode(@encoding))
    end
  else
    strip_values = " \t\f\v"
    # Stored as a plain list of characters, not as a regexp source.
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
      @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
    end
  end
  @need_robust_parsing = true
end
# Whether StringScanner#scan accepts a String pattern. It is probed once
# by calling it with a String and checking for a TypeError.
STRING_SCANNER_SCAN_ACCEPT_STRING =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
# Reads the column and row separator options and builds the patterns used
# to recognize them.
#
# Raises ArgumentError when the column separator is empty.
def prepare_separators
  raw_column_separator = @options[:column_separator]
  @column_separator = raw_column_separator.to_s.encode(@encoding)
  if @column_separator.size < 1
    raise ArgumentError,
          ":col_sep must be 1 or more characters: " + raw_column_separator.inspect
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size == 1
    # A plain String is used as the pattern where StringScanner allows it.
    @column_end =
      if STRING_SCANNER_SCAN_ACCEPT_STRING
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  else
    @column_end = Regexp.new(@escaped_column_separator)
    # One pattern per separator character, for matching piece by piece.
    @column_ends = @column_separator.each_char.map do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators =
      Regexp.new(@escaped_first_column_separator + "+".encode(@encoding))
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends =
    if @row_separator.size > 1
      @row_separator.each_char.map { |char| Regexp.new(Regexp.escape(char)) }
    end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
# This method verifies that there are no (obvious) ambiguities with the
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
# and +strip+ were both equal to +\t+, then there would be no clear way to
# parse the input.
# Raises ArgumentError when the column separator starts or ends with a
# character that the +strip+ option would remove. Does nothing when
# stripping is disabled.
def validate_strip_and_col_sep_options
  return unless @strip

  if @strip.is_a?(String)
    ambiguous = @column_separator.start_with?(@strip) ||
                @column_separator.end_with?(@strip)
    strip_label = @escaped_strip
  else
    # Here @escaped_strip is the list of characters that get stripped.
    edge = Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z")
    ambiguous = edge.match?(@column_separator)
    strip_label = "true"
  end
  return unless ambiguous

  raise ArgumentError,
        "The provided strip (#{strip_label}) and " \
        "col_sep (#{@escaped_column_separator}) options are incompatible."
end
# Builds the patterns used for quoted values and the pattern the line
# based parsers use to split a line into columns.
#
# Sets @quotes and @quoted_value when a quote character is configured and
# always sets @split_column_separator.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    no_quoted_values = @escaped_quote_character.dup
    if @backslash_quote
      no_quoted_values << @escaped_backslash_character
    end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end
  if @escaped_strip
    # @escaped_strip is either one escaped character (:strip given as a
    # String) or the plain list of characters stripped for strip: true.
    # Putting it in a character class makes "*" apply to any run of those
    # characters in both cases; without the class, the list form would only
    # match that exact sequence of characters.
    strip_run = "[".encode(@encoding) +
                @escaped_strip +
                "]*".encode(@encoding)
    @split_column_separator = Regexp.new(strip_run +
                                         @escaped_column_separator +
                                         strip_run)
  elsif @column_separator == " ".encode(@encoding)
    # String#split treats a single space String as "split on whitespace",
    # so a Regexp is needed to split on exactly one space.
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
# Builds @unquoted_value, the pattern for a run of characters that may
# appear in an unquoted field: anything except CR, LF, the first character
# of the column separator and, unless liberal parsing is enabled, the
# quote character. Does nothing when quoting is disabled.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  opening = "[^".encode(@encoding)
  closing = "]+".encode(@encoding)
  @unquoted_value = Regexp.new(opening + excluded + closing)
end
# Returns the row separator to use as a String in the parser's encoding.
#
# When +separator+ is :auto, the separator is detected from the input:
# a StringIO is read completely and rewound to its previous position,
# while other inputs responding to +gets+ are sampled chunk by chunk. The
# samples read that way are appended to @samples. If no line ending is
# found, InputRecordSeparator.value is used.
#
# Any other +separator+ is converted with +to_s+ and encoded.
def resolve_row_separator(separator)
if separator == :auto
cr = "\r".encode(@encoding)
lf = "\n".encode(@encoding)
if @input.is_a?(StringIO)
pos = @input.pos
separator = detect_row_separator(@input.read, cr, lf)
@input.seek(pos)
elsif @input.respond_to?(:gets)
if @input.is_a?(File)
chunk_size = 32 * 1024
else
chunk_size = 1024
end
begin
while separator == :auto
#
# if we run out of data, it's probably a single line
# (the fallback after this block will set the default value)
#
break unless sample = @input.gets(nil, chunk_size)
# extend sample if we're unsure of the line ending
# (a CR at the very end may be the first half of a CR+LF)
if sample.end_with?(cr)
sample << (@input.gets(nil, 1) || "")
end
@samples << sample
separator = detect_row_separator(sample, cr, lf)
end
rescue IOError
# do nothing: the fallback below will set the default
end
end
# Nothing was detected (or the input could not be sampled).
separator = InputRecordSeparator.value if separator == :auto
end
separator.to_s.encode(@encoding)
end
# Guesses the row separator from the first line ending found in +sample+.
#
# sample:: the text to inspect
# cr, lf:: the carriage return and line feed characters to look for
#
# Returns +cr+ + +lf+, +cr+ or +lf+, or :auto when +sample+ contains
# neither character.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only a CR in front of the first LF matters when there is an LF.
  before_lf = lf_index ? sample[0, lf_index] : sample
  cr_index = before_lf.index(cr)

  return (lf_index ? lf : :auto) unless cr_index
  return cr unless lf_index

  cr_index + 1 == lf_index ? cr + lf : cr
end
# Resets the line bookkeeping: no line has been read and no scanner has
# been built yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
# Returns the most recently parsed line. When a scanner exists and no
# line has been recorded, the line is taken from the scanner's current
# keep and remembered.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
# Reads the :headers and :return_headers options.
#
# An Array is used as the headers directly and a String is parsed into
# them. Any other truthy value enables headers without providing them
# yet; nil or false disables headers.
def prepare_header
  @return_headers = @options[:return_headers]
  headers_option = @options[:headers]

  @raw_headers =
    case headers_option
    when Array then headers_option
    when String then parse_headers(headers_option)
    end
  @use_headers = headers_option ? true : false
  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
# Parses a header line given as a String, using this parser's column
# separator, row separator and quote character.
def parse_headers(row)
  separators = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **separators)
end
# Runs +headers+ through the header fields converter and freezes every
# String among the results. Returns the converted headers.
def adjust_headers(headers)
  converted = @header_fields_converter.convert(headers, nil, @lineno)
  converted.each do |header|
    header.freeze if String === header
  end
  converted
end
# Records whether the beginning of the input appears to contain quotes.
def prepare_parser
  @may_quoted = may_quoted?
end

# Looks for the quote character within the first 128 characters of the
# input (for a StringIO, read from the current position, which is
# restored afterwards) or of the first sample. Returns a truthy value
# when one is found.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    saved_pos = @input.pos
    sample = @input.read
    @input.seek(saved_pos)
  elsif @samples.empty?
    return false
  else
    sample = @samples.first
  end
  sample[0, 128].index(@quote_character)
end
# True when the CSV_PARSER_SCANNER_TEST environment variable is "yes".
# It selects which of the two build_scanner definitions below is used.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
# Wraps a String in a StringIO but exposes only +gets+, +each_line+ and
# +eof?+. As it is not a StringIO itself, code that checks for StringIO
# does not recognize it.
class UnoptimizedStringIO
  def initialize(string)
    mode = "rb:#{string.encoding}"
    @io = StringIO.new(string, mode)
  end

  def gets(*arguments)
    @io.gets(*arguments)
  end

  def each_line(*arguments, &consumer)
    @io.each_line(*arguments, &consumer)
  end

  def eof?
    @io.eof?
  end
end
# Chunk size used by the test scanner; defaults to 1.
SCANNER_TEST_CHUNK_SIZE =
  Integer(ENV.fetch("CSV_PARSER_SCANNER_TEST_CHUNK_SIZE", "1"), 10)

# Test variant: always builds an InputsScanner reading in chunks of
# SCANNER_TEST_CHUNK_SIZE, with the samples and a StringIO input wrapped
# in UnoptimizedStringIO.
def build_scanner
  inputs = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
  inputs <<
    if @input.is_a?(StringIO)
      UnoptimizedStringIO.new(@input.read)
    else
      @input
    end
  InputsScanner.new(inputs,
                    @encoding,
                    chunk_size: SCANNER_TEST_CHUNK_SIZE)
end
else
# Builds the scanner for the input. When all data is available as one
# String (a StringIO input without samples, or a single sample from an
# input that is already at EOF), a Scanner over that String is used;
# otherwise an InputsScanner reads the samples followed by the input.
#
# Raises MalformedCSVError when the complete String contains a line with
# an invalid byte sequence.
def build_scanner
  whole_input =
    if @samples.empty? && @input.is_a?(StringIO)
      @input.read
    elsif @samples.size == 1 && @input.respond_to?(:eof?) && @input.eof?
      @samples[0]
    end

  unless whole_input
    inputs = @samples.map { |sample| StringIO.new(sample) }
    inputs << @input
    return InputsScanner.new(inputs, @encoding)
  end

  unless whole_input.valid_encoding?
    # Report the first line that is broken.
    broken_index = whole_input.lines(@row_separator).index do |line|
      !line.valid_encoding?
    end
    if broken_index
      message = "Invalid byte sequence in #{@encoding}"
      raise MalformedCSVError.new(message, @lineno + broken_index + 1)
    end
  end
  Scanner.new(whole_input)
end
end
# Consumes the lines at the current position that match :skip_lines and
# counts them in @lineno. Stops, with the scanner rewound to the start of
# the line, at the first line that must not be skipped.
def skip_needless_lines
  return unless @skip_lines

  until @scanner.eos?
    @scanner.keep_start
    candidate = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    candidate << @row_separator if parse_row_end
    unless skip_line?(candidate)
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
# Tells whether +line+, without its row separator, matches :skip_lines.
# A String matcher matches lines that include it, a Regexp is tested with
# +match?+ and any other matcher is asked through +match+.
def skip_line?(line)
  content = line.delete_suffix(@row_separator)
  if @skip_lines.is_a?(String)
    content.include?(@skip_lines)
  elsif @skip_lines.is_a?(Regexp)
    @skip_lines.match?(content)
  else
    @skip_lines.match(content)
  end
end
# Line based parser used when quoting is disabled: every line is split on
# the column separator, and empty fields become nil.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines and skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks
      fields = []
    else
      fields = strip_value(content).split(@split_column_separator, -1)
      fields.map! { |field| field.empty? ? nil : field }
    end
    @last_line = raw_line
    emit_row(fields, &block)
  end
end
# Fast, line based parser for input that may contain quotes. Each line is
# split on the column separator; a column may be completely unquoted or
# wrapped in exactly one pair of quotes. As soon as a line does not fit
# that shape, the scanner is rewound to the start of that line and the
# rest of the input is handed to parse_quotable_robust.
def parse_quotable_loose(&block)
# A keep is open for the whole time a line is processed so that the line
# can be re-read from its start (keep_back) if needed.
@scanner.keep_start
@scanner.each_line(@row_separator) do |line|
if @skip_lines and skip_line?(line)
# Move the keep to the start of the next line.
@scanner.keep_drop
@scanner.keep_start
next
end
original_line = line
line = line.delete_suffix(@row_separator)
if line.empty?
if @skip_blanks
@scanner.keep_drop
@scanner.keep_start
next
end
row = []
elsif line.include?(@cr) or line.include?(@lf)
# A CR or LF inside the line cannot be handled by simple splitting.
@scanner.keep_back
@need_robust_parsing = true
return parse_quotable_robust(&block)
else
row = line.split(@split_column_separator, -1)
n_columns = row.size
i = 0
while i < n_columns
column = row[i]
if column.empty?
row[i] = nil
else
n_quotes = column.count(@quote_character)
if n_quotes.zero?
# no quote
elsif n_quotes == 2 and
column.start_with?(@quote_character) and
column.end_with?(@quote_character)
# Simple quoted column: remove the surrounding quotes.
row[i] = column[1..-2]
else
# Any other use of quotes needs the robust parser.
@scanner.keep_back
@need_robust_parsing = true
return parse_quotable_robust(&block)
end
end
i += 1
end
end
@scanner.keep_drop
@scanner.keep_start
@last_line = original_line
emit_row(row, &block)
end
@scanner.keep_drop
end
# Parser that scans the input column by column. It is used when
# @need_robust_parsing is set and as the fallback of parse_quotable_loose.
#
# Raises MalformedCSVError when a field reaches the field size limit or
# when a column is followed by something that is neither a column
# separator, a row separator nor the end of the input.
def parse_quotable_robust(&block)
row = []
skip_needless_lines
start_row
while true
# These flags are set by parse_quoted_column_value and
# parse_unquoted_column_value; they select the error message below.
@quoted_column_value = false
@unquoted_column_value = false
@scanner.scan_all(@strip_value) if @strip_value
value = parse_column_value
if value
@scanner.scan_all(@strip_value) if @strip_value
if @field_size_limit and value.size >= @field_size_limit
ignore_broken_line
raise MalformedCSVError.new("Field size exceeded", @lineno)
end
end
if parse_column_end
row << value
elsif parse_row_end
if row.empty? and value.nil?
# A blank line.
emit_row([], &block) unless @skip_blanks
else
row << value
emit_row(row, &block)
row = []
end
skip_needless_lines
start_row
elsif @scanner.eos?
# End of input: emit the last row unless nothing was read for it.
break if row.empty? and value.nil?
row << value
emit_row(row, &block)
break
else
# Something unexpected follows the column: skip the rest of the line
# and report the most specific problem that can be identified.
if @quoted_column_value
ignore_broken_line
message = "Any value after quoted field isn't allowed"
raise MalformedCSVError.new(message, @lineno)
elsif @unquoted_column_value and
(new_line = @scanner.scan(@line_end))
ignore_broken_line
message = "Unquoted fields do not allow new line " +
"<#{new_line.inspect}>"
raise MalformedCSVError.new(message, @lineno)
elsif @scanner.rest.start_with?(@quote_character)
ignore_broken_line
message = "Illegal quoting"
raise MalformedCSVError.new(message, @lineno)
elsif (new_line = @scanner.scan(@line_end))
ignore_broken_line
message = "New line must be <#{@row_separator.inspect}> " +
"not <#{new_line.inspect}>"
raise MalformedCSVError.new(message, @lineno)
else
ignore_broken_line
raise MalformedCSVError.new("TODO: Meaningful message",
@lineno)
end
end
end
end
# Parses one column value at the current position and returns it, or nil
# when there is none.
#
# Without liberal parsing a value is either quoted or unquoted; the kind
# suggested by @may_quoted is tried first. With liberal parsing a quoted
# value may be followed directly by unquoted text, in which case the
# quotes are kept as part of the returned value.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  @scanner.scan_all(@strip_value) if @strip_value
  trailing = parse_unquoted_column_value
  return quoted unless trailing

  if @double_quote_outside_quote
    trailing = trailing.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing if quoted.empty?
  end
  @quote_character + quoted + @quote_character + trailing
end
# Scans an unquoted column value at the current position.
# Returns the value, or nil when no unquoted value starts here. When a
# value is found, @unquoted_column_value is set to true.
def parse_unquoted_column_value
value = @scanner.scan_all(@unquoted_value)
return nil unless value
@unquoted_column_value = true
# Only set for column separators of two or more characters. The scan
# above stops at the separator's first character even when that
# character does not start a complete separator.
if @first_column_separators
while true
# Check, without consuming anything, whether the complete column
# separator comes next.
@scanner.keep_start
is_column_end = @column_ends.all? do |column_end|
@scanner.scan(column_end)
end
@scanner.keep_back
break if is_column_end
# Not a separator: the character(s) are part of the value, as is the
# unquoted text that follows them.
sub_separator = @scanner.scan_all(@first_column_separators)
break if sub_separator.nil?
value << sub_separator
sub_value = @scanner.scan_all(@unquoted_value)
break if sub_value.nil?
value << sub_value
end
end
# Replace backslash escaped quotes with plain quote characters.
value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
# Remove trailing strip characters (only set when stripping is enabled).
if @rstrip_value
value.gsub!(@rstrip_value, "")
end
value
end
# Scans a quoted column value at the current position.
# Returns the value without its surrounding quotes and with doubled
# quotes reduced to single ones, or nil when no quote character is found
# here. When a value is found, @quoted_column_value is set to true.
#
# Raises MalformedCSVError when the closing quote is missing.
def parse_quoted_column_value
quotes = @scanner.scan_all(@quotes)
return nil unless quotes
@quoted_column_value = true
n_quotes = quotes.size
if (n_quotes % 2).zero?
# An even run is an opening quote, pairs of doubled quotes and the
# closing quote: the value consists of one quote per pair.
quotes[0, (n_quotes - 2) / 2]
else
# An odd run is an opening quote followed by pairs of doubled quotes;
# the value continues after it.
value = quotes[0, (n_quotes - 1) / 2]
while true
quoted_value = @scanner.scan_all(@quoted_value)
value << quoted_value if quoted_value
if @backslash_quote
if @scanner.scan(@escaped_backslash)
# A backslash followed by a quote yields the quote character; any
# other backslash is kept as it is.
if @scanner.scan(@escaped_quote)
value << @quote_character
else
value << @backslash_character
end
next
end
end
quotes = @scanner.scan_all(@quotes)
unless quotes
ignore_broken_line
message = "Unclosed quoted field"
raise MalformedCSVError.new(message, @lineno)
end
n_quotes = quotes.size
if n_quotes == 1
# The closing quote.
break
elsif (n_quotes % 2) == 1
# Pairs of doubled quotes followed by the closing quote.
value << quotes[0, (n_quotes - 1) / 2]
break
else
# Only pairs of doubled quotes: the value continues.
value << quotes[0, n_quotes / 2]
end
end
value
end
end
# Consumes a column separator at the current position. Returns true when
# one was found and false, without moving the scan pointer, otherwise.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  # Try again one separator character at a time.
  @scanner.keep_start
  matched = @column_ends.all? { |piece| @scanner.scan(piece) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
# Consumes a row separator at the current position. Returns true when one
# was found and false, without moving the scan pointer, otherwise.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  # Try again one separator character at a time.
  @scanner.keep_start
  matched = @row_ends.all? { |piece| @scanner.scan(piece) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
# Removes the configured strip character(s) from both ends of +value+.
#
# Returns +value+ unchanged when stripping is disabled and nil for a nil
# +value+. With a String :strip, a new String is returned; otherwise
# +value+ is stripped in place with String#strip!.
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  if @strip.is_a?(String)
    # Drop one character per iteration from the front, then from the back.
    value = value[1..-1] while value.start_with?(@strip)
    value = value[0...-1] while value.end_with?(@strip)
  else
    value.strip!
  end
  value
end
# Skips the rest of the current line, including its line ending, and
# counts the line in @lineno.
def ignore_broken_line
  [@not_line_end, @line_end].each do |pattern|
    @scanner.scan_all(pattern)
  end
  @lineno += 1
end
# Starts a keep for the next row. When no line is recorded in @last_line,
# the scanner's previous keep is dropped first; otherwise only the
# recorded line is cleared.
def start_row
  @last_line ? (@last_line = nil) : @scanner.keep_drop
  @scanner.keep_start
end
# Counts the row, converts its fields and yields the result.
#
# While headers are in use but not known yet, +row+ becomes the headers;
# it is then only yielded (as a header row) when :return_headers is set.
# With known headers a Row is yielded, without headers the converted
# Array of fields.
def emit_row(row, &block)
  @lineno += 1
  raw_row = row

  if @use_headers && @headers.nil?
    @headers = adjust_headers(raw_row)
    return unless @return_headers
    row = Row.new(@headers, raw_row, true)
  elsif @use_headers
    row = Row.new(@headers,
                  @fields_converter.convert(raw_row, @headers, @lineno))
  else
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno)
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end
  yield(row)
end
# This method injects an instance variable <tt>unconverted_fields</tt> into
# +row+ and an accessor method for +row+ called unconverted_fields(). The
# variable is set to the contents of +fields+.
def add_unconverted_fields(row, fields)
  # The reader is defined on this one object only.
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
end
end