2018-12-23 02:00:35 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
require "strscan"
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
require_relative "delete_suffix"
|
2021-09-11 18:34:15 -04:00
|
|
|
require_relative "input_record_separator"
|
2018-12-23 02:00:35 -05:00
|
|
|
require_relative "match_p"
|
|
|
|
require_relative "row"
|
|
|
|
require_relative "table"
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
|
2018-12-23 02:00:35 -05:00
|
|
|
using CSV::MatchP if CSV.const_defined?(:MatchP)
|
|
|
|
|
|
|
|
class CSV
|
2019-10-12 01:03:21 -04:00
|
|
|
# Note: Don't use this class directly. This is an internal class.
|
2018-12-23 02:00:35 -05:00
|
|
|
class Parser
|
2019-10-12 01:03:21 -04:00
|
|
|
#
|
|
|
|
# A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
|
|
|
|
# or String object being read from or written to. Your data is never transcoded
|
|
|
|
# (unless you ask Ruby to transcode it for you) and will literally be parsed in
|
|
|
|
# the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
|
|
|
|
# Encoding of your data. This is accomplished by transcoding the parser itself
|
|
|
|
# into your Encoding.
|
|
|
|
#
|
|
|
|
|
|
|
|
# Raised when input data is not valid in its encoding.
#
# The scanners raise it while reading chunks; CSV::Parser#parse rescues it
# and re-raises a MalformedCSVError that carries the line number.
class InvalidEncoding < StandardError
end
|
|
|
|
|
2019-10-12 01:03:21 -04:00
|
|
|
#
# CSV::Parser::Scanner receives CSV text as a single String, scans it and
# returns the content.
# It also controls the life cycle of "keep" positions with its methods
# +keep_start+, +keep_end+, +keep_back+, +keep_drop+.
#
# Uses StringScanner (the official strscan gem). Strscan provides lexical
# scanning operations on a String. We inherit from it and take advantage
# of its methods. For more information, please visit:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
#
class Scanner < StringScanner
  # The whole input is already in memory, so scanning "all" is a plain scan.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of byte positions pushed by keep_start.
    @keeps = []
  end

  # Yields each remaining line (split on +row_separator+) and moves the
  # scan pointer to the end of the line before yielding it.
  def each_line(row_separator)
    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Forgets the last remembered position and returns the bytes scanned
  # since it.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Rewinds the scan pointer to the last remembered position and forgets it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Forgets the last remembered position without moving the scan pointer.
  def keep_drop
    @keeps.pop
  end
end
|
|
|
|
|
2019-10-12 01:03:21 -04:00
|
|
|
#
# CSV::Parser::InputsScanner receives IO inputs, the encoding, the row
# separator and the chunk_size. It reads the inputs chunk by chunk and
# scans one chunk at a time.
# It also controls the life cycle of "keep" positions with its methods
# +keep_start+, +keep_end+, +keep_back+, +keep_drop+.
#
# InputsScanner#scan tries to match with pattern at the current position.
# If there's a match, the scanner advances the "scan pointer" and returns
# the matched string. Otherwise, the scanner returns nil.
#
# InputsScanner#rest returns the "rest" of the current chunk (i.e.
# everything after the scan pointer).
# If there is no more data (eos? = true), it returns "".
#
class InputsScanner
  def initialize(inputs, encoding, row_separator, chunk_size: 8192)
    # Work on a copy: inputs are shifted off as they are exhausted.
    @inputs = inputs.dup
    @encoding = encoding
    @row_separator = row_separator
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Stack of [start position in current chunk, data kept from earlier chunks].
    @keeps = []
    read_chunk
  end

  # Yields each line (split on +row_separator+) across all chunks.
  # A line that is cut by a chunk boundary is buffered and completed with
  # data from the next chunk before it is yielded.
  def each_line(row_separator)
    buffer = nil
    input = @scanner.rest
    position = @scanner.pos
    # Bytes of +buffer+ that belong to a previous chunk; they must not be
    # counted when updating the position in the current chunk.
    offset = 0
    n_row_separator_chars = row_separator.size
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if buffer
          if n_row_separator_chars == 2 and
             buffer.end_with?(row_separator[0]) and
             line.start_with?(row_separator[1])
            # A two character row separator was split over two chunks.
            buffer << line[0]
            line = line[1..-1]
            position += buffer.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(buffer)
            buffer = nil
            next if line.empty?
          else
            buffer << line
            line = buffer
            buffer = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          # Incomplete line: wait for the next chunk.
          buffer = line
        end
      end
      break unless read_chunk
      input = @scanner.rest
      position = @scanner.pos
      offset = -buffer.bytesize if buffer
    end
    yield(buffer) if buffer
  end

  # Scans +pattern+ in the current chunk only. When the match consumes the
  # chunk, the next chunk is loaded so that the following scan can proceed.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Like #scan, but keeps matching into the following chunks while each
  # chunk is consumed completely, and returns the concatenated match.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Remembers the current position.
  def keep_start
    adjust_last_keep
    @keeps.push([@scanner.pos, nil])
  end

  # Forgets the last remembered position and returns the data scanned
  # since it, including data kept from earlier chunks.
  def keep_end
    start, buffer = @keeps.pop
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Rewinds to the last remembered position and forgets it.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      # The kept data spans chunks: push the current chunk back as a new
      # input and scan the kept data again first.
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Forgets the last remembered position. Data it kept from earlier chunks
  # is handed over to the enclosing keep, if there is one.
  def keep_drop
    _, buffer = @keeps.pop
    return unless buffer

    last_keep = @keeps.last
    return unless last_keep

    if last_keep[1]
      last_keep[1] << buffer
    else
      last_keep[1] = buffer
    end
  end

  def rest
    @scanner.rest
  end

  private
  # Moves the data scanned since the innermost keep position into that
  # keep's buffer and resets its start position to 0. Called before a new
  # keep is pushed and before the current chunk is replaced.
  def adjust_last_keep
    keep = @keeps.last
    return if keep.nil?

    start = keep[0]
    return if @scanner.pos == start

    string = @scanner.string
    keep_data = string.byteslice(start, @scanner.pos - start)
    if keep_data
      keep_buffer = keep[1]
      if keep_buffer
        keep_buffer << keep_data
      else
        keep[1] = keep_data.dup
      end
    end
    keep[0] = 0
  end

  # Replaces the current scanner with one for the next chunk of input.
  # Returns true when a chunk was loaded and false when no input is left.
  # Raises InvalidEncoding when the chunk is not valid in its encoding.
  def read_chunk
    return false if @last_scanner

    adjust_last_keep

    input = @inputs.first
    case input
    when StringIO
      # In-memory input: take everything that is left at once.
      string = input.read
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      # Read up to the row separator, but at most @chunk_size.
      chunk = input.gets(@row_separator, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
|
|
|
|
|
|
|
|
# Creates a parser for +input+ configured by the +options+ Hash.
#
# @samples collects the chunks that are read from +input+ while the row
# separator is auto-detected, so that they can be parsed later.
def initialize(input, options)
  @input = input
  @options = options
  @samples = []

  prepare
end
|
|
|
|
|
|
|
|
# Returns the column separator String (the +:column_separator+ option
# encoded in the parser's encoding).
def column_separator
  @column_separator
end
|
|
|
|
|
|
|
|
# Returns the row separator String that is in use (the resolved
# +:row_separator+ option encoded in the parser's encoding).
def row_separator
  @row_separator
end
|
|
|
|
|
|
|
|
# Returns the quote character String, or nil when quoting is disabled.
def quote_character
  @quote_character
end
|
|
|
|
|
|
|
|
# Returns the +:field_size_limit+ option. When it is set, the robust
# parser raises MalformedCSVError for a field whose size is greater than
# or equal to it.
def field_size_limit
  @field_size_limit
end
|
|
|
|
|
|
|
|
# Returns the matcher for lines to skip: a String, a Regexp, another
# object that responds to +match+, or nil.
def skip_lines
  @skip_lines
end
|
|
|
|
|
|
|
|
# Returns the +:unconverted_fields+ option.
def unconverted_fields?
  @unconverted_fields
end
|
|
|
|
|
|
|
|
# Returns the converted headers, or nil when they are not known (yet).
def headers
  @headers
end
|
|
|
|
|
|
|
|
# Returns a true value when headers are in use but have not been
# determined yet.
def header_row?
  @use_headers and @headers.nil?
end
|
|
|
|
|
|
|
|
# Returns the +:return_headers+ option.
def return_headers?
  @return_headers
end
|
|
|
|
|
|
|
|
# Returns the +:skip_blanks+ option.
def skip_blanks?
  @skip_blanks
end
|
|
|
|
|
|
|
|
# Returns true when the +:liberal_parsing+ option was given, false
# otherwise.
def liberal_parsing?
  @liberal_parsing
end
|
|
|
|
|
|
|
|
# Returns the parser's current line number counter.
def lineno
  @lineno
end
|
|
|
|
|
|
|
|
# Returns the last line read by the parser (see #last_line).
def line
  last_line
end
|
|
|
|
|
|
|
|
# Parses the input and yields each row to the block.
# Returns an Enumerator when no block is given.
#
# Raises MalformedCSVError when the input contains a byte sequence that
# is invalid in its encoding.
def parse(&block)
  return to_enum(__method__) unless block_given?

  # Headers given as an option are returned first as a header row.
  if @return_headers and @headers and @raw_headers
    headers = Row.new(@headers, @raw_headers, true)
    if @unconverted_fields
      headers = add_unconverted_fields(headers, [])
    end
    yield headers
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    if @scanner
      ignore_broken_line
      lineno = @lineno
    else
      # build_scanner itself failed: nothing has been read yet.
      lineno = @lineno + 1
    end
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, lineno)
  end
end
|
|
|
|
|
|
|
|
# Returns true when the +:headers+ option enables headers, false
# otherwise.
def use_headers?
  @use_headers
end
|
|
|
|
|
|
|
|
private
|
2019-10-12 01:03:21 -04:00
|
|
|
# A set of tasks to prepare the parser in order to parse the input.
#
# The order matters: later steps read instance variables that are set by
# earlier steps (for example the strip and separator settings are
# validated only after both have been prepared).
def prepare
  prepare_variable
  prepare_quote_character
  prepare_backslash
  prepare_skip_lines
  prepare_strip
  prepare_separators
  validate_strip_and_col_sep_options
  prepare_quoted
  prepare_unquoted
  prepare_line
  prepare_header
  prepare_parser
end
|
|
|
|
|
|
|
|
# Copies the simple options into instance variables.
#
# +:liberal_parsing+ may be a true value or a Hash with the
# +:double_quote_outside_quote+ and +:backslash_quote+ keys. Liberal
# parsing always needs the robust parser.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]
  liberal_parsing = @options[:liberal_parsing]
  if liberal_parsing
    @liberal_parsing = true
    if liberal_parsing.is_a?(Hash)
      @double_quote_outside_quote =
        liberal_parsing[:double_quote_outside_quote]
      @backslash_quote = liberal_parsing[:backslash_quote]
    else
      @double_quote_outside_quote = false
      @backslash_quote = false
    end
    @need_robust_parsing = true
  else
    @liberal_parsing = false
    # Assign both flags on every path so that they are never left unset.
    @double_quote_outside_quote = false
    @backslash_quote = false
  end
  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Prepares the quote character and the values derived from it.
#
# Raises ArgumentError unless +:quote_character+ is nil or converts to a
# single character String.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character.nil?
    # Quoting is disabled: clear every derived value.
    @double_quote_character = nil
    @escaped_quote_character = nil
    @escaped_quote = nil
  else
    @quote_character = @quote_character.to_s.encode(@encoding)
    if @quote_character.length != 1
      message = ":quote_char has to be nil or a single character String"
      raise ArgumentError, message
    end
    @double_quote_character = @quote_character * 2
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  end
end
|
2018-12-23 02:00:35 -05:00
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Prepares the values needed to treat a backslash as an escape for the
# quote character. Does nothing unless backslash quoting was enabled via
# the +:liberal_parsing+ option.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)

  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  if @quote_character.nil?
    @backslash_quote_character = nil
  else
    @backslash_quote_character =
      @backslash_character + @escaped_quote_character
  end
end
|
|
|
|
|
|
|
|
# Prepares the matcher for lines to skip from the +:skip_lines+ option.
#
# A String is encoded in the parser's encoding; a Regexp or nil is used as
# is. Any other object must respond to +match+, otherwise ArgumentError
# is raised.
def prepare_skip_lines
  skip_lines = @options[:skip_lines]
  case skip_lines
  when String
    @skip_lines = skip_lines.encode(@encoding)
  when Regexp, nil
    @skip_lines = skip_lines
  else
    unless skip_lines.respond_to?(:match)
      message =
        ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
      raise ArgumentError, message
    end
    @skip_lines = skip_lines
  end
end
|
|
|
|
|
|
|
|
# Prepares the values needed by the +:strip+ option.
#
# +:strip+ may be a single character String (that character is stripped)
# or another true value (space, tab, form feed and vertical tab are
# stripped). Stripping always needs the robust parser.
#
# Raises ArgumentError when +:strip+ is an empty String or a String with
# two or more characters.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  @rstrip_value = nil
  if @strip.is_a?(String)
    case @strip.length
    when 0
      raise ArgumentError, ":strip must not be an empty String"
    when 1
      # ok
    else
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip +
                                "+".encode(@encoding))
      @rstrip_value = Regexp.new(@escaped_strip +
                                 "+\\z".encode(@encoding))
    end
    @need_robust_parsing = true
  elsif @strip
    # Note: here @escaped_strip is a set of characters, not a regular
    # expression for one character.
    strip_values = " \t\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
      @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
    end
    @need_robust_parsing = true
  end
end
|
2018-12-23 02:00:35 -05:00
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Feature detection: true when StringScanner#scan accepts a String
# pattern, false when it raises TypeError for one.
begin
  StringScanner.new("x").scan("x")
rescue TypeError
  STRING_SCANNER_SCAN_ACCEPT_STRING = false
else
  STRING_SCANNER_SCAN_ACCEPT_STRING = true
end
|
|
|
|
|
|
|
|
# Prepares the column and row separators and the patterns derived from
# them.
#
# Raises ArgumentError when the column separator is empty.
def prepare_separators
  column_separator = @options[:column_separator]
  @column_separator = column_separator.to_s.encode(@encoding)
  if @column_separator.size < 1
    message = ":col_sep must be 1 or more characters: "
    message += column_separator.inspect
    raise ArgumentError, message
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    # One pattern per character of the separator.
    @column_ends = @column_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators = Regexp.new(@escaped_first_column_separator +
                                          "+".encode(@encoding))
  else
    # Use the plain String as pattern when StringScanner#scan supports it.
    if STRING_SCANNER_SCAN_ACCEPT_STRING
      @column_end = @column_separator
    else
      @column_end = Regexp.new(@escaped_column_separator)
    end
    @column_ends = nil
    @first_column_separators = nil
  end

  escaped_row_separator = Regexp.escape(@row_separator)
  @row_end = Regexp.new(escaped_row_separator)
  if @row_separator.size > 1
    # One pattern per character of the separator.
    @row_ends = @row_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
  else
    @row_ends = nil
  end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
|
|
|
|
|
[ruby/csv] Add handling for ambiguous parsing options (https://github.com/ruby/csv/pull/226)
GitHub: fix GH-225
With Ruby 3.0.2 and csv 3.2.1, the file
```ruby
require "csv"
File.open("example.tsv", "w") { |f| f.puts("foo\t\tbar") }
CSV.read("example.tsv", col_sep: "\t", strip: true)
```
produces the error
```
lib/csv/parser.rb:935:in `parse_quotable_robust': TODO: Meaningful
message in line 1. (CSV::MalformedCSVError)
```
However, the CSV in this example is not malformed; instead, ambiguous
options were provided to the parser. It is not obvious (to me) whether
the string should be parsed as
- `["foo\t\tbar"]`,
- `["foo", "bar"]`,
- `["foo", "", "bar"]`, or
- `["foo", nil, "bar"]`.
This commit adds code that raises an exception when this situation is
encountered. Specifically, it checks if the column separator either ends
with or starts with the characters that would be stripped away.
This commit also adds unit tests and updates the documentation.
https://github.com/ruby/csv/commit/cc317dd42d
2021-11-18 16:20:09 -05:00
|
|
|
# This method verifies that there are no (obvious) ambiguities with the
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
# and +strip+ were both equal to +\t+, then there would be no clear way to
# parse the input.
#
# Raises ArgumentError when the column separator starts or ends with a
# character that would be stripped.
def validate_strip_and_col_sep_options
  return unless @strip

  if @strip.is_a?(String)
    if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
      raise ArgumentError,
            "The provided strip (#{@escaped_strip}) and " \
            "col_sep (#{@escaped_column_separator}) options are incompatible."
    end
  else
    # strip: true - @escaped_strip is the set of characters to strip.
    if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
      raise ArgumentError,
            "The provided strip (true) and " \
            "col_sep (#{@escaped_column_separator}) options are incompatible."
    end
  end
end
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Prepares the patterns for quoted values and the separator used to split
# a line into columns.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    no_quoted_values = @escaped_quote_character.dup
    if @backslash_quote
      no_quoted_values << @escaped_backslash_character
    end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end
  if @escaped_strip
    # @escaped_strip is either one escaped character (strip: "x") or the
    # raw set of characters " \t\f\v" (strip: true). Wrapping it in a
    # character class handles both: without it, the set would be taken as
    # a literal sequence and the separator would never match.
    strip_characters = "[".encode(@encoding) +
                       @escaped_strip +
                       "]*".encode(@encoding)
    @split_column_separator = Regexp.new(strip_characters +
                                         @escaped_column_separator +
                                         strip_characters)
  else
    if @column_separator == " ".encode(@encoding)
      # String#split treats a " " String specially (it splits on runs of
      # whitespace), so a Regexp is needed to split on a single space.
      @split_column_separator = Regexp.new(@escaped_column_separator)
    else
      @split_column_separator = @column_separator
    end
  end
end
|
|
|
|
|
|
|
|
# Prepares the pattern for unquoted values: a run of characters that are
# not CR, LF or the first character of the column separator and, unless
# liberal parsing is enabled, not the quote character either.
# Does nothing when quoting is disabled.
def prepare_unquoted
  return if @quote_character.nil?

  no_unquoted_values = "\r\n".encode(@encoding)
  no_unquoted_values << @escaped_first_column_separator
  unless @liberal_parsing
    no_unquoted_values << @escaped_quote_character
  end
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               no_unquoted_values +
                               "]+".encode(@encoding))
end
|
|
|
|
|
2018-12-23 02:00:35 -05:00
|
|
|
# Returns the row separator to use as a String in the parser's encoding.
#
# When +separator+ is +:auto+, the separator is detected from the input:
# a StringIO is read completely and rewound; any other input that
# responds to +gets+ is read in chunks, which are collected in @samples.
# When nothing is detected, InputRecordSeparator.value is used.
def resolve_row_separator(separator)
  if separator == :auto
    cr = "\r".encode(@encoding)
    lf = "\n".encode(@encoding)
    if @input.is_a?(StringIO)
      pos = @input.pos
      separator = detect_row_separator(@input.read, cr, lf)
      @input.seek(pos)
    elsif @input.respond_to?(:gets)
      if @input.is_a?(File)
        chunk_size = 32 * 1024
      else
        chunk_size = 1024
      end
      begin
        while separator == :auto
          #
          # if we run out of data, it's probably a single line
          # (the fallback below will set the default value)
          #
          break unless sample = @input.gets(nil, chunk_size)

          # extend sample if we're unsure of the line ending
          if sample.end_with?(cr)
            sample << (@input.gets(nil, 1) || "")
          end

          @samples << sample

          separator = detect_row_separator(sample, cr, lf)
        end
      rescue IOError
        # do nothing: the fallback below will set the default
      end
    end
    separator = InputRecordSeparator.value if separator == :auto
  end
  separator.to_s.encode(@encoding)
end
|
|
|
|
|
|
|
|
# Detects the row separator used in +sample+.
#
# Returns +cr+ + +lf+, +cr+ or +lf+ depending on which line ending occurs
# first, or +:auto+ when +sample+ contains neither +cr+ nor +lf+.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  if lf_index
    # Only a CR before the first LF is relevant.
    cr_index = sample[0, lf_index].index(cr)
  else
    cr_index = sample.index(cr)
  end
  if cr_index and lf_index
    # cr_index is always less than lf_index here because CR was searched
    # only before the first LF.
    if cr_index + 1 == lf_index
      cr + lf
    else
      cr
    end
  elsif cr_index
    cr
  elsif lf_index
    lf
  else
    :auto
  end
end
|
|
|
|
|
|
|
|
# Resets the line state: no line has been read and no scanner has been
# built yet.
def prepare_line
  @lineno = 0
  @last_line = nil
  @scanner = nil
end
|
|
|
|
|
|
|
|
# Returns the last line that was read.
#
# When a scanner exists and no line is cached, the line is taken from the
# scanner's current keep (Scanner#keep_end) and cached.
def last_line
  if @scanner
    @last_line ||= @scanner.keep_end
  else
    @last_line
  end
end
|
|
|
|
|
|
|
|
# Prepares the header state from the +:headers+ and +:return_headers+
# options.
#
# +:headers+ may be an Array (used as is), a String (parsed as a CSV
# line), nil or false (no headers), or any other value (headers are
# enabled; no headers are known at this point).
def prepare_header
  @return_headers = @options[:return_headers]

  headers = @options[:headers]
  case headers
  when Array
    @raw_headers = headers
    @use_headers = true
  when String
    @raw_headers = parse_headers(headers)
    @use_headers = true
  when nil, false
    @raw_headers = nil
    @use_headers = false
  else
    @raw_headers = nil
    @use_headers = true
  end
  if @raw_headers
    @headers = adjust_headers(@raw_headers)
  else
    @headers = nil
  end
end
|
|
|
|
|
|
|
|
# Parses +row+ (headers given as a String) as one CSV line with this
# parser's column separator, row separator and quote character.
def parse_headers(row)
  CSV.parse_line(row,
                 col_sep: @column_separator,
                 row_sep: @row_separator,
                 quote_char: @quote_character)
end
|
|
|
|
|
|
|
|
# Converts +headers+ with the header fields converter and freezes every
# String in the result. Returns the converted headers.
def adjust_headers(headers)
  adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno)
  adjusted_headers.each {|h| h.freeze if h.is_a? String}
  adjusted_headers
end
|
|
|
|
|
|
|
|
# Remembers the result of #may_quoted? for the parser.
def prepare_parser
  @may_quoted = may_quoted?
end
|
|
|
|
|
|
|
|
# Returns a true value when the beginning of the input contains the quote
# character.
#
# Returns false when quoting is disabled, or when the input is not a
# StringIO and no sample was collected. Otherwise returns the index of
# the quote character in the first 128 characters of the sample, or nil
# when there is none. A StringIO input is read and rewound.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    pos = @input.pos
    sample = @input.read
    @input.seek(pos)
  else
    return false if @samples.empty?
    sample = @samples.first
  end
  sample[0, 128].index(@quote_character)
end
|
|
|
|
|
2021-12-23 20:18:18 -05:00
|
|
|
# Wraps a String in an input object that is not a StringIO, so that
# InputsScanner reads it in chunks with +gets+ instead of reading it all
# at once. Used when SCANNER_TEST is enabled.
class UnoptimizedStringIO # :nodoc:
  def initialize(string)
    @io = StringIO.new(string, "rb:#{string.encoding}")
  end

  def gets(*args)
    @io.gets(*args)
  end

  def each_line(*args, &block)
    @io.each_line(*args, &block)
  end

  def eof?
    @io.eof?
  end
end
|
2018-12-23 02:00:35 -05:00
|
|
|
|
2021-12-23 20:18:18 -05:00
|
|
|
# Set the CSV_PARSER_SCANNER_TEST environment variable to "yes" to always
# parse with InputsScanner and a small chunk size (1 by default, or
# CSV_PARSER_SCANNER_TEST_CHUNK_SIZE).
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # Builds an InputsScanner over the collected samples and the input.
  # Strings are wrapped in UnoptimizedStringIO so that they are read in
  # chunks too.
  def build_scanner
    inputs = @samples.collect do |sample|
      UnoptimizedStringIO.new(sample)
    end
    if @input.is_a?(StringIO)
      inputs << UnoptimizedStringIO.new(@input.read)
    else
      inputs << @input
    end
    chunk_size =
      Integer((ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"), 10)
    InputsScanner.new(inputs,
                      @encoding,
                      @row_separator,
                      chunk_size: chunk_size)
  end
else
  # Builds the scanner for the input.
  #
  # When the whole input is available as one String (a StringIO with no
  # samples taken, or a single sample that exhausted the input), a Scanner
  # over that String is returned. Otherwise an InputsScanner over the
  # samples followed by the input is returned.
  #
  # Raises MalformedCSVError when the String has a line that is invalid in
  # its encoding.
  def build_scanner
    string = nil
    if @samples.empty? and @input.is_a?(StringIO)
      string = @input.read
    elsif @samples.size == 1 and
          @input != ARGF and
          @input.respond_to?(:eof?) and
          @input.eof?
      string = @samples[0]
    end
    if string
      unless string.valid_encoding?
        # Find the first broken line to report its line number.
        index = string.lines(@row_separator).index do |line|
          !line.valid_encoding?
        end
        if index
          message = "Invalid byte sequence in #{@encoding}"
          raise MalformedCSVError.new(message, @lineno + index + 1)
        end
      end
      Scanner.new(string)
    else
      inputs = @samples.collect do |sample|
        StringIO.new(sample)
      end
      inputs << @input
      InputsScanner.new(inputs, @encoding, @row_separator)
    end
  end
end
|
|
|
|
|
|
|
|
# Consumes the lines at the current position that match +:skip_lines+ and
# counts them in @lineno. The scanner is left at the start of the first
# line that is not skipped. Does nothing when +:skip_lines+ is not set.
def skip_needless_lines
  return unless @skip_lines

  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    if skip_line?(line)
      @lineno += 1
      @scanner.keep_drop
    else
      # Not a line to skip: rewind to where it started.
      @scanner.keep_back
      return
    end
  end
end
|
|
|
|
|
|
|
|
# Returns a true value when +line+ has to be skipped according to
# +:skip_lines+. The row separator at the end of +line+ is ignored.
#
# A String matches when the line includes it; a Regexp or any other
# matcher is asked whether it matches the line.
def skip_line?(line)
  line = line.delete_suffix(@row_separator)
  case @skip_lines
  when String
    line.include?(@skip_lines)
  when Regexp
    @skip_lines.match?(line)
  else
    @skip_lines.match(line)
  end
end
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Parses input without quoting: each line is split at the column
# separator. Empty fields become nil. Lines matching +:skip_lines+ are
# skipped, and so are empty lines when +:skip_blanks+ is set; other empty
# lines produce an empty row.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |line|
    next if @skip_lines and skip_line?(line)
    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      next if @skip_blanks
      row = []
    else
      line = strip_value(line)
      # The -1 limit keeps trailing empty fields.
      row = line.split(@split_column_separator, -1)
      n_columns = row.size
      i = 0
      while i < n_columns
        row[i] = nil if row[i].empty?
        i += 1
      end
    end
    @last_line = original_line
    emit_row(row, &block)
  end
end
|
|
|
|
|
|
|
|
# Fast parser for input that may contain simple quoted fields: each line
# is split at the column separator and a field that is enclosed in one
# pair of quotes is unquoted.
#
# As soon as a line needs more than that (it contains a CR or LF besides
# the row separator, or a field with any other use of the quote
# character), the scanner is rewound to the start of that line and the
# rest of the input is parsed by #parse_quotable_robust.
def parse_quotable_loose(&block)
  # The keep always marks the start of the line being processed.
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |line|
    if @skip_lines and skip_line?(line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
    elsif line.include?(@cr) or line.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      # The -1 limit keeps trailing empty fields.
      row = line.split(@split_column_separator, -1)
      n_columns = row.size
      i = 0
      while i < n_columns
        column = row[i]
        if column.empty?
          row[i] = nil
        else
          n_quotes = column.count(@quote_character)
          if n_quotes.zero?
            # no quote
          elsif n_quotes == 2 and
                column.start_with?(@quote_character) and
                column.end_with?(@quote_character)
            row[i] = column[1..-2]
          else
            @scanner.keep_back
            @need_robust_parsing = true
            return parse_quotable_robust(&block)
          end
        end
        i += 1
      end
    end
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = original_line
    emit_row(row, &block)
  end
  @scanner.keep_drop
end
|
|
|
|
|
|
|
|
# Field-by-field parser used when line-based splitting is not enough.
#
# Repeatedly reads one column value (+parse_column_value+) and then
# decides what follows it:
# * a column end -> the value is appended to the current row;
# * a row end    -> the row is emitted (a completely empty row is
#   emitted as +[]+ unless +@skip_blanks+ is set) and a new row starts;
# * end of input -> a pending row, if any, is emitted and parsing stops;
# * anything else -> the rest of the line is discarded through
#   +ignore_broken_line+ and MalformedCSVError is raised.
#
# MalformedCSVError is also raised when +@field_size_limit+ is set and a
# value's size is greater than or equal to it.
def parse_quotable_robust(&block)
  fields = []
  skip_needless_lines
  start_row
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      if @field_size_limit && value.size >= @field_size_limit
        ignore_broken_line
        raise MalformedCSVError.new("Field size exceeded", @lineno)
      end
    end

    if parse_column_end
      fields << value
      next
    end

    if parse_row_end
      if fields.empty? && value.nil?
        emit_row([], &block) unless @skip_blanks
      else
        fields << value
        emit_row(fields, &block)
        fields = []
      end
      skip_needless_lines
      start_row
      next
    end

    if @scanner.eos?
      unless fields.empty? && value.nil?
        fields << value
        emit_row(fields, &block)
      end
      break
    end

    # Neither a separator nor the end of input follows the value: work
    # out why. The checks run in this exact order because the two
    # @line_end scans consume input when they match.
    message =
      if @quoted_column_value
        "Any value after quoted field isn't allowed"
      elsif @unquoted_column_value &&
            (new_line = @scanner.scan(@line_end))
        "Unquoted fields do not allow new line " +
          "<#{new_line.inspect}>"
      elsif @scanner.rest.start_with?(@quote_character)
        "Illegal quoting"
      elsif (new_line = @scanner.scan(@line_end))
        "New line must be <#{@row_separator.inspect}> " +
          "not <#{new_line.inspect}>"
      else
        "TODO: Meaningful message"
      end
    ignore_broken_line
    raise MalformedCSVError.new(message, @lineno)
  end
end
|
|
|
|
|
2018-12-23 02:00:35 -05:00
|
|
|
# Reads the next column value and returns it, or +nil+ when neither the
# quoted nor the unquoted reader produced a value.
#
# Without +@liberal_parsing+ the two readers are simply tried in turn;
# +@may_quoted+ decides which one goes first.
#
# With +@liberal_parsing+ a quoted value may be followed directly by
# unquoted text. In that case the result is rebuilt as
# <quote>quoted<quote>unquoted. When +@double_quote_outside_quote+ is
# set, doubled quote characters in the unquoted part are collapsed to
# one, and an empty quoted part yields <quote>unquoted.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted

    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  @scanner.scan_all(@strip_value) if @strip_value
  trailing = parse_unquoted_column_value
  return quoted unless trailing

  if @double_quote_outside_quote
    trailing = trailing.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing if quoted.empty?
  end
  @quote_character + quoted + @quote_character + trailing
end
|
|
|
|
|
|
|
|
# Reads an unquoted column value from the scanner.
#
# Returns +nil+ (leaving +@unquoted_column_value+ untouched) when
# nothing matches +@unquoted_value+; otherwise sets
# +@unquoted_column_value+ and returns the value.
#
# When +@first_column_separators+ is set, text matching it is folded
# into the value for as long as the scanner is not positioned at a
# complete sequence of +@column_ends+. That check runs between
# +keep_start+ and +keep_back+.
# NOTE(review): keep_back presumably undoes the scans made by the
# check -- confirm against the scanner classes.
#
# Finally, backslash-escaped quotes are replaced by the quote character
# when +@backslash_quote+ is set, and text matching +@rstrip_value+ is
# removed when that pattern is set.
def parse_unquoted_column_value
  value = @scanner.scan_all(@unquoted_value)
  return nil unless value

  @unquoted_column_value = true
  if @first_column_separators
    while true
      @scanner.keep_start
      at_column_end = @column_ends.all? { |part| @scanner.scan(part) }
      @scanner.keep_back
      break if at_column_end

      separator_part = @scanner.scan_all(@first_column_separators)
      break if separator_part.nil?

      value << separator_part
      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?

      value << continuation
    end
  end

  value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  value.gsub!(@rstrip_value, "") if @rstrip_value
  value
end
|
|
|
|
|
|
|
|
# Reads a quoted column value from the scanner.
#
# Returns +nil+ when the scanner is not positioned on a run of quote
# characters; otherwise sets +@quoted_column_value+ and returns the
# unescaped content.
#
# A run of N quote characters is interpreted by parity:
# * an even opening run is a complete value on its own: one pair opens
#   and closes the field, and every further pair adds one quote
#   character;
# * an odd opening run opens the field, and each remaining pair
#   contributes one quote character;
# * inside the field, an even run is only escaped quotes, while an odd
#   run ends the field after contributing its escaped quotes.
#
# When +@backslash_quote+ is set, a match of +@escaped_backslash+
# followed by a match of +@escaped_quote+ adds a quote character;
# without the second match it adds +@backslash_character+.
#
# Raises MalformedCSVError ("Unclosed quoted field") when no closing
# quote run is found, after calling +ignore_broken_line+.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  opening_size = opening.size
  return opening[0, (opening_size - 2) / 2] if opening_size.even?

  value = opening[0, (opening_size - 1) / 2]
  while true
    chunk = @scanner.scan_all(@quoted_value)
    value << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      value << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    closing = @scanner.scan_all(@quotes)
    unless closing
      ignore_broken_line
      raise MalformedCSVError.new("Unclosed quoted field", @lineno)
    end

    closing_size = closing.size
    # Integer division: for an odd run this equals (size - 1) / 2, so a
    # single closing quote contributes nothing.
    value << closing[0, closing_size / 2]
    break if closing_size.odd?
  end
  value
end
|
|
|
|
|
|
|
|
# Tries to consume a column separator at the current scanner position.
#
# Returns +true+ when +@column_end+ matches. Otherwise, when
# +@column_ends+ is set, every pattern in it must match in sequence:
# on success +keep_drop+ is called and +true+ is returned, on failure
# +keep_back+ is called and +false+ is returned. Returns +false+ when
# +@column_end+ does not match and +@column_ends+ is not set.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |part| @scanner.scan(part) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
|
|
|
|
|
|
|
|
# Tries to consume a row separator at the current scanner position.
#
# Returns +true+ when +@row_end+ matches. Otherwise, when +@row_ends+
# is set, every pattern in it must match in sequence: on failure
# +keep_back+ is called and +false+ is returned, on success
# +keep_drop+ is called and +true+ is returned. Returns +false+ when
# +@row_end+ does not match and +@row_ends+ is not set.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  unless @row_ends.all? { |part| @scanner.scan(part) }
    @scanner.keep_back
    return false
  end
  @scanner.keep_drop
  true
end
|
|
|
|
|
2019-04-14 17:01:51 -04:00
|
|
|
# Removes surrounding strip characters from +value+ according to
# +@strip+ and returns the result. The argument itself is never
# modified, so frozen strings are accepted.
#
# * +@strip+ is +nil+/+false+: +value+ is returned untouched.
# * +value+ is +nil+: +nil+ is returned.
# * +@strip+ is a String: every leading and trailing occurrence of that
#   string is removed, stepping by its length. An empty +@strip+
#   removes nothing.
# * any other truthy +@strip+: leading and trailing whitespace is
#   removed as by String#strip.
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  case @strip
  when String
    marker_size = @strip.size
    # An empty marker matches everywhere; there is nothing to strip.
    return value if marker_size.zero?

    value = value[marker_size..-1] while value.start_with?(@strip)
    value = value[0, value.size - marker_size] while value.end_with?(@strip)
    value
  else
    # Non-destructive on purpose: String#strip! would modify the
    # caller's string and raise FrozenError on frozen input.
    value.strip
  end
end
|
|
|
|
|
|
|
|
# Discards the remainder of a malformed line: consumes whatever matches
# +@not_line_end+, then whatever matches +@line_end+, and counts the
# line by incrementing +@lineno+. Returns the new line number.
def ignore_broken_line
  [@not_line_end, @line_end].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
|
|
|
|
|
2018-12-23 02:00:35 -05:00
|
|
|
# Prepares the scanner for reading a new row.
#
# When +@last_line+ is set it is cleared and the scanner's kept region
# is left alone; otherwise +keep_drop+ is called first. In both cases
# +keep_start+ is called afterwards.
def start_row
  pending_line = @last_line
  @last_line = nil if pending_line
  @scanner.keep_drop unless pending_line
  @scanner.keep_start
end
|
|
|
|
|
|
|
|
# Post-processes one parsed row and yields it to the caller's block.
#
# Always increments +@lineno+ first. Then:
# * without +@use_headers+ the fields are run through
#   +@fields_converter+ and yielded as they come back;
# * with +@use_headers+ and no headers yet, the row becomes the headers
#   (via +adjust_headers+). It is yielded as a header Row only when
#   +@return_headers+ is set; otherwise the method returns without
#   yielding;
# * with +@use_headers+ and headers known, the converted fields are
#   wrapped in a Row.
#
# When +@unconverted_fields+ is set and the resulting row does not yet
# respond to +unconverted_fields+, the raw fields are attached through
# +add_unconverted_fields+.
def emit_row(row, &block)
  @lineno += 1
  raw_row = row

  if !@use_headers
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno)
  elsif @headers.nil?
    @headers = adjust_headers(raw_row)
    return unless @return_headers

    row = Row.new(@headers, raw_row, true)
  else
    converted = @fields_converter.convert(raw_row, @headers, @lineno)
    row = Row.new(@headers, converted)
  end

  # inject unconverted fields and accessor, if requested...
  needs_raw_fields = @unconverted_fields &&
                     !row.respond_to?(:unconverted_fields)
  add_unconverted_fields(row, raw_row) if needs_raw_fields

  yield(row)
end
|
|
|
|
|
|
|
|
# Attaches the raw, unconverted +fields+ to +row+.
#
# A reader method <tt>unconverted_fields</tt> is defined on +row+'s
# singleton class only (other objects of the same class are not
# affected) and the backing instance variable
# <tt>@unconverted_fields</tt> is set to +fields+. Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
|
|
|
|
end
|
|
|
|
end
|