1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Import CSV 3.1.2 (#2547)

This commit is contained in:
Sutou Kouhei 2019-10-12 14:03:21 +09:00 committed by GitHub
parent d6e68bb263
commit 92df7d98b6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
Notes: git 2019-10-12 14:03:45 +09:00
Merged-By: kou <kou@clear-code.com>
14 changed files with 582 additions and 447 deletions

2
NEWS
View file

@ -343,7 +343,7 @@ CGI::
CSV::
* Upgrade to 3.0.9.
* Upgrade to 3.1.2.
See https://github.com/ruby/csv/blob/master/NEWS.md.
Date::

View file

@ -29,7 +29,7 @@
# the original library as of Ruby 1.9. If you are migrating code from 1.8 or
# earlier, you may have to change your code to comply with the new interface.
#
# == What's Different From the Old CSV?
# == What's the Difference From the Old CSV?
#
# I'm sure I'll miss something, but I'll try to mention most of the major
# differences I am aware of, to help others quickly get up to speed:
@ -74,9 +74,9 @@
# place and that is to make using this library easier. CSV will parse all valid
# CSV.
#
# What you don't want to do is feed CSV invalid data. Because of the way the
# What you don't want to do is to feed CSV invalid data. Because of the way the
# CSV format works, it's common for a parser to need to read until the end of
# the file to be sure a field is invalid. This eats a lot of time and memory.
# the file to be sure a field is invalid. This consumes a lot of time and memory.
#
# Luckily, when working with invalid CSV, Ruby's built-in methods will almost
# always be superior in every way. For example, parsing non-quoted fields is as
@ -184,7 +184,7 @@ using CSV::MatchP if CSV.const_defined?(:MatchP)
# === CSV with headers
#
# CSV allows you to specify the column names of a CSV file, whether they are in the data, or
# provided separately. If headers specified, reading methods return an instance
# provided separately. If headers are specified, reading methods return an instance
# of CSV::Table, consisting of CSV::Row.
#
# # Headers are part of data
@ -416,6 +416,7 @@ class CSV
quote_empty: true,
}.freeze
class << self
#
# This method will return a CSV instance, just like CSV::new(), but the
# instance will be cached and returned for all future calls to this method for
@ -425,7 +426,7 @@ class CSV
# If a block is given, the instance is passed to the block and the return
# value becomes the return value of the block.
#
def self.instance(data = $stdout, **options)
def instance(data = $stdout, **options)
# create a _signature_ for this method call, data object and options
sig = [data.object_id] +
options.values_at(*DEFAULT_OPTIONS.keys.sort_by { |sym| sym.to_s })
@ -465,7 +466,7 @@ class CSV
# The <tt>:output_row_sep</tt> +option+ defaults to
# <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
#
def self.filter(input=nil, output=nil, **options)
def filter(input=nil, output=nil, **options)
# parse options for input, output, or both
in_options, out_options = Hash.new, {row_sep: $INPUT_RECORD_SEPARATOR}
options.each do |key, value|
@ -504,7 +505,7 @@ class CSV
# <tt>encoding: "UTF-32BE:UTF-8"</tt> would read UTF-32BE data from the file
# but transcode it to UTF-8 before CSV parses it.
#
def self.foreach(path, mode="r", **options, &block)
def foreach(path, mode="r", **options, &block)
return to_enum(__method__, path, mode, **options) unless block_given?
open(path, mode, **options) do |csv|
csv.each(&block)
@ -529,7 +530,7 @@ class CSV
# String to set the base Encoding for the output. CSV needs this hint if you
# plan to output non-ASCII compatible data.
#
def self.generate(str=nil, **options)
def generate(str=nil, **options)
# add a default empty String, if none was given
if str
str = StringIO.new(str)
@ -557,7 +558,7 @@ class CSV
# The <tt>:row_sep</tt> +option+ defaults to <tt>$INPUT_RECORD_SEPARATOR</tt>
# (<tt>$/</tt>) when calling this method.
#
def self.generate_line(row, **options)
def generate_line(row, **options)
options = {row_sep: $INPUT_RECORD_SEPARATOR}.merge(options)
str = +""
if options[:encoding]
@ -631,7 +632,7 @@ class CSV
# * truncate()
# * tty?()
#
def self.open(filename, mode="r", **options)
def open(filename, mode="r", **options)
# wrap a File opened with the remaining +args+ with no newline
# decorator
file_opts = {universal_newline: false}.merge(options)
@ -675,8 +676,8 @@ class CSV
# You pass your +str+ to read from, and an optional +options+ containing
# anything CSV::new() understands.
#
def self.parse(*args, **options, &block)
csv = new(*args, **options)
def parse(str, **options, &block)
csv = new(str, **options)
return csv.each(&block) if block_given?
@ -695,7 +696,7 @@ class CSV
#
# The +options+ parameter can be anything CSV::new() understands.
#
def self.parse_line(line, **options)
def parse_line(line, **options)
new(line, **options).shift
end
@ -710,13 +711,13 @@ class CSV
# <tt>encoding: "UTF-32BE:UTF-8"</tt> would read UTF-32BE data from the file
# but transcode it to UTF-8 before CSV parses it.
#
def self.read(*args, **options)
open(*args, **options) { |csv| csv.read }
def read(path, **options)
open(path, **options) { |csv| csv.read }
end
# Alias for CSV::read().
def self.readlines(*args, **options)
read(*args, **options)
def readlines(path, **options)
read(path, **options)
end
#
@ -726,10 +727,15 @@ class CSV
# converters: :numeric,
# header_converters: :symbol }.merge(options) )
#
def self.table(path, **options)
read( path, **{ headers: true,
def table(path, **options)
default_options = {
headers: true,
converters: :numeric,
header_converters: :symbol }.merge(options) )
header_converters: :symbol,
}
options = default_options.merge(options)
read(path, **options)
end
end
#
@ -853,7 +859,7 @@ class CSV
# converting. The conversion will fail
# if the data cannot be transcoded,
# leaving the header unchanged.
# <b><tt>:skip_blanks</tt></b>:: When set to a +true+ value, CSV will
# <b><tt>:skip_blanks</tt></b>:: When setting a +true+ value, CSV will
# skip over any empty rows. Note that
# this setting will not skip rows that
# contain column separators, even if
@ -863,9 +869,9 @@ class CSV
# using <tt>:skip_lines</tt>, or
# inspecting fields.compact.empty? on
# each row.
# <b><tt>:force_quotes</tt></b>:: When set to a +true+ value, CSV will
# <b><tt>:force_quotes</tt></b>:: When setting a +true+ value, CSV will
# quote all CSV fields it creates.
# <b><tt>:skip_lines</tt></b>:: When set to an object responding to
# <b><tt>:skip_lines</tt></b>:: When setting an object responding to
# <tt>match</tt>, every line matching
# it is considered a comment and ignored
# during parsing. When set to a String,
@ -874,17 +880,17 @@ class CSV
# a comment. If the passed object does
# not respond to <tt>match</tt>,
# <tt>ArgumentError</tt> is thrown.
# <b><tt>:liberal_parsing</tt></b>:: When set to a +true+ value, CSV will
# <b><tt>:liberal_parsing</tt></b>:: When setting a +true+ value, CSV will
# attempt to parse input not conformant
# with RFC 4180, such as double quotes
# in unquoted fields.
# <b><tt>:nil_value</tt></b>:: When set an object, any values of an
# empty field are replaced by the set
# empty field is replaced by the set
# object, not nil.
# <b><tt>:empty_value</tt></b>:: When set an object, any values of a
# <b><tt>:empty_value</tt></b>:: When setting an object, any values of a
# blank string field is replaced by
# the set object.
# <b><tt>:quote_empty</tt></b>:: When set to a +true+ value, CSV will
# <b><tt>:quote_empty</tt></b>:: When setting a +true+ value, CSV will
# quote empty values with double quotes.
# When +false+, CSV will emit an
# empty string for an empty field value.
@ -901,11 +907,11 @@ class CSV
# <b><tt>:write_empty_value</tt></b>:: When a <tt>String</tt> or +nil+ value,
# empty value(s) on each line will be
# replaced with the specified value.
# <b><tt>:strip</tt></b>:: When set to a +true+ value, CSV will
# <b><tt>:strip</tt></b>:: When setting a +true+ value, CSV will
# strip "\t\r\n\f\v" around the values.
# If you specify a string instead of
# +true+, CSV will strip that string. The
# length of string must be 1.
# length of the string must be 1.
#
# See CSV::DEFAULT_OPTIONS for the default settings.
#
@ -939,8 +945,12 @@ class CSV
strip: false)
raise ArgumentError.new("Cannot parse nil as CSV") if data.nil?
# create the IO object we will read from
@io = data.is_a?(String) ? StringIO.new(data) : data
if data.is_a?(String)
@io = StringIO.new(data)
@io.set_encoding(encoding || data.encoding)
else
@io = data
end
@encoding = determine_encoding(encoding, internal_encoding)
@base_fields_converter_options = {
@ -992,35 +1002,41 @@ class CSV
end
#
# The encoded <tt>:col_sep</tt> used in parsing and writing. See CSV::new
# for details.
# The encoded <tt>:col_sep</tt> used in parsing and writing.
# See CSV::new for details.
#
def col_sep
parser.column_separator
end
#
# The encoded <tt>:row_sep</tt> used in parsing and writing. See CSV::new
# for details.
# The encoded <tt>:row_sep</tt> used in parsing and writing.
# See CSV::new for details.
#
def row_sep
parser.row_separator
end
#
# The encoded <tt>:quote_char</tt> used in parsing and writing. See CSV::new
# for details.
# The encoded <tt>:quote_char</tt> used in parsing and writing.
# See CSV::new for details.
#
def quote_char
parser.quote_character
end
# The limit for field size, if any. See CSV::new for details.
#
# The limit for field size, if any.
# See CSV::new for details.
#
def field_size_limit
parser.field_size_limit
end
# The regex marking a line as a comment. See CSV::new for details
#
# The regex marking a line as a comment.
# See CSV::new for details.
#
def skip_lines
parser.skip_lines
end
@ -1036,9 +1052,10 @@ class CSV
name ? name.first : converter
end
end
#
# Returns +true+ if unconverted_fields() to parsed results. See CSV::new
# for details.
# Returns +true+ if unconverted_fields() will be added to parsed results.
# See CSV::new for details.
#
def unconverted_fields?
parser.unconverted_fields?
@ -1046,8 +1063,8 @@ class CSV
#
# Returns +nil+ if headers will not be used, +true+ if they will but have not
# yet been read, or the actual headers after they have been read. See
# CSV::new for details.
# yet been read, or the actual headers after they have been read.
# See CSV::new for details.
#
def headers
if @writer
@ -1068,7 +1085,10 @@ class CSV
parser.return_headers?
end
# Returns +true+ if headers are written in output. See CSV::new for details.
#
# Returns +true+ if headers are written in output.
# See CSV::new for details.
#
def write_headers?
@writer_options[:write_headers]
end

View file

@ -1,8 +1,14 @@
# frozen_string_literal: true
class CSV
# Note: Don't use this class directly. This is an internal class.
class FieldsConverter
include Enumerable
#
# A CSV::FieldsConverter is a data structure for storing the
# fields converter properties to be passed as a parameter
# when parsing a new file (e.g. CSV::Parser.new(@io, parser_options))
#
def initialize(options={})
@converters = []

View file

@ -11,10 +11,31 @@ using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
using CSV::MatchP if CSV.const_defined?(:MatchP)
class CSV
# Note: Don't use this class directly. This is an internal class.
class Parser
#
# A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
# or String object being read from or written to. Your data is never transcoded
# (unless you ask Ruby to transcode it for you) and will literally be parsed in
# the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
# Encoding of your data. This is accomplished by transcoding the parser itself
# into your Encoding.
#
# Raised when encoding is invalid.
class InvalidEncoding < StandardError
end
#
# CSV::Scanner receives a CSV output, scans it and returns the content.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
#
# Uses StringScanner (the official strscan gem). Strscan provides lexical
# scanning operations on a String. We inherit from it and take advantage
# of its methods. For more information, please visit:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
#
class Scanner < StringScanner
alias_method :scan_all, :scan
@ -38,7 +59,7 @@ class CSV
def keep_end
start = @keeps.pop
string[start, pos - start]
string.byteslice(start, pos - start)
end
def keep_back
@ -50,6 +71,18 @@ class CSV
end
end
#
# CSV::InputsScanner receives IO inputs, encoding and the chunk_size.
# It also controls the life cycle of the object with its methods +keep_start+,
# +keep_end+, +keep_back+, +keep_drop+.
#
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
# If there's a match, the scanner advances the “scan pointer” and returns the matched string.
# Otherwise, the scanner returns nil.
#
# CSV::InputsScanner.rest() returns the “rest” of the string (i.e. everything after the scan pointer).
# If there is no more data (eos? = true), it returns "".
#
class InputsScanner
def initialize(inputs, encoding, chunk_size: 8192)
@inputs = inputs.dup
@ -137,7 +170,7 @@ class CSV
def keep_end
start, buffer = @keeps.pop
keep = @scanner.string[start, @scanner.pos - start]
keep = @scanner.string.byteslice(start, @scanner.pos - start)
if buffer
buffer << keep
keep = buffer
@ -192,7 +225,7 @@ class CSV
input = @inputs.first
case input
when StringIO
string = input.string
string = input.read
raise InvalidEncoding unless string.valid_encoding?
@scanner = StringScanner.new(string)
@inputs.shift
@ -319,6 +352,7 @@ class CSV
end
private
# A set of tasks to prepare the file in order to parse it
def prepare
prepare_variable
prepare_quote_character
@ -447,7 +481,13 @@ class CSV
end
def prepare_separators
@column_separator = @options[:column_separator].to_s.encode(@encoding)
column_separator = @options[:column_separator]
@column_separator = column_separator.to_s.encode(@encoding)
if @column_separator.size < 1
message = ":col_sep must be 1 or more characters: "
message += column_separator.inspect
raise ArgumentError, message
end
@row_separator =
resolve_row_separator(@options[:row_separator]).encode(@encoding)
@ -534,7 +574,9 @@ class CSV
cr = "\r".encode(@encoding)
lf = "\n".encode(@encoding)
if @input.is_a?(StringIO)
separator = detect_row_separator(@input.string, cr, lf)
pos = @input.pos
separator = detect_row_separator(@input.read, cr, lf)
@input.seek(pos)
elsif @input.respond_to?(:gets)
if @input.is_a?(File)
chunk_size = 32 * 1024
@ -651,7 +693,9 @@ class CSV
return false if @quote_character.nil?
if @input.is_a?(StringIO)
sample = @input.string
pos = @input.pos
sample = @input.read
@input.seek(pos)
else
return false if @samples.empty?
sample = @samples.first
@ -684,7 +728,7 @@ class CSV
UnoptimizedStringIO.new(sample)
end
if @input.is_a?(StringIO)
inputs << UnoptimizedStringIO.new(@input.string)
inputs << UnoptimizedStringIO.new(@input.read)
else
inputs << @input
end
@ -697,7 +741,7 @@ class CSV
def build_scanner
string = nil
if @samples.empty? and @input.is_a?(StringIO)
string = @input.string
string = @input.read
elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof?
string = @samples[0]
end

View file

@ -13,13 +13,13 @@ class CSV
#
class Row
#
# Construct a new CSV::Row from +headers+ and +fields+, which are expected
# Constructs a new CSV::Row from +headers+ and +fields+, which are expected
# to be Arrays. If one Array is shorter than the other, it will be padded
# with +nil+ objects.
#
# The optional +header_row+ parameter can be set to +true+ to indicate, via
# CSV::Row.header_row?() and CSV::Row.field_row?(), that this is a header
# row. Otherwise, the row is assumes to be a field row.
# row. Otherwise, the row is assumed to be a field row.
#
# A CSV::Row object supports the following Array methods through delegation:
#
@ -209,7 +209,7 @@ class CSV
# delete( header, offset )
# delete( index )
#
# Used to remove a pair from the row by +header+ or +index+. The pair is
# Removes a pair from the row by +header+ or +index+. The pair is
# located as described in CSV::Row.field(). The deleted pair is returned,
# or +nil+ if a pair could not be found.
#
@ -367,7 +367,9 @@ class CSV
end
end
#
# A summary of fields, by header, in an ASCII compatible String.
#
def inspect
str = ["#<", self.class.to_s]
each do |header, field|

View file

@ -13,7 +13,7 @@ class CSV
#
class Table
#
# Construct a new CSV::Table from +array_of_rows+, which are expected
# Constructs a new CSV::Table from +array_of_rows+, which are expected
# to be CSV::Row objects. All rows are assumed to have the same headers.
#
# The optional +headers+ parameter can be set to Array of headers.

View file

@ -2,5 +2,5 @@
class CSV
# The version of the installed library.
VERSION = "3.1.1"
VERSION = "3.1.2"
end

View file

@ -6,7 +6,12 @@ require_relative "row"
using CSV::MatchP if CSV.const_defined?(:MatchP)
class CSV
# Note: Don't use this class directly. This is an internal class.
class Writer
#
# A CSV::Writer receives an output, prepares the header, format and output.
# It allows us to write new rows in the object and rewind it.
#
attr_reader :lineno
attr_reader :headers
@ -22,6 +27,9 @@ class CSV
@fields_converter = @options[:fields_converter]
end
#
# Adds a new row
#
def <<(row)
case row
when Row
@ -47,6 +55,9 @@ class CSV
self
end
#
# Winds back to the beginning
#
def rewind
@lineno = 0
@headers = nil if @options[:headers].nil?

View file

@ -233,11 +233,21 @@ line,5,jkl
assert_equal([["a"]], CSV.parse("a\r\n"))
end
# A StringIO whose read position has already been advanced must be parsed
# from that position, not from the start of its underlying string.
def test_seeked_string_io
  input_with_bom = StringIO.new("\ufeffあ,い,う\r\na,b,c\r\n")
  # Skip the 3-byte UTF-8 BOM so that parsing begins at the first field.
  input_with_bom.read(3)
  # The first row must carry the multibyte fields that follow the BOM.
  assert_equal([
                 ["あ", "い", "う"],
                 ["a", "b", "c"],
               ],
               CSV.new(input_with_bom).each.to_a)
end
private
def assert_parse_errors_out(*args, **options)
def assert_parse_errors_out(data, **options)
assert_raise(CSV::MalformedCSVError) do
Timeout.timeout(0.2) do
CSV.parse(*args, **options)
CSV.parse(data, **options)
fail("Parse didn't error out")
end
end

View file

@ -312,12 +312,12 @@ A
end
def test_parse_empty
assert_equal(CSV::Table.new([], **{}),
assert_equal(CSV::Table.new([]),
CSV.parse("", headers: true))
end
def test_parse_empty_line
assert_equal(CSV::Table.new([], **{}),
assert_equal(CSV::Table.new([]),
CSV.parse("\n", headers: true))
end

View file

@ -6,7 +6,7 @@ require_relative "../helper"
class TestCSVParseRewind < Test::Unit::TestCase
extend DifferentOFS
def parse(data, options={})
def parse(data, **options)
csv = CSV.new(data, **options)
records = csv.to_a
csv.rewind

View file

@ -268,11 +268,11 @@ class TestCSVEncodings < Test::Unit::TestCase
private
def assert_parses(fields, encoding, options = { })
def assert_parses(fields, encoding, **options)
encoding = Encoding.find(encoding) unless encoding.is_a? Encoding
orig_fields = fields
fields = encode_ary(fields, encoding)
data = ary_to_data(fields, options)
data = ary_to_data(fields, **options)
parsed = CSV.parse(data, **options)
assert_equal(fields, parsed)
parsed.flatten.each_with_index do |field, i|
@ -285,7 +285,9 @@ class TestCSVEncodings < Test::Unit::TestCase
end
end
begin
CSV.open(@temp_csv_path, "rb:#{encoding}:#{__ENCODING__}", **options) do |csv|
CSV.open(@temp_csv_path,
"rb:#{encoding}:#{__ENCODING__}",
**options) do |csv|
csv.each_with_index do |row, i|
assert_equal(orig_fields[i], row)
end
@ -315,7 +317,7 @@ class TestCSVEncodings < Test::Unit::TestCase
ary.map { |row| row.map { |field| field.encode(encoding) } }
end
def ary_to_data(ary, options = { })
def ary_to_data(ary, **options)
encoding = ary.flatten.first.encoding
quote_char = (options[:quote_char] || '"').encode(encoding)
col_sep = (options[:col_sep] || ",").encode(encoding)
@ -327,9 +329,9 @@ class TestCSVEncodings < Test::Unit::TestCase
}.join('').encode(encoding)
end
def encode_for_tests(data, options = { })
yield ary_to_data(encode_ary(data, "UTF-8"), options)
yield ary_to_data(encode_ary(data, "UTF-16BE"), options)
def encode_for_tests(data, **options)
yield ary_to_data(encode_ary(data, "UTF-8"), **options)
yield ary_to_data(encode_ary(data, "UTF-16BE"), **options)
end
def each_encoding

View file

@ -52,6 +52,20 @@ line,4,jkl
assert_equal([",,,", nil], CSV.parse_line(",,,;", col_sep: ";"))
end
# A nil :col_sep is rejected with an ArgumentError whose message echoes the
# offending value.
def test_col_sep_nil
  expected_message = ":col_sep must be 1 or more characters: nil"
  assert_raise_with_message(ArgumentError, expected_message) do
    CSV.parse(@sample_data, col_sep: nil)
  end
end
# An empty-string :col_sep is rejected with an ArgumentError whose message
# echoes the offending value.
def test_col_sep_empty
  expected_message = ':col_sep must be 1 or more characters: ""'
  assert_raise_with_message(ArgumentError, expected_message) do
    CSV.parse(@sample_data, col_sep: "")
  end
end
def test_row_sep
error = assert_raise(CSV::MalformedCSVError) do
CSV.parse_line("1,2,3\n,4,5\r\n", row_sep: "\r\n")
@ -110,10 +124,10 @@ line,4,jkl
def test_line
lines = [
%Q(abc,def\n),
%Q(abc,"d\nef"\n),
%Q(abc,"d\r\nef"\n),
%Q(abc,"d\ref")
%Q(\u{3000}abc,def\n),
%Q(\u{3000}abc,"d\nef"\n),
%Q(\u{3000}abc,"d\r\nef"\n),
%Q(\u{3000}abc,"d\ref")
]
csv = CSV.new(lines.join(''))
lines.each do |line|

View file

@ -205,6 +205,32 @@ module TestCSVWriteGeneral
assert_equal(%Q[あ,い,う#{$INPUT_RECORD_SEPARATOR}].encode("EUC-JP"),
generate_line(row))
end
# With Encoding.default_internal set to UTF-8, an explicit :encoding option
# must still produce the generated line in that encoding (EUC-JP here).
def test_encoding_with_default_internal
  with_default_internal(Encoding::UTF_8) do
    # The fields must match the expected line below; they are EUC-JP encoded.
    row = ["あ", "い", "う"].collect {|field| field.encode("EUC-JP")}
    assert_equal(%Q[あ,い,う#{$INPUT_RECORD_SEPARATOR}].encode("EUC-JP"),
                 generate_line(row, encoding: Encoding::EUC_JP))
  end
end
# With Encoding.default_internal set to UTF-8 and no :encoding option, the
# generated line must keep the encoding of the row's fields (EUC-JP here).
def test_with_default_internal
  with_default_internal(Encoding::UTF_8) do
    # The fields must match the expected line below; they are EUC-JP encoded.
    row = ["あ", "い", "う"].collect {|field| field.encode("EUC-JP")}
    assert_equal(%Q[あ,い,う#{$INPUT_RECORD_SEPARATOR}].encode("EUC-JP"),
                 generate_line(row))
  end
end
# Runs the given block with Encoding.default_internal temporarily set to
# +encoding+, restoring the previous value afterwards even if the block
# raises. Returns the block's result.
def with_default_internal(encoding)
  saved_encoding = Encoding.default_internal
  Encoding.default_internal = encoding
  yield
ensure
  Encoding.default_internal = saved_encoding
end
end
class TestCSVWriteGeneralGenerateLine < Test::Unit::TestCase