diff --git a/NEWS b/NEWS index 02b12f3d82..e11a07ed2c 100644 --- a/NEWS +++ b/NEWS @@ -404,6 +404,12 @@ sufficient information, see the ChangeLog file or Redmine * Coverage.line_stub, which is a simple helper function that creates the "stub" of line coverage from a given source code. +[CSV] + + * Upgrade to 3.0.2. This includes performance improvements, especially + for writing. Writing is about 2 times faster. + https://github.com/ruby/csv/blob/master/NEWS.md + [ERB] [New options] diff --git a/lib/csv.rb b/lib/csv.rb index dca2a45b6a..ebdc6e5c6d 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -93,36 +93,22 @@ require "forwardable" require "English" require "date" require "stringio" -require_relative "csv/table" + +require_relative "csv/fields_converter" +require_relative "csv/match_p" +require_relative "csv/parser" require_relative "csv/row" +require_relative "csv/table" +require_relative "csv/writer" -# This provides String#match? and Regexp#match? for Ruby 2.3. -unless String.method_defined?(:match?) - class CSV - module MatchP - refine String do - def match?(pattern) - self =~ pattern - end - end - - refine Regexp do - def match?(string) - self =~ string - end - end - end - end - - using CSV::MatchP -end +using CSV::MatchP if CSV.const_defined?(:MatchP) # # This class provides a complete interface to CSV files and data. It offers # tools to enable you to read and write to and from Strings or IO objects, as # needed.
# -# The most generic interface of a class is: +# The most generic interface of the library is: # # csv = CSV.new(string_or_io, **options) # @@ -204,18 +190,18 @@ end # # Headers are part of data # data = CSV.parse(<<~ROWS, headers: true) # Name,Department,Salary -# Bob,Engeneering,1000 +# Bob,Engineering,1000 # Jane,Sales,2000 # John,Management,5000 # ROWS # # data.class #=> CSV::Table -# data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engeneering" "Salary":"1000"> -# data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engeneering", "Salary"=>"1000"} +# data.first #=> #<CSV::Row "Name":"Bob" "Department":"Engineering" "Salary":"1000"> +# data.first.to_h #=> {"Name"=>"Bob", "Department"=>"Engineering", "Salary"=>"1000"} # # # Headers provided by developer # data = CSV.parse('Bob,Engeneering,1000', headers: %i[name department salary]) -# data.first #=> #<CSV::Row name:"Bob" department:"Engeneering" salary:"1000"> +# data.first #=> #<CSV::Row name:"Bob" department:"Engineering" salary:"1000"> # # === Typed data reading # @@ -902,10 +888,24 @@ class CSV # Options cannot be overridden in the instance methods for performance reasons, # so be sure to set what you want here. # - def initialize(data, col_sep: ",", row_sep: :auto, quote_char: '"', field_size_limit: nil, - converters: nil, unconverted_fields: nil, headers: false, return_headers: false, - write_headers: nil, header_converters: nil, skip_blanks: false, force_quotes: false, - skip_lines: nil, liberal_parsing: false, internal_encoding: nil, external_encoding: nil, encoding: nil, + def initialize(data, + col_sep: ",", + row_sep: :auto, + quote_char: '"', + field_size_limit: nil, + converters: nil, + unconverted_fields: nil, + headers: false, + return_headers: false, + write_headers: nil, + header_converters: nil, + skip_blanks: false, + force_quotes: false, + skip_lines: nil, + liberal_parsing: false, + internal_encoding: nil, + external_encoding: nil, + encoding: nil, nil_value: nil, empty_value: "") raise ArgumentError.new("Cannot parse nil as CSV") if data.nil? @@ -913,64 +913,79 @@ class CSV # create the IO object we will read from @io = data.is_a?(String) ?
StringIO.new(data) : data @encoding = determine_encoding(encoding, internal_encoding) - # - # prepare for building safe regular expressions in the target encoding, - # if we can transcode the needed characters - # - @re_esc = "\\".encode(@encoding).freeze rescue "" - @re_chars = /#{%"[-\\]\\[\\.^$?*+{}()|# \r\n\t\f\v]".encode(@encoding)}/ - @unconverted_fields = unconverted_fields - # Stores header row settings and loads header converters, if needed. - @use_headers = headers - @return_headers = return_headers - @write_headers = write_headers + @base_fields_converter_options = { + nil_value: nil_value, + empty_value: empty_value, + } + @initial_converters = converters + @initial_header_converters = header_converters - # headers must be delayed until shift(), in case they need a row of content - @headers = nil + @parser_options = { + column_separator: col_sep, + row_separator: row_sep, + quote_character: quote_char, + field_size_limit: field_size_limit, + unconverted_fields: unconverted_fields, + headers: headers, + return_headers: return_headers, + skip_blanks: skip_blanks, + skip_lines: skip_lines, + liberal_parsing: liberal_parsing, + encoding: @encoding, + nil_value: nil_value, + empty_value: empty_value, + } + @parser = nil - @nil_value = nil_value - @empty_value = empty_value - @empty_value_is_empty_string = (empty_value == "") + @writer_options = { + encoding: @encoding, + force_encoding: (not encoding.nil?), + force_quotes: force_quotes, + headers: headers, + write_headers: write_headers, + column_separator: col_sep, + row_separator: row_sep, + quote_character: quote_char, + } - init_separators(col_sep, row_sep, quote_char, force_quotes) - init_parsers(skip_blanks, field_size_limit, liberal_parsing) - init_converters(converters, :@converters, :convert) - init_converters(header_converters, :@header_converters, :header_convert) - init_comments(skip_lines) - - @force_encoding = !!encoding - - # track our own lineno since IO gets confused about line-ends is CSV 
fields - @lineno = 0 - - # make sure headers have been assigned - if header_row? and [Array, String].include? @use_headers.class and @write_headers - parse_headers # won't read data for Array or String - self << @headers - end + @writer = nil + writer if @writer_options[:write_headers] end # # The encoded :col_sep used in parsing and writing. See CSV::new # for details. # - attr_reader :col_sep + def col_sep + parser.column_separator + end + # # The encoded :row_sep used in parsing and writing. See CSV::new # for details. # - attr_reader :row_sep + def row_sep + parser.row_separator + end + # # The encoded :quote_char used in parsing and writing. See CSV::new # for details. # - attr_reader :quote_char + def quote_char + parser.quote_character + end + # The limit for field size, if any. See CSV::new for details. - attr_reader :field_size_limit + def field_size_limit + parser.field_size_limit + end # The regex marking a line as a comment. See CSV::new for details - attr_reader :skip_lines + def skip_lines + parser.skip_lines + end # # Returns the current list of converters in effect. See CSV::new for details. @@ -978,7 +993,7 @@ class CSV # as is. # def converters - @converters.map do |converter| + fields_converter.map do |converter| name = Converters.rassoc(converter) name ? name.first : converter end @@ -987,42 +1002,68 @@ class CSV # Returns +true+ if unconverted_fields() to parsed results. See CSV::new # for details. # - def unconverted_fields?() @unconverted_fields end + def unconverted_fields? + parser.unconverted_fields? + end + # # Returns +nil+ if headers will not be used, +true+ if they will but have not # yet been read, or the actual headers after they have been read. See # CSV::new for details. 
# def headers - @headers || true if @use_headers + if @writer + @writer.headers + else + parsed_headers = parser.headers + return parsed_headers if parsed_headers + raw_headers = @parser_options[:headers] + raw_headers = nil if raw_headers == false + raw_headers + end end # # Returns +true+ if headers will be returned as a row of results. # See CSV::new for details. # - def return_headers?() @return_headers end + def return_headers? + parser.return_headers? + end + # Returns +true+ if headers are written in output. See CSV::new for details. - def write_headers?() @write_headers end + def write_headers? + @writer_options[:write_headers] + end + # # Returns the current list of converters in effect for headers. See CSV::new # for details. Built-in converters will be returned by name, while others # will be returned as is. # def header_converters - @header_converters.map do |converter| + header_fields_converter.map do |converter| name = HeaderConverters.rassoc(converter) name ? name.first : converter end end + # # Returns +true+ blank lines are skipped by the parser. See CSV::new # for details. # - def skip_blanks?() @skip_blanks end + def skip_blanks? + parser.skip_blanks? + end + # Returns +true+ if all output fields are quoted. See CSV::new for details. - def force_quotes?() @force_quotes end + def force_quotes? + @writer_options[:force_quotes] + end + # Returns +true+ if illegal input is handled. See CSV::new for details. - def liberal_parsing?() @liberal_parsing end + def liberal_parsing? + parser.liberal_parsing? + end # # The Encoding CSV is parsing or writing in. This will be the Encoding you @@ -1031,10 +1072,23 @@ class CSV attr_reader :encoding # - # The line number of the last row read from this file. Fields with nested + # The line number of the last row read from this file. Fields with nested # line-end characters will not affect this count. 
# - attr_reader :lineno, :line + def lineno + if @writer + @writer.lineno + else + parser.lineno + end + end + + # + # The last row read from this file. + # + def line + parser.line + end ### IO and StringIO Delegation ### @@ -1048,9 +1102,9 @@ class CSV # Rewinds the underlying IO object and resets CSV's lineno() counter. def rewind - @headers = nil - @lineno = 0 - + @parser = nil + @parser_enumerator = nil + @writer.rewind if @writer @io.rewind end @@ -1064,34 +1118,8 @@ class CSV # The data source must be open for writing. # def <<(row) - # make sure headers have been assigned - if header_row? and [Array, String].include? @use_headers.class and !@write_headers - parse_headers # won't read data for Array or String - end - - # handle CSV::Row objects and Hashes - row = case row - when self.class::Row then row.fields - when Hash then @headers.map { |header| row[header] } - else row - end - - @headers = row if header_row? - @lineno += 1 - - output = row.map(&@quote).join(@col_sep) + @row_sep # quote and separate - if @io.is_a?(StringIO) and - output.encoding != (encoding = raw_encoding) - if @force_encoding - output = output.encode(encoding) - elsif (compatible_encoding = Encoding.compatible?(@io.string, output)) - @io.set_encoding(compatible_encoding) - @io.seek(0, IO::SEEK_END) - end - end - @io << output - - self # for chaining + writer << row + self end alias_method :add_row, :<< alias_method :puts, :<< @@ -1112,7 +1140,7 @@ class CSV # converted field or the field itself. # def convert(name = nil, &converter) - add_converter(:@converters, self.class::Converters, name, &converter) + fields_converter.add_converter(name, &converter) end # @@ -1127,10 +1155,7 @@ class CSV # effect. 
# def header_convert(name = nil, &converter) - add_converter( :@header_converters, - self.class::HeaderConverters, - name, - &converter ) + header_fields_converter.add_converter(name, &converter) end include Enumerable @@ -1142,14 +1167,8 @@ class CSV # # The data source must be open for reading. # - def each - if block_given? - while row = shift - yield row - end - else - to_enum - end + def each(&block) + parser.parse(&block) end # @@ -1159,8 +1178,9 @@ class CSV # def read rows = to_a - if @use_headers - Table.new(rows) + headers = parser.headers + if headers + Table.new(rows, headers: headers) else rows end @@ -1169,7 +1189,7 @@ class CSV # Returns +true+ if the next row read will be a header row. def header_row? - @use_headers and @headers.nil? + parser.header_row? end # @@ -1180,171 +1200,11 @@ class CSV # The data source must be open for reading. # def shift - ######################################################################### - ### This method is purposefully kept a bit long as simple conditional ### - ### checks are faster than numerous (expensive) method calls. ### - ######################################################################### - - # handle headers not based on document content - if header_row? and @return_headers and - [Array, String].include? @use_headers.class - if @unconverted_fields - return add_unconverted_fields(parse_headers, Array.new) - else - return parse_headers - end - end - - # - # it can take multiple calls to @io.gets() to get a full line, - # because of \r and/or \n characters embedded in quoted fields - # - in_extended_col = false - csv = Array.new - - loop do - # add another read to the line - unless parse = @io.gets(@row_sep) - return nil - end - - if in_extended_col - @line.concat(parse) - else - @line = parse.clone - end - - begin - parse.sub!(@parsers[:line_end], "") - rescue ArgumentError - unless parse.valid_encoding? 
- message = "Invalid byte sequence in #{parse.encoding}" - raise MalformedCSVError.new(message, lineno + 1) - end - raise - end - - if csv.empty? - # - # I believe a blank line should be an Array.new, not Ruby 1.8 - # CSV's [nil] - # - if parse.empty? - @lineno += 1 - if @skip_blanks - next - elsif @unconverted_fields - return add_unconverted_fields(Array.new, Array.new) - elsif @use_headers - return self.class::Row.new(@headers, Array.new) - else - return Array.new - end - end - end - - next if @skip_lines and @skip_lines.match parse - - parts = parse.split(@col_sep_split_separator, -1) - if parts.empty? - if in_extended_col - csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop - else - csv << nil - end - end - - # This loop is the hot path of csv parsing. Some things may be non-dry - # for a reason. Make sure to benchmark when refactoring. - parts.each do |part| - if in_extended_col - # If we are continuing a previous column - if part.end_with?(@quote_char) && part.count(@quote_char) % 2 != 0 - # extended column ends - csv.last << part[0..-2] - if csv.last.match?(@parsers[:stray_quote]) - raise MalformedCSVError.new("Missing or stray quote", - lineno + 1) - end - csv.last.gsub!(@double_quote_char, @quote_char) - in_extended_col = false - else - csv.last << part << @col_sep - end - elsif part.start_with?(@quote_char) - # If we are starting a new quoted column - if part.count(@quote_char) % 2 != 0 - # start an extended column - csv << (part[1..-1] << @col_sep) - in_extended_col = true - elsif part.end_with?(@quote_char) - # regular quoted column - csv << part[1..-2] - if csv.last.match?(@parsers[:stray_quote]) - raise MalformedCSVError.new("Missing or stray quote", - lineno + 1) - end - csv.last.gsub!(@double_quote_char, @quote_char) - elsif @liberal_parsing - csv << part - else - raise MalformedCSVError.new("Missing or stray quote", - lineno + 1) - end - elsif part.match?(@parsers[:quote_or_nl]) - # Unquoted field with bad characters. 
- if part.match?(@parsers[:nl_or_lf]) - message = "Unquoted fields do not allow \\r or \\n" - raise MalformedCSVError.new(message, lineno + 1) - else - if @liberal_parsing - csv << part - else - raise MalformedCSVError.new("Illegal quoting", lineno + 1) - end - end - else - # Regular ole unquoted field. - csv << (part.empty? ? nil : part) - end - end - - # Replace tacked on @col_sep with @row_sep if we are still in an extended - # column. - csv[-1][-1] = @row_sep if in_extended_col - - if in_extended_col - # if we're at eof?(), a quoted field wasn't closed... - if @io.eof? - raise MalformedCSVError.new("Unclosed quoted field", - lineno + 1) - elsif @field_size_limit and csv.last.size >= @field_size_limit - raise MalformedCSVError.new("Field size exceeded", - lineno + 1) - end - # otherwise, we need to loop and pull some more data to complete the row - else - @lineno += 1 - - # save fields unconverted fields, if needed... - unconverted = csv.dup if @unconverted_fields - - if @use_headers - # parse out header rows and handle CSV::Row conversions... - csv = parse_headers(csv) - else - # convert fields, if needed... - csv = convert_fields(csv) - end - - # inject unconverted fields and accessor, if requested... - if @unconverted_fields and not csv.respond_to? 
:unconverted_fields - add_unconverted_fields(csv, unconverted) - end - - # return the results - break csv - end + @parser_enumerator ||= parser.parse + begin + @parser_enumerator.next + rescue StopIteration + nil end end alias_method :gets, :shift @@ -1369,15 +1229,19 @@ class CSV # show encoding str << " encoding:" << @encoding.name # show other attributes - %w[ lineno col_sep row_sep - quote_char skip_blanks liberal_parsing ].each do |attr_name| - if a = instance_variable_get("@#{attr_name}") + ["lineno", "col_sep", "row_sep", "quote_char"].each do |attr_name| + if a = __send__(attr_name) str << " " << attr_name << ":" << a.inspect end end - if @use_headers - str << " headers:" << headers.inspect + ["skip_blanks", "liberal_parsing"].each do |attr_name| + if a = __send__("#{attr_name}?") + str << " " << attr_name << ":" << a.inspect + end end + _headers = headers + _headers = headers + str << " headers:" << _headers.inspect if _headers str << ">" begin str.join('') @@ -1393,7 +1257,7 @@ class CSV def determine_encoding(encoding, internal_encoding) # honor the IO encoding if we can, otherwise default to ASCII-8BIT - io_encoding = raw_encoding(nil) + io_encoding = raw_encoding return io_encoding if io_encoding return Encoding.find(internal_encoding) if internal_encoding @@ -1406,216 +1270,17 @@ class CSV Encoding.default_internal || Encoding.default_external end - # - # Stores the indicated separators for later use. - # - # If auto-discovery was requested for @row_sep, this method will read - # ahead in the @io and try to find one. +ARGF+, +STDIN+, +STDOUT+, - # +STDERR+ and any stream open for output only with a default - # @row_sep of $INPUT_RECORD_SEPARATOR ($/). - # - # This method also establishes the quoting rules used for CSV output. 
- # - def init_separators(col_sep, row_sep, quote_char, force_quotes) - # store the selected separators - @col_sep = col_sep.to_s.encode(@encoding) - if @col_sep == " " - @col_sep_split_separator = Regexp.new(/#{Regexp.escape(@col_sep)}/) - else - @col_sep_split_separator = @col_sep + def normalize_converters(converters) + converters ||= [] + unless converters.is_a?(Array) + converters = [converters] end - @row_sep = row_sep # encode after resolving :auto - @quote_char = quote_char.to_s.encode(@encoding) - @double_quote_char = @quote_char * 2 - - if @quote_char.length != 1 - raise ArgumentError, ":quote_char has to be a single character String" - end - - # - # automatically discover row separator when requested - # (not fully encoding safe) - # - if @row_sep == :auto - if [ARGF, STDIN, STDOUT, STDERR].include?(@io) or - (defined?(Zlib) and @io.class == Zlib::GzipWriter) - @row_sep = $INPUT_RECORD_SEPARATOR - else - begin - # - # remember where we were (pos() will raise an exception if @io is pipe - # or not opened for reading) - # - saved_pos = @io.pos - while @row_sep == :auto - # - # if we run out of data, it's probably a single line - # (ensure will set default value) - # - break unless sample = @io.gets(nil, 1024) - - cr = encode_str("\r") - lf = encode_str("\n") - # extend sample if we're unsure of the line ending - if sample.end_with?(cr) - sample << (@io.gets(nil, 1) || "") - end - - # try to find a standard separator - sample.each_char.each_cons(2) do |char, next_char| - case char - when cr - if next_char == lf - @row_sep = encode_str("\r\n") - else - @row_sep = cr - end - break - when lf - @row_sep = lf - break - end - end - end - - # tricky seek() clone to work around GzipReader's lack of seek() - @io.rewind - # reset back to the remembered position - while saved_pos > 1024 # avoid loading a lot of data into memory - @io.read(1024) - saved_pos -= 1024 - end - @io.read(saved_pos) if saved_pos.nonzero? 
- rescue IOError # not opened for reading - # do nothing: ensure will set default - rescue NoMethodError # Zlib::GzipWriter doesn't have some IO methods - # do nothing: ensure will set default - rescue SystemCallError # pipe - # do nothing: ensure will set default - ensure - # - # set default if we failed to detect - # (stream not opened for reading, a pipe, or a single line of data) - # - @row_sep = $INPUT_RECORD_SEPARATOR if @row_sep == :auto - end - end - end - @row_sep = @row_sep.to_s.encode(@encoding) - - # establish quoting rules - @force_quotes = force_quotes - do_quote = lambda do |field| - field = String(field) - encoded_quote = @quote_char.encode(field.encoding) - encoded_quote + field.gsub(encoded_quote, encoded_quote * 2) + encoded_quote - end - quotable_chars = encode_str("\r\n", @col_sep, @quote_char) - @quote = if @force_quotes - do_quote - else - lambda do |field| - if field.nil? # represent +nil+ fields as empty unquoted fields - "" - else - field = String(field) # Stringify fields - # represent empty fields as empty quoted fields - if field.empty? or - field.count(quotable_chars).nonzero? - do_quote.call(field) - else - field # unquoted field - end - end - end - end - end - - # Pre-compiles parsers and stores them by name for access during reads. 
- def init_parsers(skip_blanks, field_size_limit, liberal_parsing) - # store the parser behaviors - @skip_blanks = skip_blanks - @field_size_limit = field_size_limit - @liberal_parsing = liberal_parsing - - # prebuild Regexps for faster parsing - esc_row_sep = escape_re(@row_sep) - esc_quote = escape_re(@quote_char) - @parsers = { - # for detecting parse errors - quote_or_nl: encode_re("[", esc_quote, "\r\n]"), - nl_or_lf: encode_re("[\r\n]"), - stray_quote: encode_re( "[^", esc_quote, "]", esc_quote, - "[^", esc_quote, "]" ), - # safer than chomp!() - line_end: encode_re(esc_row_sep, "\\z"), - # illegal unquoted characters - return_newline: encode_str("\r\n") - } - end - - # - # Loads any converters requested during construction. - # - # If +field_name+ is set :converters (the default) field converters - # are set. When +field_name+ is :header_converters header converters - # are added instead. - # - # The :unconverted_fields option is also activated for - # :converters calls, if requested. - # - def init_converters(converters, ivar_name, convert_method) - converters = case converters - when nil then [] - when Array then converters - else [converters] - end - instance_variable_set(ivar_name, []) - convert = method(convert_method) - - # load converters - converters.each do |converter| - if converter.is_a? Proc # custom code block - convert.call(&converter) - else # by name - convert.call(converter) - end - end - end - - # Stores the pattern of comments to skip from the provided options. - # - # The pattern must respond to +.match+, else ArgumentError is raised. - # Strings are converted to a Regexp. - # - # See also CSV.new - def init_comments(skip_lines) - @skip_lines = skip_lines - @skip_lines = Regexp.new(Regexp.escape(@skip_lines)) if @skip_lines.is_a? 
String - if @skip_lines and not @skip_lines.respond_to?(:match) - raise ArgumentError, ":skip_lines has to respond to matches" - end - end - # - # The actual work method for adding converters, used by both CSV.convert() and - # CSV.header_convert(). - # - # This method requires the +var_name+ of the instance variable to place the - # converters in, the +const+ Hash to lookup named converters in, and the - # normal parameters of the CSV.convert() and CSV.header_convert() methods. - # - def add_converter(var_name, const, name = nil, &converter) - if name.nil? # custom converter - instance_variable_get(var_name) << converter - else # named converter - combo = const[name] - case combo - when Array # combo converter - combo.each do |converter_name| - add_converter(var_name, const, converter_name) - end - else # individual named converter - instance_variable_get(var_name) << combo + converters.collect do |converter| + case converter + when Proc # custom code block + [nil, converter] + else # by name + [converter, nil] end end end @@ -1629,132 +1294,74 @@ class CSV # def convert_fields(fields, headers = false) if headers - converters = @header_converters + header_fields_converter.convert(fields, nil, 0) else - converters = @converters - if !@use_headers and - converters.empty? and - @nil_value.nil? and - @empty_value_is_empty_string - return fields - end - end - - fields.map.with_index do |field, index| - if field.nil? - field = @nil_value - elsif field.empty? - field = @empty_value unless @empty_value_is_empty_string - end - converters.each do |converter| - break if headers && field.nil? - field = if converter.arity == 1 # straight field converter - converter[field] - else # FieldInfo converter - header = @use_headers && !headers ? @headers[index] : nil - converter[field, FieldInfo.new(index, lineno, header)] - end - break unless field.is_a? 
String # short-circuit pipeline for speed - end - field # final state of each field, converted or original + fields_converter.convert(fields, @headers, lineno) end end # - # This method is used to turn a finished +row+ into a CSV::Row. Header rows - # are also dealt with here, either by returning a CSV::Row with identical - # headers and fields (save that the fields do not go through the converters) - # or by reading past them to return a field row. Headers are also saved in - # @headers for use in future rows. + # Returns the encoding of the internal IO object. # - # When +nil+, +row+ is assumed to be a header row not based on an actual row - # of the stream. - # - def parse_headers(row = nil) - if @headers.nil? # header row - @headers = case @use_headers # save headers - # Array of headers - when Array then @use_headers - # CSV header String - when String - self.class.parse_line( @use_headers, - col_sep: @col_sep, - row_sep: @row_sep, - quote_char: @quote_char ) - # first row is headers - else row - end - - # prepare converted and unconverted copies - row = @headers if row.nil? - @headers = convert_fields(@headers, true) - @headers.each { |h| h.freeze if h.is_a? String } - - if @return_headers # return headers - return self.class::Row.new(@headers, row, true) - elsif not [Array, String].include? @use_headers.class # skip to field row - return shift - end - end - - self.class::Row.new(@headers, convert_fields(row)) # field row - end - - # - # This method injects an instance variable unconverted_fields into - # +row+ and an accessor method for +row+ called unconverted_fields(). The - # variable is set to the contents of +fields+. - # - def add_unconverted_fields(row, fields) - class << row - attr_reader :unconverted_fields - end - row.instance_variable_set(:@unconverted_fields, fields) - row - end - - # - # This method is an encoding safe version of Regexp::escape(). 
It will escape - # any characters that would change the meaning of a regular expression in the - # encoding of +str+. Regular expression characters that cannot be transcoded - # to the target encoding will be skipped and no escaping will be performed if - # a backslash cannot be transcoded. - # - def escape_re(str) - str.gsub(@re_chars) {|c| @re_esc + c} - end - - # - # Builds a regular expression in @encoding. All +chunks+ will be - # transcoded to that encoding. - # - def encode_re(*chunks) - Regexp.new(encode_str(*chunks)) - end - - # - # Builds a String in @encoding. All +chunks+ will be transcoded to - # that encoding. - # - def encode_str(*chunks) - chunks.map { |chunk| chunk.encode(@encoding.name) }.join('') - end - - # - # Returns the encoding of the internal IO object or the +default+ if the - # encoding cannot be determined. - # - def raw_encoding(default = Encoding::ASCII_8BIT) + def raw_encoding if @io.respond_to? :internal_encoding @io.internal_encoding || @io.external_encoding - elsif @io.is_a? StringIO - @io.string.encoding elsif @io.respond_to? 
:encoding @io.encoding else - default + nil end end + + def fields_converter + @fields_converter ||= build_fields_converter + end + + def build_fields_converter + specific_options = { + builtin_converters: Converters, + } + options = @base_fields_converter_options.merge(specific_options) + fields_converter = FieldsConverter.new(options) + normalize_converters(@initial_converters).each do |name, converter| + fields_converter.add_converter(name, &converter) + end + fields_converter + end + + def header_fields_converter + @header_fields_converter ||= build_header_fields_converter + end + + def build_header_fields_converter + specific_options = { + builtin_converters: HeaderConverters, + accept_nil: true, + } + options = @base_fields_converter_options.merge(specific_options) + fields_converter = FieldsConverter.new(options) + normalize_converters(@initial_header_converters).each do |name, converter| + fields_converter.add_converter(name, &converter) + end + fields_converter + end + + def parser + @parser ||= Parser.new(@io, parser_options) + end + + def parser_options + @parser_options.merge(fields_converter: fields_converter, + header_fields_converter: header_fields_converter) + end + + def writer + @writer ||= Writer.new(@io, writer_options) + end + + def writer_options + @writer_options.merge(header_fields_converter: header_fields_converter) + end end # Passes +args+ to CSV::instance. 
diff --git a/lib/csv/csv.gemspec b/lib/csv/csv.gemspec index fae5caae19..0c9d265584 100644 --- a/lib/csv/csv.gemspec +++ b/lib/csv/csv.gemspec @@ -18,12 +18,26 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/csv" spec.license = "BSD-2-Clause" - spec.files = ["lib/csv.rb", "lib/csv/table.rb", "lib/csv/core_ext/string.rb", "lib/csv/core_ext/array.rb", "lib/csv/row.rb", "lib/csv/version.rb"] - spec.files += ["README.md", "LICENSE.txt", "news.md"] + spec.files = [ + "LICENSE.txt", + "NEWS.md", + "README.md", + "lib/csv.rb", + "lib/csv/core_ext/array.rb", + "lib/csv/core_ext/string.rb", + "lib/csv/fields_converter.rb", + "lib/csv/match_p.rb", + "lib/csv/parser.rb", + "lib/csv/row.rb", + "lib/csv/table.rb", + "lib/csv/version.rb", + "lib/csv/writer.rb", + ] spec.require_paths = ["lib"] spec.required_ruby_version = ">= 2.3.0" spec.add_development_dependency "bundler" spec.add_development_dependency "rake" spec.add_development_dependency "benchmark-ips" + spec.add_development_dependency "simplecov" end diff --git a/lib/csv/fields_converter.rb b/lib/csv/fields_converter.rb new file mode 100644 index 0000000000..c2fa5798ff --- /dev/null +++ b/lib/csv/fields_converter.rb @@ -0,0 +1,78 @@ +# frozen_string_literal: true + +class CSV + class FieldsConverter + include Enumerable + + def initialize(options={}) + @converters = [] + @nil_value = options[:nil_value] + @empty_value = options[:empty_value] + @empty_value_is_empty_string = (@empty_value == "") + @accept_nil = options[:accept_nil] + @builtin_converters = options[:builtin_converters] + @need_static_convert = need_static_convert? + end + + def add_converter(name=nil, &converter) + if name.nil? 
# custom converter + @converters << converter + else # named converter + combo = @builtin_converters[name] + case combo + when Array # combo converter + combo.each do |sub_name| + add_converter(sub_name) + end + else # individual named converter + @converters << combo + end + end + end + + def each(&block) + @converters.each(&block) + end + + def empty? + @converters.empty? + end + + def convert(fields, headers, lineno) + return fields unless need_convert? + + fields.collect.with_index do |field, index| + if field.nil? + field = @nil_value + elsif field.empty? + field = @empty_value unless @empty_value_is_empty_string + end + @converters.each do |converter| + break if field.nil? and @accept_nil + if converter.arity == 1 # straight field converter + field = converter[field] + else # FieldInfo converter + if headers + header = headers[index] + else + header = nil + end + field = converter[field, FieldInfo.new(index, lineno, header)] + end + break unless field.is_a?(String) # short-circuit pipeline for speed + end + field # final state of each field, converted or original + end + end + + private + def need_static_convert? + not (@nil_value.nil? and @empty_value_is_empty_string) + end + + def need_convert? + @need_static_convert or + (not @converters.empty?) + end + end +end diff --git a/lib/csv/match_p.rb b/lib/csv/match_p.rb new file mode 100644 index 0000000000..775559a3eb --- /dev/null +++ b/lib/csv/match_p.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +# This provides String#match? and Regexp#match? for Ruby 2.3. +unless String.method_defined?(:match?) 
+ class CSV + module MatchP + refine String do + def match?(pattern) + self =~ pattern + end + end + + refine Regexp do + def match?(string) + self =~ string + end + end + end + end +end diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb new file mode 100644 index 0000000000..2682c27ea3 --- /dev/null +++ b/lib/csv/parser.rb @@ -0,0 +1,713 @@ +# frozen_string_literal: true + +require "strscan" + +require_relative "match_p" +require_relative "row" +require_relative "table" + +using CSV::MatchP if CSV.const_defined?(:MatchP) + +class CSV + class Parser + class InvalidEncoding < StandardError + end + + class Scanner < StringScanner + alias_method :scan_all, :scan + + def initialize(*args) + super + @keeps = [] + end + + def keep_start + @keeps.push(pos) + end + + def keep_end + start = @keeps.pop + string[start, pos - start] + end + + def keep_back + self.pos = @keeps.pop + end + + def keep_drop + @keeps.pop + end + end + + class InputsScanner + def initialize(inputs, encoding, chunk_size: 8192) + @inputs = inputs.dup + @encoding = encoding + @chunk_size = chunk_size + @last_scanner = @inputs.empty? + @keeps = [] + read_chunk + end + + def scan(pattern) + value = @scanner.scan(pattern) + return value if @last_scanner + + if value + read_chunk if @scanner.eos? + return value + else + nil + end + end + + def scan_all(pattern) + value = @scanner.scan(pattern) + return value if @last_scanner + + return nil if value.nil? + while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern)) + value << sub_value + end + value + end + + def eos? + @scanner.eos? + end + + def keep_start + @keeps.push([@scanner.pos, nil]) + end + + def keep_end + start, buffer = @keeps.pop + keep = @scanner.string[start, @scanner.pos - start] + if buffer + buffer << keep + keep = buffer + end + keep + end + + def keep_back + start, buffer = @keeps.pop + if buffer + string = @scanner.string + keep = string[start, string.size - start] + if keep and not keep.empty? 
+ @inputs.unshift(StringIO.new(keep)) + @last_scanner = false + end + @scanner = StringScanner.new(buffer) + else + @scanner.pos = start + end + end + + def keep_drop + @keeps.pop + end + + def rest + @scanner.rest + end + + private + def read_chunk + return false if @last_scanner + + unless @keeps.empty? + keep = @keeps.last + keep_start = keep[0] + string = @scanner.string + keep_data = string[keep_start, @scanner.pos - keep_start] + if keep_data + keep_buffer = keep[1] + if keep_buffer + keep_buffer << keep_data + else + keep[1] = keep_data.dup + end + end + keep[0] = 0 + end + + input = @inputs.first + case input + when StringIO + string = input.string + raise InvalidEncoding unless string.valid_encoding? + @scanner = StringScanner.new(string) + @inputs.shift + @last_scanner = @inputs.empty? + true + else + chunk = input.gets(nil, @chunk_size) + if chunk + raise InvalidEncoding unless chunk.valid_encoding? + @scanner = StringScanner.new(chunk) + if input.respond_to?(:eof?) and input.eof? + @inputs.shift + @last_scanner = @inputs.empty? + end + true + else + @scanner = StringScanner.new("".encode(@encoding)) + @inputs.shift + @last_scanner = @inputs.empty? + if @last_scanner + false + else + read_chunk + end + end + end + end + end + + def initialize(input, options) + @input = input + @options = options + @samples = [] + + prepare + end + + def column_separator + @column_separator + end + + def row_separator + @row_separator + end + + def quote_character + @quote_character + end + + def field_size_limit + @field_size_limit + end + + def skip_lines + @skip_lines + end + + def unconverted_fields? + @unconverted_fields + end + + def headers + @headers + end + + def header_row? + @use_headers and @headers.nil? + end + + def return_headers? + @return_headers + end + + def skip_blanks? + @skip_blanks + end + + def liberal_parsing? 
+ @liberal_parsing + end + + def lineno + @lineno + end + + def line + last_line + end + + def parse(&block) + return to_enum(__method__) unless block_given? + + if @return_headers and @headers + headers = Row.new(@headers, @raw_headers, true) + if @unconverted_fields + headers = add_unconverted_fields(headers, []) + end + yield headers + end + + row = [] + begin + @scanner = build_scanner + skip_needless_lines + start_row + while true + @quoted_column_value = false + @unquoted_column_value = false + value = parse_column_value + if value and @field_size_limit and value.size >= @field_size_limit + raise MalformedCSVError.new("Field size exceeded", @lineno + 1) + end + if parse_column_end + row << value + elsif parse_row_end + if row.empty? and value.nil? + emit_row([], &block) unless @skip_blanks + else + row << value + emit_row(row, &block) + row = [] + end + skip_needless_lines + start_row + elsif @scanner.eos? + return if row.empty? and value.nil? + row << value + emit_row(row, &block) + return + else + if @quoted_column_value + message = "Do not allow except col_sep_split_separator " + + "after quoted fields" + raise MalformedCSVError.new(message, @lineno + 1) + elsif @unquoted_column_value and @scanner.scan(@cr_or_lf) + message = "Unquoted fields do not allow \\r or \\n" + raise MalformedCSVError.new(message, @lineno + 1) + elsif @scanner.rest.start_with?(@quote_character) + message = "Illegal quoting" + raise MalformedCSVError.new(message, @lineno + 1) + else + raise MalformedCSVError.new("TODO: Meaningful message", + @lineno + 1) + end + end + end + rescue InvalidEncoding + message = "Invalid byte sequence in #{@encoding}" + raise MalformedCSVError.new(message, @lineno + 1) + end + end + + private + def prepare + prepare_variable + prepare_regexp + prepare_line + prepare_header + prepare_parser + end + + def prepare_variable + @encoding = @options[:encoding] + @liberal_parsing = @options[:liberal_parsing] + @unconverted_fields = @options[:unconverted_fields] 
+ @field_size_limit = @options[:field_size_limit] + @skip_blanks = @options[:skip_blanks] + @fields_converter = @options[:fields_converter] + @header_fields_converter = @options[:header_fields_converter] + end + + def prepare_regexp + @column_separator = @options[:column_separator].to_s.encode(@encoding) + @row_separator = + resolve_row_separator(@options[:row_separator]).encode(@encoding) + @quote_character = @options[:quote_character].to_s.encode(@encoding) + if @quote_character.length != 1 + raise ArgumentError, ":quote_char has to be a single character String" + end + + escaped_column_separator = Regexp.escape(@column_separator) + escaped_row_separator = Regexp.escape(@row_separator) + escaped_quote_character = Regexp.escape(@quote_character) + + skip_lines = @options[:skip_lines] + case skip_lines + when String + @skip_lines = skip_lines.encode(@encoding) + when Regexp, nil + @skip_lines = skip_lines + else + unless skip_lines.respond_to?(:match) + message = + ":skip_lines has to respond to \#match: #{skip_lines.inspect}" + raise ArgumentError, message + end + @skip_lines = skip_lines + end + + @column_end = Regexp.new(escaped_column_separator) + if @column_separator.size > 1 + @column_ends = @column_separator.each_char.collect do |char| + Regexp.new(Regexp.escape(char)) + end + else + @column_ends = nil + end + @row_end = Regexp.new(escaped_row_separator) + if @row_separator.size > 1 + @row_ends = @row_separator.each_char.collect do |char| + Regexp.new(Regexp.escape(char)) + end + else + @row_ends = nil + end + @quotes = Regexp.new(escaped_quote_character + + "+".encode(@encoding)) + @quoted_value = Regexp.new("[^".encode(@encoding) + + escaped_quote_character + + "]+".encode(@encoding)) + if @liberal_parsing + @unquoted_value = Regexp.new("[^".encode(@encoding) + + escaped_column_separator + + "\r\n]+".encode(@encoding)) + else + @unquoted_value = Regexp.new("[^".encode(@encoding) + + escaped_quote_character + + escaped_column_separator + + 
"\r\n]+".encode(@encoding)) + end + @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding)) + @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding)) + end + + def resolve_row_separator(separator) + if separator == :auto + cr = "\r".encode(@encoding) + lf = "\n".encode(@encoding) + if @input.is_a?(StringIO) + separator = detect_row_separator(@input.string, cr, lf) + elsif @input.respond_to?(:gets) + if @input.is_a?(File) + chunk_size = 32 * 1024 + else + chunk_size = 1024 + end + begin + while separator == :auto + # + # if we run out of data, it's probably a single line + # (ensure will set default value) + # + break unless sample = @input.gets(nil, chunk_size) + + # extend sample if we're unsure of the line ending + if sample.end_with?(cr) + sample << (@input.gets(nil, 1) || "") + end + + @samples << sample + + separator = detect_row_separator(sample, cr, lf) + end + rescue IOError + # do nothing: ensure will set default + end + end + separator = $INPUT_RECORD_SEPARATOR if separator == :auto + end + separator.to_s.encode(@encoding) + end + + def detect_row_separator(sample, cr, lf) + lf_index = sample.index(lf) + if lf_index + cr_index = sample[0, lf_index].index(cr) + else + cr_index = sample.index(cr) + end + if cr_index and lf_index + if cr_index + 1 == lf_index + cr + lf + elsif cr_index < lf_index + cr + else + lf + end + elsif cr_index + cr + elsif lf_index + lf + else + :auto + end + end + + def prepare_line + @lineno = 0 + @last_line = nil + @scanner = nil + end + + def last_line + if @scanner + @last_line ||= @scanner.keep_end + else + @last_line + end + end + + def prepare_header + @return_headers = @options[:return_headers] + + headers = @options[:headers] + case headers + when Array + @raw_headers = headers + @use_headers = true + when String + @raw_headers = parse_headers(headers) + @use_headers = true + when nil, false + @raw_headers = nil + @use_headers = false + else + @raw_headers = nil + @use_headers = true + end + if @raw_headers + @headers = 
adjust_headers(@raw_headers) + else + @headers = nil + end + end + + def parse_headers(row) + CSV.parse_line(row, + col_sep: @column_separator, + row_sep: @row_separator, + quote_char: @quote_character) + end + + def adjust_headers(headers) + adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno) + adjusted_headers.each {|h| h.freeze if h.is_a? String} + adjusted_headers + end + + def prepare_parser + @may_quoted = may_quoted? + end + + def may_quoted? + if @input.is_a?(StringIO) + sample = @input.string + else + return false if @samples.empty? + sample = @samples.first + end + sample[0, 128].index(@quote_character) + end + + SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes") + if SCANNER_TEST + class UnoptimizedStringIO + def initialize(string) + @io = StringIO.new(string) + end + + def gets(*args) + @io.gets(*args) + end + + def eof? + @io.eof? + end + end + + def build_scanner + inputs = @samples.collect do |sample| + UnoptimizedStringIO.new(sample) + end + if @input.is_a?(StringIO) + inputs << UnoptimizedStringIO.new(@input.string) + else + inputs << @input + end + InputsScanner.new(inputs, @encoding, chunk_size: 1) + end + else + def build_scanner + string = nil + if @samples.empty? and @input.is_a?(StringIO) + string = @input.string + elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof? + string = @samples[0] + end + if string + unless string.valid_encoding? 
+ message = "Invalid byte sequence in #{@encoding}" + raise MalformedCSVError.new(message, @lineno + 1) + end + Scanner.new(string) + else + inputs = @samples.collect do |sample| + StringIO.new(sample) + end + inputs << @input + InputsScanner.new(inputs, @encoding) + end + end + end + + def skip_needless_lines + return unless @skip_lines + + while true + @scanner.keep_start + line = @scanner.scan_all(@not_line_end) || "".encode(@encoding) + line << @row_separator if parse_row_end + if skip_line?(line) + @scanner.keep_drop + else + @scanner.keep_back + return + end + end + end + + def skip_line?(line) + case @skip_lines + when String + line.include?(@skip_lines) + when Regexp + @skip_lines.match?(line) + else + @skip_lines.match(line) + end + end + + def parse_column_value + if @liberal_parsing + quoted_value = parse_quoted_column_value + if quoted_value + unquoted_value = parse_unquoted_column_value + if unquoted_value + @quote_character + quoted_value + @quote_character + unquoted_value + else + quoted_value + end + else + parse_unquoted_column_value + end + elsif @may_quoted + parse_quoted_column_value || + parse_unquoted_column_value + else + parse_unquoted_column_value || + parse_quoted_column_value + end + end + + def parse_unquoted_column_value + value = @scanner.scan_all(@unquoted_value) + @unquoted_column_value = true if value + value + end + + def parse_quoted_column_value + quotes = @scanner.scan_all(@quotes) + return nil unless quotes + + @quoted_column_value = true + n_quotes = quotes.size + if (n_quotes % 2).zero? 
+ quotes[0, (n_quotes - 2) / 2] + else + value = quotes[0, (n_quotes - 1) / 2] + while true + quoted_value = @scanner.scan_all(@quoted_value) + value << quoted_value if quoted_value + quotes = @scanner.scan_all(@quotes) + unless quotes + message = "Unclosed quoted field" + raise MalformedCSVError.new(message, @lineno + 1) + end + n_quotes = quotes.size + if n_quotes == 1 + break + elsif (n_quotes % 2) == 1 + value << quotes[0, (n_quotes - 1) / 2] + break + else + value << quotes[0, n_quotes / 2] + end + end + value + end + end + + def parse_column_end + return true if @scanner.scan(@column_end) + return false unless @column_ends + + @scanner.keep_start + if @column_ends.all? {|column_end| @scanner.scan(column_end)} + @scanner.keep_drop + true + else + @scanner.keep_back + false + end + end + + def parse_row_end + return true if @scanner.scan(@row_end) + return false unless @row_ends + @scanner.keep_start + if @row_ends.all? {|row_end| @scanner.scan(row_end)} + @scanner.keep_drop + true + else + @scanner.keep_back + false + end + end + + def start_row + if @last_line + @last_line = nil + else + @scanner.keep_drop + end + @scanner.keep_start + end + + def emit_row(row, &block) + @lineno += 1 + + raw_row = row + if @use_headers + if @headers.nil? + @headers = adjust_headers(row) + return unless @return_headers + row = Row.new(@headers, row, true) + else + row = Row.new(@headers, + @fields_converter.convert(raw_row, @headers, @lineno)) + end + else + # convert fields, if needed... + row = @fields_converter.convert(raw_row, nil, @lineno) + end + + # inject unconverted fields and accessor, if requested... + if @unconverted_fields and not row.respond_to?(:unconverted_fields) + add_unconverted_fields(row, raw_row) + end + + yield(row) + end + + # This method injects an instance variable unconverted_fields into + # +row+ and an accessor method for +row+ called unconverted_fields(). The + # variable is set to the contents of +fields+. 
+ def add_unconverted_fields(row, fields) + class << row + attr_reader :unconverted_fields + end + row.instance_variable_set(:@unconverted_fields, fields) + row + end + end +end diff --git a/lib/csv/table.rb b/lib/csv/table.rb index 17a7c542e4..b13d1ada10 100644 --- a/lib/csv/table.rb +++ b/lib/csv/table.rb @@ -16,6 +16,11 @@ class CSV # Construct a new CSV::Table from +array_of_rows+, which are expected # to be CSV::Row objects. All rows are assumed to have the same headers. # + # The optional +headers+ parameter can be set to Array of headers. + # If headers aren't set, headers are fetched from CSV::Row objects. + # Otherwise, headers() method will return headers being set in + # headers argument. + # # A CSV::Table object supports the following Array methods through # delegation: # @@ -23,8 +28,17 @@ class CSV # * length() # * size() # - def initialize(array_of_rows) + def initialize(array_of_rows, headers: nil) @table = array_of_rows + @headers = headers + unless @headers + if @table.empty? + @headers = [] + else + @headers = @table.first.headers + end + end + @mode = :col_or_row end @@ -122,11 +136,7 @@ class CSV # other rows). An empty Array is returned for empty tables. # def headers - if @table.empty? - Array.new - else - @table.first.headers - end + @headers.dup end # @@ -171,6 +181,10 @@ class CSV @table[index_or_header] = value end else # set column + unless index_or_header.is_a? Integer + index = @headers.index(index_or_header) || @headers.size + @headers[index] = index_or_header + end if value.is_a? Array # multiple values @table.each_with_index do |row, i| if row.header_row? @@ -258,6 +272,11 @@ class CSV (@mode == :col_or_row and index_or_header.is_a? Integer) @table.delete_at(index_or_header) else # by header + if index_or_header.is_a? 
Integer + @headers.delete_at(index_or_header) + else + @headers.delete(index_or_header) + end @table.map { |row| row.delete(index_or_header).last } end end diff --git a/lib/csv/version.rb b/lib/csv/version.rb index d62a093418..d6b59b3097 100644 --- a/lib/csv/version.rb +++ b/lib/csv/version.rb @@ -2,5 +2,5 @@ class CSV # The version of the installed library. - VERSION = "3.0.1" + VERSION = "3.0.2" end diff --git a/lib/csv/writer.rb b/lib/csv/writer.rb new file mode 100644 index 0000000000..2f2ab095d7 --- /dev/null +++ b/lib/csv/writer.rb @@ -0,0 +1,144 @@ +# frozen_string_literal: true + +require_relative "match_p" +require_relative "row" + +using CSV::MatchP if CSV.const_defined?(:MatchP) + +class CSV + class Writer + attr_reader :lineno + attr_reader :headers + + def initialize(output, options) + @output = output + @options = options + @lineno = 0 + prepare + if @options[:write_headers] and @headers + self << @headers + end + end + + def <<(row) + case row + when Row + row = row.fields + when Hash + row = @headers.collect {|header| row[header]} + end + + @headers ||= row if @use_headers + @lineno += 1 + + line = row.collect(&@quote).join(@column_separator) + @row_separator + if @output_encoding + line = line.encode(@output_encoding) + end + @output << line + + self + end + + def rewind + @lineno = 0 + @headers = nil if @options[:headers].nil? 
+ end + + private + def prepare + @encoding = @options[:encoding] + + prepare_header + prepare_format + prepare_output + end + + def prepare_header + headers = @options[:headers] + case headers + when Array + @headers = headers + @use_headers = true + when String + @headers = CSV.parse_line(headers, + col_sep: @options[:column_separator], + row_sep: @options[:row_separator], + quote_char: @options[:quote_character]) + @use_headers = true + when true + @headers = nil + @use_headers = true + else + @headers = nil + @use_headers = false + end + return unless @headers + + converter = @options[:header_fields_converter] + @headers = converter.convert(@headers, nil, 0) + @headers.each do |header| + header.freeze if header.is_a?(String) + end + end + + def prepare_format + @column_separator = @options[:column_separator].to_s.encode(@encoding) + row_separator = @options[:row_separator] + if row_separator == :auto + @row_separator = $INPUT_RECORD_SEPARATOR.encode(@encoding) + else + @row_separator = row_separator.to_s.encode(@encoding) + end + quote_character = @options[:quote_character] + quote = lambda do |field| + field = String(field) + encoded_quote_character = quote_character.encode(field.encoding) + encoded_quote_character + + field.gsub(encoded_quote_character, + encoded_quote_character * 2) + + encoded_quote_character + end + if @options[:force_quotes] + @quote = quote + else + quotable_pattern = + Regexp.new("[\r\n".encode(@encoding) + + Regexp.escape(@column_separator) + + Regexp.escape(quote_character.encode(@encoding)) + + "]".encode(@encoding)) + @quote = lambda do |field| + if field.nil? # represent +nil+ fields as empty unquoted fields + "" + else + field = String(field) # Stringify fields + # represent empty fields as empty quoted fields + if field.empty? 
or quotable_pattern.match?(field) + quote.call(field) + else + field # unquoted field + end + end + end + end + end + + def prepare_output + @output_encoding = nil + return unless @output.is_a?(StringIO) + + output_encoding = @output.internal_encoding || @output.external_encoding + if @encoding != output_encoding + if @options[:force_encoding] + @output_encoding = output_encoding + else + compatible_encoding = Encoding.compatible?(@encoding, output_encoding) + if compatible_encoding + @output.set_encoding(compatible_encoding) + @output.seek(0, IO::SEEK_END) + end + end + end + end + end +end diff --git a/test/csv/test_csv_parsing.rb b/test/csv/test_csv_parsing.rb index e65bbad92e..3fe1bd79e4 100755 --- a/test/csv/test_csv_parsing.rb +++ b/test/csv/test_csv_parsing.rb @@ -143,55 +143,52 @@ class TestCSV::Parsing < TestCSV end end - def test_malformed_csv - assert_raise(CSV::MalformedCSVError) do + def test_malformed_csv_cr_first_line + error = assert_raise(CSV::MalformedCSVError) do CSV.parse_line("1,2\r,3", row_sep: "\n") end + assert_equal("Unquoted fields do not allow \\r or \\n in line 1.", + error.message) + end - bad_data = <<-CSV + def test_malformed_csv_cr_middle_line + csv = <<-CSV line,1,abc line,2,"def\nghi" line,4,some\rjunk line,5,jkl CSV - lines = bad_data.lines.to_a - assert_equal(6, lines.size) - assert_match(/\Aline,4/, lines.find { |l| l =~ /some\rjunk/ }) - csv = CSV.new(bad_data) - begin - loop do - assert_not_nil(csv.shift) - assert_send([csv.lineno, :<, 4]) - end - rescue CSV::MalformedCSVError - assert_equal( "Unquoted fields do not allow \\r or \\n in line 4.", - $!.message ) + error = assert_raise(CSV::MalformedCSVError) do + CSV.parse(csv) end + assert_equal("Unquoted fields do not allow \\r or \\n in line 4.", + error.message) + end - assert_raise(CSV::MalformedCSVError) { CSV.parse_line('1,2,"3...') } + def test_malformed_csv_unclosed_quote + error = assert_raise(CSV::MalformedCSVError) do + CSV.parse_line('1,2,"3...') + end + 
assert_equal("Unclosed quoted field in line 1.", + error.message) + end - bad_data = <<-CSV + def test_malformed_csv_illegal_quote_middle_line + csv = <<-CSV line,1,abc line,2,"def\nghi" line,4,8'10" line,5,jkl CSV - lines = bad_data.lines.to_a - assert_equal(6, lines.size) - assert_match(/\Aline,4/, lines.find { |l| l =~ /8'10"/ }) - csv = CSV.new(bad_data) - begin - loop do - assert_not_nil(csv.shift) - assert_send([csv.lineno, :<, 4]) - end - rescue CSV::MalformedCSVError - assert_equal("Illegal quoting in line 4.", $!.message) + error = assert_raise(CSV::MalformedCSVError) do + CSV.parse(csv) end + assert_equal("Illegal quoting in line 4.", + error.message) end def test_the_parse_fails_fast_when_it_can_for_unquoted_fields @@ -239,6 +236,24 @@ line,5,jkl CSV.parse("a b d", col_sep: " ")) end + def test_row_sep_auto_cr + assert_equal([["a"]], CSV.parse("a\r")) + end + + def test_row_sep_auto_lf + assert_equal([["a"]], CSV.parse("a\n")) + end + + def test_row_sep_auto_cr_lf + assert_equal([["a"]], CSV.parse("a\r\n")) + end + + def test_headers_empty_line + assert_equal(CSV::Table.new([CSV::Row.new(["header1"], [])], + headers: ["header1"]), + CSV.parse("\n", headers: "header1")) + end + private def assert_parse_errors_out(*args) diff --git a/test/csv/test_data_converters.rb b/test/csv/test_data_converters.rb index 114049d66f..8b3163da18 100755 --- a/test/csv/test_data_converters.rb +++ b/test/csv/test_data_converters.rb @@ -243,23 +243,35 @@ class TestCSV::DataConverters < TestCSV CSV.parse_line(@data, converters: [:numeric, @custom]) ) end - def test_unconverted_fields - [ [ @data, - ["Numbers", :integer, 1, :float, 3.015], - %w{Numbers :integer 1 :float 3.015} ], - ["\n", Array.new, Array.new] ].each do |test, fields, unconverted| - row = nil - assert_nothing_raised(Exception) do - row = CSV.parse_line( test, - converters: [:numeric, @custom], - unconverted_fields: true ) - end - assert_not_nil(row) - assert_equal(fields, row) - assert_respond_to(row, 
:unconverted_fields) - assert_equal(unconverted, row.unconverted_fields) - end + def test_unconverted_fields_number + row = CSV.parse_line(@data, + converters: [:numeric, @custom], + unconverted_fields: true) + assert_equal([ + ["Numbers", :integer, 1, :float, 3.015], + ["Numbers", ":integer", "1", ":float", "3.015"], + ], + [ + row, + row.unconverted_fields, + ]) + end + def test_unconverted_fields_empty_line + row = CSV.parse_line("\n", + converters: [:numeric, @custom], + unconverted_fields: true) + assert_equal([ + [], + [], + ], + [ + row, + row.unconverted_fields, + ]) + end + + def test_unconverted_fields data = <<-CSV first,second,third 1,2,3 diff --git a/test/csv/test_features.rb b/test/csv/test_features.rb index 45d937e037..53b513d0fa 100755 --- a/test/csv/test_features.rb +++ b/test/csv/test_features.rb @@ -58,26 +58,37 @@ line,4,jkl end def test_row_sep - assert_raise(CSV::MalformedCSVError) do - CSV.parse_line("1,2,3\n,4,5\r\n", row_sep: "\r\n") + error = assert_raise(CSV::MalformedCSVError) do + CSV.parse_line("1,2,3\n,4,5\r\n", row_sep: "\r\n") end + assert_equal("Unquoted fields do not allow \\r or \\n in line 1.", + error.message) assert_equal( ["1", "2", "3\n", "4", "5"], CSV.parse_line(%Q{1,2,"3\n",4,5\r\n}, row_sep: "\r\n")) end def test_quote_char TEST_CASES.each do |test_case| - assert_equal( test_case.last.map { |t| t.tr('"', "'") unless t.nil? }, - CSV.parse_line( test_case.first.tr('"', "'"), - quote_char: "'" ) ) + assert_equal(test_case.last.map {|t| t.tr('"', "'") unless t.nil?}, + CSV.parse_line(test_case.first.tr('"', "'"), + quote_char: "'" )) end end - def test_bug_8405 + def test_quote_char_special_regexp_char TEST_CASES.each do |test_case| - assert_equal( test_case.last.map { |t| t.tr('"', "|") unless t.nil? 
}, - CSV.parse_line( test_case.first.tr('"', "|"), - quote_char: "|" ) ) + assert_equal(test_case.last.map {|t| t.tr('"', "|") unless t.nil?}, + CSV.parse_line(test_case.first.tr('"', "|"), + quote_char: "|")) + end + end + + def test_quote_char_special_regexp_char_liberal_parsing + TEST_CASES.each do |test_case| + assert_equal(test_case.last.map {|t| t.tr('"', "|") unless t.nil?}, + CSV.parse_line(test_case.first.tr('"', "|"), + quote_char: "|", + liberal_parsing: true)) end end @@ -157,27 +168,68 @@ line,4,jkl assert_equal(3, count) end - def test_liberal_parsing + def test_liberal_parsing_middle_quote_start input = '"Johnson, Dwayne",Dwayne "The Rock" Johnson' - assert_raise(CSV::MalformedCSVError) do + error = assert_raise(CSV::MalformedCSVError) do CSV.parse_line(input) end + assert_equal("Illegal quoting in line 1.", + error.message) assert_equal(["Johnson, Dwayne", 'Dwayne "The Rock" Johnson'], CSV.parse_line(input, liberal_parsing: true)) + end + def test_liberal_parsing_middle_quote_end input = '"quoted" field' - assert_raise(CSV::MalformedCSVError) do + error = assert_raise(CSV::MalformedCSVError) do CSV.parse_line(input) end + assert_equal("Do not allow except col_sep_split_separator " + + "after quoted fields in line 1.", + error.message) assert_equal(['"quoted" field'], CSV.parse_line(input, liberal_parsing: true)) + end - assert_raise(CSV::MalformedCSVError) do + def test_liberal_parsing_quote_after_column_separator + error = assert_raise(CSV::MalformedCSVError) do CSV.parse_line('is,this "three," or four,fields', liberal_parsing: true) end + assert_equal("Unclosed quoted field in line 1.", + error.message) + end + def test_liberal_parsing_quote_before_column_separator assert_equal(["is", 'this "three', ' or four"', "fields"], - CSV.parse_line('is,this "three, or four",fields', liberal_parsing: true)) + CSV.parse_line('is,this "three, or four",fields', + liberal_parsing: true)) + end + + def test_liberal_parsing_backslash_quote + assert_equal([ + "1", 
+ "\"Hamlet says, \\\"Seems", + "\\\" madam! Nay it is; I know not \\\"seems.\\\"\"", + ], + CSV.parse_line('1,' + + '"Hamlet says, \"Seems,' + + '\" madam! Nay it is; I know not \"seems.\""', + liberal_parsing: true)) + end + + def test_liberal_parsing_space_quote + input = <<~CSV + Los Angeles, 34°03'N, 118°15'W + New York City, 40°42'46"N, 74°00'21"W + Paris, 48°51'24"N, 2°21'03"E + CSV + assert_equal( + [ + ["Los Angeles", " 34°03'N", " 118°15'W"], + ["New York City", " 40°42'46\"N", " 74°00'21\"W"], + ["Paris", " 48°51'24\"N", " 2°21'03\"E"], + ], + CSV.parse(input, liberal_parsing: true)) end def test_csv_behavior_readers @@ -338,11 +390,33 @@ line,4,jkl def test_requires_skip_lines_to_call_match regex_stub = RegexStub.new + csv = CSV.new(@sample_data, :skip_lines => regex_stub) assert_raise_with_message(ArgumentError, /skip_lines/) do - CSV.new(@sample_data, :skip_lines => regex_stub) + csv.shift end end + class Matchable + def initialize(pattern) + @pattern = pattern + end + + def match(line) + @pattern.match(line) + end + end + + def test_skip_lines_match + csv = <<-CSV.chomp +1 +# 2 +3 +# 4 + CSV + assert_equal([["1"], ["3"]], + CSV.parse(csv, :skip_lines => Matchable.new(/\A#/))) + end + def test_comment_rows_are_ignored sample_data = "line,1,a\n#not,a,line\nline,2,b\n #also,no,line" c = CSV.new sample_data, :skip_lines => /\A\s*#/ @@ -375,4 +449,48 @@ line,4,jkl def test_table_nil_equality assert_nothing_raised(NoMethodError) { CSV.parse("test", headers: true) == nil } end + + # non-seekable input stream for testing https://github.com/ruby/csv/issues/44 + class DummyIO + extend Forwardable + def_delegators :@io, :gets, :read, :pos, :eof? # no seek or rewind! 
+ def initialize(data) + @io = StringIO.new(data) + end + end + + def test_line_separator_autodetection_for_non_seekable_input_lf + c = CSV.new(DummyIO.new("one,two,three\nfoo,bar,baz\n")) + assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a + end + + def test_line_separator_autodetection_for_non_seekable_input_cr + c = CSV.new(DummyIO.new("one,two,three\rfoo,bar,baz\r")) + assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a + end + + def test_line_separator_autodetection_for_non_seekable_input_cr_lf + c = CSV.new(DummyIO.new("one,two,three\r\nfoo,bar,baz\r\n")) + assert_equal [["one", "two", "three"], ["foo", "bar", "baz"]], c.each.to_a + end + + def test_line_separator_autodetection_for_non_seekable_input_1024_over_lf + table = (1..10).map { |row| (1..200).map { |col| "row#{row}col#{col}" }.to_a }.to_a + input = table.map { |line| line.join(",") }.join("\n") + c = CSV.new(DummyIO.new(input)) + assert_equal table, c.each.to_a + end + + def test_line_separator_autodetection_for_non_seekable_input_1024_over_cr_lf + table = (1..10).map { |row| (1..200).map { |col| "row#{row}col#{col}" }.to_a }.to_a + input = table.map { |line| line.join(",") }.join("\r\n") + c = CSV.new(DummyIO.new(input)) + assert_equal table, c.each.to_a + end + + def test_line_separator_autodetection_for_non_seekable_input_many_cr_only + # input with lots of CRs (to make sure no bytes are lost due to look-ahead) + c = CSV.new(DummyIO.new("foo\r" + "\r" * 9999 + "bar\r")) + assert_equal [["foo"]] + [[]] * 9999 + [["bar"]], c.each.to_a + end end diff --git a/test/csv/test_interface.rb b/test/csv/test_interface.rb index 912e2ec7f5..309fbbb87b 100755 --- a/test/csv/test_interface.rb +++ b/test/csv/test_interface.rb @@ -139,6 +139,18 @@ class TestCSV::Interface < TestCSV assert_equal(Array.new, CSV.parse_line("\n1,2,3")) end + def test_parse_header_only + table = CSV.parse("a,b,c", headers: true) + assert_equal([ + ["a", "b", "c"], + [], + ], + [ + 
table.headers, + table.each.to_a, + ]) + end + def test_read_and_readlines assert_equal( @expected, CSV.read(@path, col_sep: "\t", row_sep: "\r\n") ) @@ -236,7 +248,7 @@ class TestCSV::Interface < TestCSV CSV.open(@path, "w", headers: true) do |csv| csv << headers csv << %w{1 2 3} - assert_equal(headers, csv.instance_variable_get(:@headers)) + assert_equal(headers, csv.headers) end end diff --git a/test/csv/test_table.rb b/test/csv/test_table.rb index d99b7d2932..a5ae8e0381 100755 --- a/test/csv/test_table.rb +++ b/test/csv/test_table.rb @@ -21,6 +21,8 @@ class TestCSV::Table < TestCSV @header_table = CSV::Table.new( [CSV::Row.new(%w{A B C}, %w{A B C}, true)] + @rows ) + + @header_only_table = CSV::Table.new([], headers: %w{A B C}) end def test_initialze @@ -63,6 +65,10 @@ class TestCSV::Table < TestCSV assert_equal Array.new, t.headers end + def test_headers_only + assert_equal(%w[A B C], @header_only_table.headers) + end + def test_index ################## ### Mixed Mode ### @@ -471,6 +477,21 @@ A CSV end + def test_delete_headers_only + ################### + ### Column Mode ### + ################### + @header_only_table.by_col! + + # delete by index + assert_equal([], @header_only_table.delete(0)) + assert_equal(%w[B C], @header_only_table.headers) + + # delete by header + assert_equal([], @header_only_table.delete("C")) + assert_equal(%w[B], @header_only_table.headers) + end + def test_values_at ################## ### Mixed Mode ###