From bfc73852b1f03d8dee405cdeb0f2883d89c78b2d Mon Sep 17 00:00:00 2001 From: Manfred Stienstra Date: Sun, 21 Sep 2008 17:28:05 +0200 Subject: [PATCH] Improve documentation. --- .../core_ext/string/multibyte.rb | 23 ++-- activesupport/lib/active_support/multibyte.rb | 4 +- .../lib/active_support/multibyte/chars.rb | 113 ++++++++++-------- .../active_support/multibyte/exceptions.rb | 1 + 4 files changed, 74 insertions(+), 67 deletions(-) diff --git a/activesupport/lib/active_support/core_ext/string/multibyte.rb b/activesupport/lib/active_support/core_ext/string/multibyte.rb index 5a2dc36f72..3bf79bc7e1 100644 --- a/activesupport/lib/active_support/core_ext/string/multibyte.rb +++ b/activesupport/lib/active_support/core_ext/string/multibyte.rb @@ -6,7 +6,9 @@ module ActiveSupport #:nodoc: # Implements multibyte methods for easier access to multibyte characters in a String instance. module Multibyte unless '1.9'.respond_to?(:force_encoding) - # +mb_chars+ is a multibyte safe proxy method for string methods. + # == Multibyte proxy + # + # +mb_chars+ is a multibyte safe proxy for string methods. # # In Ruby 1.8 and older it creates and returns an instance of the ActiveSupport::Multibyte::Chars class which # encapsulates the original string. A Unicode safe version of all the String methods are defined on this proxy @@ -19,11 +21,10 @@ module ActiveSupport #:nodoc: # name.mb_chars.reverse.to_s #=> "rellüM sualC" # name.mb_chars.length #=> 12 # - # In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware so we don't need - # a proxy class any more. This means that +mb_chars+ makes it easier to write code that runs on multiple Ruby - # versions. + # In Ruby 1.9 and newer +mb_chars+ returns +self+ because String is (mostly) encoding aware. This means that + # it becomes easy to run one version of your code on multiple Ruby versions. 
# - # == Method chaining + # == Method chaining # # All the methods on the Chars proxy which normally return a string will return a Chars object. This allows # method chaining on the result of any of these methods. @@ -32,12 +33,12 @@ module ActiveSupport #:nodoc: # # == Interoperability and configuration # - # The Char object tries to be as interchangeable with String objects as possible: sorting and comparing between + # The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing between # String and Char work like expected. The bang! methods change the internal string representation in the Chars # object. Interoperability problems can be resolved easily with a +to_s+ call. # # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For - # information about how to change the default Multibyte behaviour, see ActiveSupport::Multibyte. + # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte. def mb_chars if ActiveSupport::Multibyte.proxy_class.wants?(self) ActiveSupport::Multibyte.proxy_class.new(self) @@ -56,15 +57,11 @@ module ActiveSupport #:nodoc: alias chars mb_chars end else - # In Ruby 1.9 and newer +mb_chars+ returns self. In Ruby 1.8 and older +mb_chars+ creates and returns an - # Unicode safe proxy for string operations, this makes it easier to write code that runs on multiple Ruby - # versions. - def mb_chars + def mb_chars #:nodoc: self end - # Returns true if the string has valid UTF-8 encoding. - def is_utf8? + def is_utf8? #:nodoc: case encoding when Encoding::UTF_8 valid_encoding? 
diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb index 63c0d50166..018aafe607 100644 --- a/activesupport/lib/active_support/multibyte.rb +++ b/activesupport/lib/active_support/multibyte.rb @@ -5,7 +5,7 @@ require 'active_support/multibyte/exceptions' require 'active_support/multibyte/unicode_database' module ActiveSupport #:nodoc: - module Multibyte #:nodoc: + module Multibyte # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more # information about normalization. NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd] @@ -30,4 +30,4 @@ module ActiveSupport #:nodoc: mattr_accessor :proxy_class self.proxy_class = ActiveSupport::Multibyte::Chars end -end \ No newline at end of file +end diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index 27cc3c65a2..c61367968e 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -2,7 +2,7 @@ module ActiveSupport #:nodoc: module Multibyte #:nodoc: - # Chars enables you to work transparently with multibyte encodings in the Ruby String class without having extensive + # Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an # encoding safe manner. All the normal String methods are also implemented on the proxy. # @@ -88,14 +88,14 @@ module ActiveSupport #:nodoc: alias to_s wrapped_string alias to_str wrapped_string - # Creates a new Chars instance. +string+ is the wrapped string. if '1.9'.respond_to?(:force_encoding) + # Creates a new Chars instance by wrapping _string_. def initialize(string) @wrapped_string = string @wrapped_string.force_encoding(Encoding::UTF_8) unless @wrapped_string.frozen? 
end else - def initialize(string) + def initialize(string) #:nodoc: @wrapped_string = string end end @@ -121,10 +121,10 @@ module ActiveSupport #:nodoc: true end - # Returns +true+ if the Chars class can and should act as a proxy for the string +string+. Returns + # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns # +false+ otherwise. def self.wants?(string) - RUBY_VERSION < '1.9' && $KCODE == 'UTF8' && consumes?(string) + $KCODE == 'UTF8' && consumes?(string) end # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise. @@ -138,9 +138,9 @@ module ActiveSupport #:nodoc: include Comparable - # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, equal or after the - # object on the right side of the operation. It accepts any object that implements +to_s+. See String.<=> - # for more details. + # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, + # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+. + # See String#<=> for more details. # # Example: # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1 @@ -148,7 +148,7 @@ module ActiveSupport #:nodoc: @wrapped_string <=> other.to_s end - # Returns a new Chars object containing the other object concatenated to the string. + # Returns a new Chars object containing the _other_ object concatenated to the string. # # Example: # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl" @@ -156,7 +156,7 @@ module ActiveSupport #:nodoc: self << other end - # Like String.=~ only it returns the character offset (in codepoints) instead of the byte offset. + # Like String#=~ only it returns the character offset (in codepoints) instead of the byte offset. 
# # Example: # 'Café périferôl'.mb_chars =~ /ô/ #=> 12 @@ -164,7 +164,7 @@ module ActiveSupport #:nodoc: translate_offset(@wrapped_string =~ other) end - # Works just like String#split, with the exception that the items in the resulting list are Chars + # Works just like String#split, with the exception that the items in the resulting list are Chars # instances instead of String. This makes chaining methods easier. # # Example: @@ -173,7 +173,7 @@ module ActiveSupport #:nodoc: @wrapped_string.split(*args).map { |i| i.mb_chars } end - # Inserts the passed string at specified codepoint offsets + # Inserts the passed string at specified codepoint offsets. # # Example: # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl" @@ -189,7 +189,7 @@ module ActiveSupport #:nodoc: self end - # Returns true if contained string contains +other+. Returns false otherwise. + # Returns +true+ if contained string contains _other_. Returns +false+ otherwise. # # Example: # 'Café'.mb_chars.include?('é') #=> true @@ -198,17 +198,17 @@ module ActiveSupport #:nodoc: @wrapped_string.include?(other) end - # Returns the position of the passed argument in the string, counting in codepoints + # Returns the position of _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found. # # Example: # 'Café périferôl'.mb_chars.index('ô') #=> 12 - def index(*args) - index = @wrapped_string.index(*args) + # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0 + def index(needle, offset=0) + index = @wrapped_string.index(needle, offset) index ? (self.class.u_unpack(@wrapped_string.slice(0...index)).size) : nil end - # Works just like the indexed replace method on string, except instead of byte offsets you specify - # character offsets. + # Like String#[]=, except instead of byte offsets you specify character offsets. # # Example: # @@ -248,7 +248,7 @@ module ActiveSupport #:nodoc: end end - # Works just like String#rjust, only integer specifies characters instead of bytes. 
+ # Works just like String#rjust, only integer specifies characters instead of bytes. # # Example: # @@ -261,7 +261,7 @@ module ActiveSupport #:nodoc: justify(integer, :right, padstr) end - # Works just like String#ljust, only integer specifies characters instead of bytes. + # Works just like String#ljust, only integer specifies characters instead of bytes. # # Example: # @@ -274,7 +274,7 @@ module ActiveSupport #:nodoc: justify(integer, :left, padstr) end - # Works just like String#center, only integer specifies characters instead of bytes. + # Works just like String#center, only integer specifies characters instead of bytes. # # Example: # @@ -308,7 +308,7 @@ module ActiveSupport #:nodoc: end alias_method :length, :size - # Reverses all characters in the string + # Reverses all characters in the string. # # Example: # 'Café'.mb_chars.reverse.to_s #=> 'éfaC' @@ -343,7 +343,7 @@ module ActiveSupport #:nodoc: end alias_method :[], :slice - # Convert characters in the string to uppercase + # Convert characters in the string to uppercase. # # Example: # 'Laurent, òu sont les tests?'.mb_chars.upcase.to_s #=> "LAURENT, ÒU SONT LES TESTS?" @@ -351,7 +351,7 @@ module ActiveSupport #:nodoc: apply_mapping :uppercase_mapping end - # Convert characters in the string to lowercase + # Convert characters in the string to lowercase. # # Example: # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum" @@ -359,7 +359,7 @@ module ActiveSupport #:nodoc: apply_mapping :lowercase_mapping end - # Converts the first character to uppercase and the remainder to lowercase + # Converts the first character to uppercase and the remainder to lowercase. # # Example: # 'über'.mb_chars.capitalize.to_s #=> "Über" @@ -418,6 +418,7 @@ module ActiveSupport #:nodoc: self.class.g_unpack(@wrapped_string).length end + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. 
def tidy_bytes chars(self.class.tidy_bytes(@wrapped_string)) end @@ -435,24 +436,35 @@ module ActiveSupport #:nodoc: class << self - # Unpack the string at codepoints boundaries - def u_unpack(str) + # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't + # valid UTF-8. + # + # Example: + # Chars.u_unpack('Café') #=> [67, 97, 102, 233] + def u_unpack(string) begin - str.unpack 'U*' + string.unpack 'U*' rescue ArgumentError raise EncodingError.new('malformed UTF-8 character') end end - # Detect whether the codepoint is in a certain character class. Primarily used by the - # grapheme cluster support. + # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified + # character class and +false+ otherwise. Valid character classes are: :cr, :lf, :l, + # :v, :lv, :lvt and :t. + # + # Primarily used by the grapheme cluster support. def in_char_class?(codepoint, classes) classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false end - # Unpack the string at grapheme boundaries - def g_unpack(str) - codepoints = u_unpack(str) + # Unpack the string at grapheme boundaries. Returns a list of character lists. + # + # Example: + # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]] + # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]] + def g_unpack(string) + codepoints = u_unpack(string) unpacked = [] pos = 0 marker = 0 @@ -481,13 +493,15 @@ module ActiveSupport #:nodoc: unpacked end - # Reverse operation of g_unpack + # Reverse operation of g_unpack. + # + # Example: + # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि' def g_pack(unpacked) (unpacked.flatten).pack('U*') end - # Generates a padding string of a certain size. 
- def padding(padsize, padstr=' ') + def padding(padsize, padstr=' ') #:nodoc: if padsize != 0 new(padstr * ((padsize / u_unpack(padstr).size) + 1)).slice(0, padsize) else @@ -495,7 +509,7 @@ module ActiveSupport #:nodoc: end end - # Re-order codepoints so the string becomes canonical + # Re-order codepoints so the string becomes canonical. def reorder_characters(codepoints) length = codepoints.length- 1 pos = 0 @@ -511,7 +525,7 @@ module ActiveSupport #:nodoc: codepoints end - # Decompose composed characters to the decomposed form + # Decompose composed characters to the decomposed form. def decompose_codepoints(type, codepoints) codepoints.inject([]) do |decomposed, cp| # if it's a hangul syllable starter character @@ -532,7 +546,7 @@ module ActiveSupport #:nodoc: end end - # Compose decomposed characters to the composed form + # Compose decomposed characters to the composed form. def compose_codepoints(codepoints) pos = 0 eoa = codepoints.length - 1 @@ -591,9 +605,9 @@ module ActiveSupport #:nodoc: codepoints end - # Replaces all the non-UTF-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid UTF-8 string - def tidy_bytes(str) - str.split(//u).map do |c| + # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. + def tidy_bytes(string) + string.split(//u).map do |c| if !UTF8_PAT.match(c) n = c.unpack('C')[0] n < 128 ? n.chr : @@ -608,8 +622,7 @@ module ActiveSupport #:nodoc: protected - # Translate a byte offset in the wrapped string to a character offset by looking for the character boundary - def translate_offset(byte_offset) + def translate_offset(byte_offset) #:nodoc: return nil if byte_offset.nil? return 0 if @wrapped_string == '' chunk = @wrapped_string[0..byte_offset] @@ -629,9 +642,7 @@ module ActiveSupport #:nodoc: end end - # Justifies a string in a certain way. Valid values for way are :right, :left and - # :center. 
- def justify(integer, way, padstr=' ') + def justify(integer, way, padstr=' ') #:nodoc: raise ArgumentError, "zero width padding" if padstr.length == 0 padsize = integer - size padsize = padsize > 0 ? padsize : 0 @@ -648,8 +659,7 @@ module ActiveSupport #:nodoc: chars(result) end - # Map codepoints to one of it's attributes. - def apply_mapping(mapping) + def apply_mapping(mapping) #:nodoc: chars(self.class.u_unpack(@wrapped_string).map do |codepoint| cp = UCD.codepoints[codepoint] if cp and (ncp = cp.send(mapping)) and ncp > 0 @@ -660,9 +670,8 @@ module ActiveSupport #:nodoc: end.pack('U*')) end - # Creates a new instance - def chars(str) - self.class.new(str) + def chars(string) #:nodoc: + self.class.new(string) end end end diff --git a/activesupport/lib/active_support/multibyte/exceptions.rb b/activesupport/lib/active_support/multibyte/exceptions.rb index af760cc561..62066e3c71 100644 --- a/activesupport/lib/active_support/multibyte/exceptions.rb +++ b/activesupport/lib/active_support/multibyte/exceptions.rb @@ -2,6 +2,7 @@ module ActiveSupport #:nodoc: module Multibyte #:nodoc: + # Raised when a problem with the encoding was found. class EncodingError < StandardError; end end end \ No newline at end of file