Enhanced RDoc for case mapping (#5245)

Adds file doc/case_mapping.rdoc, which describes case mapping and provides a link target that methods doc can link to. Revises: String#capitalize String#capitalize! String#casecmp String#casecmp? String#downcase String#downcase! String#swapcase String#swapcase! String#upcase String#upcase! Symbol#capitalize Symbol#casecmp Symbol#casecmp? Symbol#downcase Symbol#swapcase Symbol#upcase
Merged-By: BurdetteLamar <BurdetteLamar@Yahoo.com>
2022-11-09 12:17:21 -05:00 · 2021-12-17 06:05:31 -06:00 · 2021-12-17 06:05:31 -06:00 · f7e266e6d2 · 2021-12-17 21:06:00 +09:00
commit f7e266e6d2
parent 4639336b05
2 changed files with 284 additions and 135 deletions
--- a/doc/case_mapping.rdoc
+++ b/doc/case_mapping.rdoc
@ -0,0 +1,116 @@
+== Case Mapping
+
+Some string-oriented methods use case mapping.
+
+In String:
+
+- String#capitalize
+- String#capitalize!
+- String#casecmp
+- String#casecmp?
+- String#downcase
+- String#downcase!
+- String#swapcase
+- String#swapcase!
+- String#upcase
+- String#upcase!
+
+In Symbol:
+
+- Symbol#capitalize
+- Symbol#casecmp
+- Symbol#casecmp?
+- Symbol#downcase
+- Symbol#swapcase
+- Symbol#upcase
+
+=== Default Case Mapping
+
+By default, all of these methods use full Unicode case mapping,
+which is suitable for most languages.
+See {Unicode Latin Case Chart}[https://www.unicode.org/charts/case].
+
+Non-ASCII case mapping and folding are supported for UTF-8,
+UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
+
+Context-dependent case mapping as described in
+{Table 3-17 of the Unicode standard}[https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf]
+is currently not supported.
+
+In most cases, case conversions of a string have the same number of characters.
+There are exceptions (see also +:fold+ below):
+
+  s = "\u00DF" # => "ß"
+  s.upcase     # => "SS"
+  s = "\u0149" # => "ŉ"
+  s.upcase     # => "ʼN"
+
+Case mapping may also depend on locale (see also +:turkic+ below):
+
+  s = "\u0049"        # => "I"
+  s.downcase          # => "i" # Dot above.
+  s.downcase(:turkic) # => "ı" # No dot above.
+
+Case changes may not be reversible:
+
+  s = 'Hello World!' # => "Hello World!"
+  s.downcase         # => "hello world!"
+  s.downcase.upcase  # => "HELLO WORLD!" # Different from original s.
+
+Case changing methods may not maintain Unicode normalization.
+See String#unicode_normalize.
+
+=== Options for Case Mapping
+
+Except for +casecmp+ and +casecmp?+,
+each of the case-mapping methods listed above
+accepts optional arguments, <tt>*options</tt>.
+
+The arguments may be:
+
+- +:ascii+ only.
+- +:fold+ only.
+- +:turkic+ or +:lithuanian+ or both.
+
+The options:
+
+- +:ascii+:
+  ASCII-only mapping:
+  uppercase letters ('A'..'Z') are mapped to lowercase letters ('a'..'z');
+  other characters are not changed:
+
+    s = "Foo \u00D8 \u00F8 Bar" # => "Foo Ø ø Bar"
+    s.upcase                    # => "FOO Ø Ø BAR"
+    s.downcase                  # => "foo ø ø bar"
+    s.upcase(:ascii)            # => "FOO Ø ø BAR"
+    s.downcase(:ascii)          # => "foo Ø ø bar"
+
+- +:turkic+:
+  Full Unicode case mapping, adapted for the Turkic languages
+  that distinguish dotted and dotless I, for example Turkish and Azeri.
+
+    s = 'Türkiye'       # => "Türkiye"
+    s.upcase            # => "TÜRKIYE"
+    s.upcase(:turkic)   # => "TÜRKİYE" # Dot above.
+
+    s = 'TÜRKIYE'       # => "TÜRKIYE"
+    s.downcase          # => "türkiye"
+    s.downcase(:turkic) # => "türkıye" # No dot above.
+
+- +:lithuanian+:
+  Not yet implemented.
+
+- +:fold+ (available only for String#downcase, String#downcase!,
+  and Symbol#downcase):
+  Unicode case folding,
+  which is more far-reaching than Unicode case mapping.
+
+    s = "\u00DF"      # => "ß"
+    s.downcase        # => "ß"
+    s.downcase(:fold) # => "ss"
+    s.upcase          # => "SS"
+
+    s = "\uFB04"      # => "ﬄ"
+    s.downcase        # => "ﬄ"
+    s.upcase          # => "FFL"
+    s.downcase(:fold) # => "ffl"
--- a/string.c
+++ b/string.c
@ -3702,13 +3702,13 @@ static VALUE str_casecmp_p(VALUE str1, VALUE str2);

 /*
 *  call-seq:
- *    casecmp(other_str) -> -1, 0, 1, or nil
+ *    casecmp(other_string) -> -1, 0, 1, or nil
 *
- *  Compares +self+ and +other_string+, ignoring case, and returning:
+ *  Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
 *
- *  - -1 if +other_string+ is larger.
+ *  - -1 if <tt>other_string.downcase</tt> is larger.
 *  - 0 if the two are equal.
- *  - 1 if +other_string+ is smaller.
+ *  - 1 if <tt>other_string.downcase</tt> is smaller.
 *  - +nil+ if the two are incomparable.
 *
 *  Examples:
@ -3720,6 +3720,10 @@ static VALUE str_casecmp_p(VALUE str1, VALUE str2);
 *    'foo'.casecmp('FOO') # => 0
 *    'foo'.casecmp(1) # => nil
 *
+ *  See {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#casecmp?.
+ *
 */

 static VALUE
@ -3806,6 +3810,10 @@ str_casecmp(VALUE str1, VALUE str2)
 *
 *    'foo'.casecmp?(1) # => nil
 *
+ *  See {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#casecmp.
+ *
 */

 static VALUE
@ -7151,13 +7159,21 @@ upcase_single(VALUE str)

 /*
 *  call-seq:
- *     str.upcase!              -> str or nil
- *     str.upcase!([options])   -> str or nil
+ *    upcase!(*options) -> self or nil
 *
- *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
- *  were made.
+ *  Upcases the characters in +self+;
+ *  returns +self+ if any changes were made, +nil+ otherwise:
+ *
+ *    s = 'Hello World!' # => "Hello World!"
+ *    s.upcase!          # => "HELLO WORLD!"
+ *    s                  # => "HELLO WORLD!"
+ *    s.upcase!          # => nil
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#upcase, String#downcase, String#downcase!.
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
 */

 static VALUE
@ -7185,15 +7201,18 @@ rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.upcase              -> new_str
- *     str.upcase([options])   -> new_str
+ *    upcase(*options) -> string
 *
- *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
- *  uppercase counterparts.
+ *  Returns a string containing the upcased characters in +self+:
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
+ *     s = 'Hello World!' # => "Hello World!"
+ *     s.upcase           # => "HELLO WORLD!"
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#upcase!, String#downcase, String#downcase!.
 *
- *     "hEllO".upcase   #=> "HELLO"
 */

 static VALUE
@ -7242,13 +7261,21 @@ downcase_single(VALUE str)

 /*
 *  call-seq:
- *     str.downcase!             -> str or nil
- *     str.downcase!([options])  -> str or nil
+ *    downcase!(*options) -> self or nil
 *
- *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
- *  changes were made.
+ *  Downcases the characters in +self+;
+ *  returns +self+ if any changes were made, +nil+ otherwise:
+ *
+ *    s = 'Hello World!' # => "Hello World!"
+ *    s.downcase!        # => "hello world!"
+ *    s                  # => "hello world!"
+ *    s.downcase!        # => nil
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#downcase, String#upcase, String#upcase!.
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
 */

 static VALUE
@ -7276,52 +7303,18 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.downcase              -> new_str
- *     str.downcase([options])   -> new_str
+ *    downcase(*options) -> string
 *
- *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
- *  lowercase counterparts. Which letters exactly are replaced, and by which
- *  other letters, depends on the presence or absence of options, and on the
- *  +encoding+ of the string.
+ *  Returns a string containing the downcased characters in +self+:
 *
- *  The meaning of the +options+ is as follows:
+ *     s = 'Hello World!' # => "Hello World!"
+ *     s.downcase         # => "hello world!"
 *
- *  No option ::
- *    Full Unicode case mapping, suitable for most languages
- *    (see :turkic and :lithuanian options below for exceptions).
- *    Context-dependent case mapping as described in Table 3-14 of the
- *    Unicode standard is currently not supported.
- *  :ascii ::
- *    Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
- *    ``a'' to ``z'', are affected.
- *    This option cannot be combined with any other option.
- *  :turkic ::
- *    Full Unicode case mapping, adapted for Turkic languages
- *    (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
- *    lower case dotless i, and so on.
- *  :lithuanian ::
- *    Currently, just full Unicode case mapping. In the future, full Unicode
- *    case mapping adapted for Lithuanian (keeping the dot on the lower case
- *    i even if there is an accent on top).
- *  :fold ::
- *    Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
- *    which is more far-reaching than Unicode case mapping.
- *    This option currently cannot be combined with any other option
- *    (i.e. there is currently no variant for turkic languages).
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
 *
- *  Please note that several assumptions that are valid for ASCII-only case
- *  conversions do not hold for more general case conversions. For example,
- *  the length of the result may not be the same as the length of the input
- *  (neither in characters nor in bytes), some roundtrip assumptions
- *  (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
- *  normalization (i.e. String#unicode_normalize) is not necessarily maintained
- *  by case mapping operations.
+ *  Related: String#downcase!, String#upcase, String#upcase!.
 *
- *  Non-ASCII case mapping/folding is currently supported for UTF-8,
- *  UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
- *  This support will be extended to other encodings.
- *
- *     "hEllO".downcase   #=> "hello"
 */

 static VALUE
@ -7352,20 +7345,22 @@ rb_str_downcase(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.capitalize!              -> str or nil
- *     str.capitalize!([options])   -> str or nil
+ *    capitalize!(*options) -> self or nil
 *
- *  Modifies <i>str</i> by converting the first character to uppercase and the
- *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
- *  There is an exception for modern Georgian (mkhedruli/MTAVRULI), where
- *  the result is the same as for String#downcase, to avoid mixed case.
+ *  Upcases the first character in +self+;
+ *  downcases the remaining characters;
+ *  returns +self+ if any changes were made, +nil+ otherwise:
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
+ *    s = 'hello World!' # => "hello World!"
+ *    s.capitalize!      # => "Hello world!"
+ *    s                  # => "Hello world!"
+ *    s.capitalize!      # => nil
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#capitalize.
 *
- *     a = "hello"
- *     a.capitalize!   #=> "Hello"
- *     a               #=> "Hello"
- *     a.capitalize!   #=> nil
 */

 static VALUE
@ -7390,17 +7385,20 @@ rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.capitalize              -> new_str
- *     str.capitalize([options])   -> new_str
+ *    capitalize(*options) -> string
 *
- *  Returns a copy of <i>str</i> with the first character converted to uppercase
- *  and the remainder to lowercase.
+ *  Returns a string containing the characters in +self+;
+ *  the first character is upcased;
+ *  the remaining characters are downcased:
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
+ *     s = 'hello World!' # => "hello World!"
+ *     s.capitalize       # => "Hello world!"
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#capitalize!.
 *
- *     "hello".capitalize    #=> "Hello"
- *     "HELLO".capitalize    #=> "Hello"
- *     "123ABC".capitalize   #=> "123abc"
 */

 static VALUE
@ -7426,14 +7424,22 @@ rb_str_capitalize(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.swapcase!              -> str or nil
- *     str.swapcase!([options])   -> str or nil
+ *    swapcase!(*options) -> self or nil
 *
- *  Equivalent to String#swapcase, but modifies the receiver in place,
- *  returning <i>str</i>, or <code>nil</code> if no changes were made.
+ *  Upcases each lowercase character in +self+;
+ *  downcases each uppercase character;
+ *  returns +self+ if any changes were made, +nil+ otherwise:
+ *
+ *    s = 'Hello World!' # => "Hello World!"
+ *    s.swapcase!        # => "hELLO wORLD!"
+ *    s                  # => "hELLO wORLD!"
+ *    ''.swapcase!       # => nil
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#swapcase.
 *
- *  See String#downcase for meaning of +options+ and use with
- *  different encodings.
 */

 static VALUE
@ -7457,16 +7463,20 @@ rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)

 /*
 *  call-seq:
- *     str.swapcase              -> new_str
- *     str.swapcase([options])   -> new_str
+ *    swapcase(*options) -> string
 *
- *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
- *  to lowercase and lowercase characters converted to uppercase.
+ *  Returns a string containing the characters in +self+, with cases reversed;
+ *  each uppercase character is downcased;
+ *  each lowercase character is upcased:
 *
- *  See String#downcase for meaning of +options+ and use with different encodings.
+ *     s = 'Hello World!' # => "Hello World!"
+ *     s.swapcase         # => "hELLO wORLD!"
+ *
+ *  The casing may be affected by the given +options+;
+ *  see {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: String#swapcase!.
 *
- *     "Hello".swapcase          #=> "hELLO"
- *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
 */

 static VALUE
@ -11502,23 +11512,29 @@ sym_cmp(VALUE sym, VALUE other)
 }

 /*
- * call-seq:
- *   sym.casecmp(other_symbol)   -> -1, 0, +1, or nil
+ *  call-seq:
+ *    casecmp(other_symbol) -> -1, 0, 1, or nil
 *
- * Case-insensitive version of Symbol#<=>.
- * Currently, case-insensitivity only works on characters A-Z/a-z,
- * not all of Unicode. This is different from Symbol#casecmp?.
+ *  Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
 *
- *   :aBcDeF.casecmp(:abcde)     #=> 1
- *   :aBcDeF.casecmp(:abcdef)    #=> 0
- *   :aBcDeF.casecmp(:abcdefg)   #=> -1
- *   :abcdef.casecmp(:ABCDEF)    #=> 0
+ *    :aBcDeF.casecmp(:abcde)   # => 1
+ *    :aBcDeF.casecmp(:abcdef)  # => 0
+ *    :aBcDeF.casecmp(:abcdefg) # => -1
+ *    :abcdef.casecmp(:ABCDEF)  # => 0
 *
- * +nil+ is returned if the two symbols have incompatible encodings,
- * or if +other_symbol+ is not a symbol.
+ *  Returns +nil+ if the two symbols have incompatible encodings,
+ *  or if +other_symbol+ is not a symbol:
+ *
+ *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
+ *    other_sym = :"\u{c4 d6 dc}"
+ *    sym.casecmp(other_sym) # => nil
+ *    :foo.casecmp(2)        # => nil
+ *
+ *  Currently, case-insensitivity only works on characters A-Z/a-z,
+ *  not all of Unicode. This is different from Symbol#casecmp?.
+ *
+ *  Related: Symbol#casecmp?.
 *
- *   :foo.casecmp(2)   #=> nil
- *   "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp(:"\u{c4 d6 dc}")   #=> nil
 */

 static VALUE
@ -11531,23 +11547,30 @@ sym_casecmp(VALUE sym, VALUE other)
 }

 /*
- * call-seq:
- *   sym.casecmp?(other_symbol)   -> true, false, or nil
+ *  call-seq:
+ *    casecmp?(other_symbol) -> true, false, or nil
 *
- * Returns +true+ if +sym+ and +other_symbol+ are equal after
- * Unicode case folding, +false+ if they are not equal.
+ *  Returns +true+ if +self+ and +other_symbol+ are equal after
+ *  Unicode case folding, +false+ if they are not equal:
 *
- *   :aBcDeF.casecmp?(:abcde)     #=> false
- *   :aBcDeF.casecmp?(:abcdef)    #=> true
- *   :aBcDeF.casecmp?(:abcdefg)   #=> false
- *   :abcdef.casecmp?(:ABCDEF)    #=> true
- *   :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}")   #=> true
+ *    :aBcDeF.casecmp?(:abcde)                  # => false
+ *    :aBcDeF.casecmp?(:abcdef)                 # => true
+ *    :aBcDeF.casecmp?(:abcdefg)                # => false
+ *    :abcdef.casecmp?(:ABCDEF)                 # => true
+ *    :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") # => true
 *
- * +nil+ is returned if the two symbols have incompatible encodings,
- * or if +other_symbol+ is not a symbol.
+ *  Returns +nil+ if the two symbols have incompatible encodings,
+ *  or if +other_symbol+ is not a symbol:
+ *
+ *    sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
+ *    other_sym = :"\u{c4 d6 dc}"
+ *    sym.casecmp?(other_sym) # => nil
+ *    :foo.casecmp?(2)        # => nil
+ *
+ *  See {Case Mapping}[doc/case_mapping_rdoc.html].
+ *
+ *  Related: Symbol#casecmp.
 *
- *   :foo.casecmp?(2)   #=> nil
- *   "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym.casecmp?(:"\u{c4 d6 dc}")   #=> nil
 */

 static VALUE
@ -11644,11 +11667,13 @@ sym_empty(VALUE sym)
 }

 /*
- * call-seq:
- *   sym.upcase              -> symbol
- *   sym.upcase([options])   -> symbol
+ *  call-seq:
+ *    upcase(*options) -> symbol
+ *
+ *  Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
+ *
+ *  See String#upcase.
 *
- * Same as <code>sym.to_s.upcase.intern</code>.
 */

 static VALUE
@ -11658,11 +11683,15 @@ sym_upcase(int argc, VALUE *argv, VALUE sym)
 }

 /*
- * call-seq:
- *   sym.downcase              -> symbol
- *   sym.downcase([options])   -> symbol
+ *  call-seq:
+ *    downcase(*options) -> symbol
+ *
+ *  Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
+ *
+ *  See String#downcase.
+ *
+ *  Related: Symbol#upcase.
 *
- * Same as <code>sym.to_s.downcase.intern</code>.
 */

 static VALUE
@ -11672,11 +11701,13 @@ sym_downcase(int argc, VALUE *argv, VALUE sym)
 }

 /*
- * call-seq:
- *   sym.capitalize              -> symbol
- *   sym.capitalize([options])   -> symbol
+ *  call-seq:
+ *    capitalize(*options) -> symbol
+ *
+ *  Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
+ *
+ *  See String#capitalize.
 *
- * Same as <code>sym.to_s.capitalize.intern</code>.
 */

 static VALUE
@ -11686,11 +11717,13 @@ sym_capitalize(int argc, VALUE *argv, VALUE sym)
 }

 /*
- * call-seq:
- *   sym.swapcase              -> symbol
- *   sym.swapcase([options])   -> symbol
+ *  call-seq:
+ *    swapcase(*options) -> symbol
+ *
+ *  Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
+ *
+ *  See String#swapcase.
 *
- * Same as <code>sym.to_s.swapcase.intern</code>.
 */

 static VALUE