mirror of
				https://github.com/ruby/ruby.git
				synced 2022-11-09 12:17:21 -05:00 
			
		
		
		
	* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 rev.110.
* Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32. * Add support --ic=UTF-32. * Fix: can't guess UTF-16 and UTF-32. * Fix: can't decode beyond BMP of UTF-16LE. * ext/nkf/nkf.c (guess): Support UTF-32. * ext/nkf/lib/kconv.rb (kconv): Support UTF-32. * ext/nkf/lib/kconv.rb (to_utf32): new method. * ext/nkf/lib/kconv.rb (to_utf32): new method. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10938 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
		
							parent
							
								
									976b4e5f8b
								
							
						
					
					
						commit
						5300eecfb3
					
				
					 4 changed files with 131 additions and 31 deletions
				
			
		
							
								
								
									
										14
									
								
								ChangeLog
									
										
									
									
									
								
							
							
						
						
									
										14
									
								
								ChangeLog
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,3 +1,17 @@
 | 
			
		|||
Fri Sep 15 20:22:15 2006  NARUSE, Yui  <naruse@ruby-lang.org>
 | 
			
		||||
 | 
			
		||||
	* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 rev.110.
 | 
			
		||||
          * Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32LE.
 | 
			
		||||
          * Add support --ic=UTF-32.
 | 
			
		||||
          * Fix: can't guess UTF-16 and UTF-32.
 | 
			
		||||
          * Fix: can't decode beyond BMP of UTF-16LE.
 | 
			
		||||
 | 
			
		||||
	* ext/nkf/nkf.c (guess): Support UTF-32.
 | 
			
		||||
 | 
			
		||||
	* ext/nkf/lib/kconv.rb (kconv): Support UTF-32.
 | 
			
		||||
 | 
			
		||||
	* ext/nkf/lib/kconv.rb (toutf32): new method.
 | 
			
		||||
 | 
			
		||||
Fri Sep 15 05:23:24 2006  NARUSE, Yui  <naruse@ruby-lang.org>
 | 
			
		||||
 | 
			
		||||
	* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 2006-09-15.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -105,6 +105,8 @@ module Kconv
 | 
			
		|||
      opt << 'W'
 | 
			
		||||
    when ::NKF::UTF16
 | 
			
		||||
      opt << 'W16'
 | 
			
		||||
    when ::NKF::UTF32
 | 
			
		||||
      opt << 'W32'
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    case out_code
 | 
			
		||||
| 
						 | 
				
			
			@ -118,6 +120,8 @@ module Kconv
 | 
			
		|||
      opt << 'w'
 | 
			
		||||
    when ::NKF::UTF16
 | 
			
		||||
      opt << 'w16'
 | 
			
		||||
    when ::NKF::UTF32
 | 
			
		||||
      opt << 'w32'
 | 
			
		||||
    when ::NKF::NOCONV
 | 
			
		||||
      return str
 | 
			
		||||
    end
 | 
			
		||||
| 
						 | 
				
			
			@ -202,6 +206,20 @@ module Kconv
 | 
			
		|||
  end
 | 
			
		||||
  module_function :toutf16
 | 
			
		||||
 | 
			
		||||
  # call-seq:
 | 
			
		||||
  #    Kconv.toutf32(str)   -> string
 | 
			
		||||
  #
 | 
			
		||||
  # Convert <code>str</code> to UTF-32
 | 
			
		||||
  #
 | 
			
		||||
  # *Note*
 | 
			
		||||
  # This method decodes MIME encoded strings and
 | 
			
		||||
  # converts halfwidth katakana to fullwidth katakana.
 | 
			
		||||
  # If you don't want it, use NKF.nkf('-w32xm0', str).
 | 
			
		||||
  def toutf32(str)
 | 
			
		||||
    ::NKF::nkf('-w32m', str)
 | 
			
		||||
  end
 | 
			
		||||
  module_function :toutf32
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # guess
 | 
			
		||||
  #
 | 
			
		||||
| 
						 | 
				
			
			@ -337,6 +355,17 @@ class String
 | 
			
		|||
  # If you don't want it, use NKF.nkf('-w16xm0', str).
 | 
			
		||||
  def toutf16; Kconv.toutf16(self) end
 | 
			
		||||
 | 
			
		||||
  # call-seq:
 | 
			
		||||
  #    String#toutf32   -> string
 | 
			
		||||
  #
 | 
			
		||||
  # Convert <code>self</code> to UTF-32
 | 
			
		||||
  #
 | 
			
		||||
  # *Note*
 | 
			
		||||
  # This method decodes MIME encoded strings and
 | 
			
		||||
  # converts halfwidth katakana to fullwidth katakana.
 | 
			
		||||
  # If you don't want it, use NKF.nkf('-w32xm0', str).
 | 
			
		||||
  def toutf32; Kconv.toutf32(self) end
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # is Encoding
 | 
			
		||||
  #
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -581,6 +581,8 @@ struct input_code input_code_list[] = {
 | 
			
		|||
    {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
 | 
			
		||||
#ifdef UTF8_INPUT_ENABLE
 | 
			
		||||
    {"UTF-8",     0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
 | 
			
		||||
    {"UTF-16",    0, 0, 0, {0, 0, 0},     NULL, w_iconv16, 0},
 | 
			
		||||
    {"UTF-32",    0, 0, 0, {0, 0, 0},     NULL, w_iconv32, 0},
 | 
			
		||||
#endif
 | 
			
		||||
    {0}
 | 
			
		||||
};
 | 
			
		||||
| 
						 | 
				
			
			@ -1293,6 +1295,15 @@ void options(unsigned char *cp)
 | 
			
		|||
			     strcmp(codeset, "UTF-16LE-BOM") == 0){
 | 
			
		||||
			input_f = UTF16_INPUT;
 | 
			
		||||
			input_endian = ENDIAN_LITTLE;
 | 
			
		||||
		    }else if(strcmp(codeset, "UTF-32") == 0 ||
 | 
			
		||||
			     strcmp(codeset, "UTF-32BE") == 0 ||
 | 
			
		||||
			     strcmp(codeset, "UTF-32BE-BOM") == 0){
 | 
			
		||||
			input_f = UTF32_INPUT;
 | 
			
		||||
			input_endian = ENDIAN_BIG;
 | 
			
		||||
		    }else if(strcmp(codeset, "UTF-32LE") == 0 ||
 | 
			
		||||
			     strcmp(codeset, "UTF-32LE-BOM") == 0){
 | 
			
		||||
			input_f = UTF32_INPUT;
 | 
			
		||||
			input_endian = ENDIAN_LITTLE;
 | 
			
		||||
#endif
 | 
			
		||||
		    }
 | 
			
		||||
                    continue;
 | 
			
		||||
| 
						 | 
				
			
			@ -1901,12 +1912,7 @@ void options(unsigned char *cp)
 | 
			
		|||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef ANSI_C_PROTOTYPE
 | 
			
		||||
struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
 | 
			
		||||
#else
 | 
			
		||||
struct input_code * find_inputcode_byfunc(iconv_func)
 | 
			
		||||
     nkf_char (*iconv_func)();
 | 
			
		||||
#endif
 | 
			
		||||
{
 | 
			
		||||
    if (iconv_func){
 | 
			
		||||
        struct input_code *p = input_code_list;
 | 
			
		||||
| 
						 | 
				
			
			@ -2227,6 +2233,12 @@ void code_status(nkf_char c)
 | 
			
		|||
    struct input_code *result = 0;
 | 
			
		||||
    struct input_code *p = input_code_list;
 | 
			
		||||
    while (p->name){
 | 
			
		||||
        if (!p->status_func) {
 | 
			
		||||
	    ++p;
 | 
			
		||||
	    continue;
 | 
			
		||||
	}
 | 
			
		||||
        if (!p->status_func)
 | 
			
		||||
	    continue;
 | 
			
		||||
        (p->status_func)(p, c);
 | 
			
		||||
        if (p->stat > 0){
 | 
			
		||||
            action_flag = 0;
 | 
			
		||||
| 
						 | 
				
			
			@ -2407,8 +2419,11 @@ void check_bom(FILE *f)
 | 
			
		|||
		    if(!input_f){
 | 
			
		||||
			set_iconv(TRUE, w_iconv32);
 | 
			
		||||
		    }
 | 
			
		||||
		    input_endian = ENDIAN_BIG;
 | 
			
		||||
		    return;
 | 
			
		||||
		    if (iconv == w_iconv32) {
 | 
			
		||||
			input_endian = ENDIAN_BIG;
 | 
			
		||||
			return;
 | 
			
		||||
		    }
 | 
			
		||||
		    (*i_ungetc)(0xFF,f);
 | 
			
		||||
		}else (*i_ungetc)(c2,f);
 | 
			
		||||
		(*i_ungetc)(0xFE,f);
 | 
			
		||||
	    }else if(c2 == 0xFF){
 | 
			
		||||
| 
						 | 
				
			
			@ -2416,8 +2431,11 @@ void check_bom(FILE *f)
 | 
			
		|||
		    if(!input_f){
 | 
			
		||||
			set_iconv(TRUE, w_iconv32);
 | 
			
		||||
		    }
 | 
			
		||||
		    input_endian = ENDIAN_2143;
 | 
			
		||||
		    return;
 | 
			
		||||
		    if (iconv == w_iconv32) {
 | 
			
		||||
			input_endian = ENDIAN_2143;
 | 
			
		||||
			return;
 | 
			
		||||
		    }
 | 
			
		||||
		    (*i_ungetc)(0xFF,f);
 | 
			
		||||
		}else (*i_ungetc)(c2,f);
 | 
			
		||||
		(*i_ungetc)(0xFF,f);
 | 
			
		||||
	    }else (*i_ungetc)(c2,f);
 | 
			
		||||
| 
						 | 
				
			
			@ -2431,7 +2449,10 @@ void check_bom(FILE *f)
 | 
			
		|||
		if(!input_f){
 | 
			
		||||
		    set_iconv(TRUE, w_iconv);
 | 
			
		||||
		}
 | 
			
		||||
		return;
 | 
			
		||||
		if (iconv == w_iconv) {
 | 
			
		||||
		    return;
 | 
			
		||||
		}
 | 
			
		||||
		(*i_ungetc)(0xBF,f);
 | 
			
		||||
	    }else (*i_ungetc)(c2,f);
 | 
			
		||||
	    (*i_ungetc)(0xBB,f);
 | 
			
		||||
	}else (*i_ungetc)(c2,f);
 | 
			
		||||
| 
						 | 
				
			
			@ -2444,16 +2465,22 @@ void check_bom(FILE *f)
 | 
			
		|||
		    if(!input_f){
 | 
			
		||||
			set_iconv(TRUE, w_iconv32);
 | 
			
		||||
		    }
 | 
			
		||||
		    input_endian = ENDIAN_3412;
 | 
			
		||||
		    return;
 | 
			
		||||
		    if (iconv == w_iconv32) {
 | 
			
		||||
			input_endian = ENDIAN_3412;
 | 
			
		||||
			return;
 | 
			
		||||
		    }
 | 
			
		||||
		    (*i_ungetc)(0x00,f);
 | 
			
		||||
		}else (*i_ungetc)(c2,f);
 | 
			
		||||
		(*i_ungetc)(0x00,f);
 | 
			
		||||
	    }else (*i_ungetc)(c2,f);
 | 
			
		||||
	    if(!input_f){
 | 
			
		||||
		set_iconv(TRUE, w_iconv16);
 | 
			
		||||
	    }
 | 
			
		||||
	    input_endian = ENDIAN_BIG;
 | 
			
		||||
	    return;
 | 
			
		||||
	    if (iconv == w_iconv16) {
 | 
			
		||||
		input_endian = ENDIAN_BIG;
 | 
			
		||||
		return;
 | 
			
		||||
	    }
 | 
			
		||||
	    (*i_ungetc)(0xFF,f);
 | 
			
		||||
	}else (*i_ungetc)(c2,f);
 | 
			
		||||
	(*i_ungetc)(0xFE,f);
 | 
			
		||||
	break;
 | 
			
		||||
| 
						 | 
				
			
			@ -2464,16 +2491,22 @@ void check_bom(FILE *f)
 | 
			
		|||
		    if(!input_f){
 | 
			
		||||
			set_iconv(TRUE, w_iconv32);
 | 
			
		||||
		    }
 | 
			
		||||
		    input_endian = ENDIAN_LITTLE;
 | 
			
		||||
		    return;
 | 
			
		||||
		    if (iconv == w_iconv32) {
 | 
			
		||||
			input_endian = ENDIAN_LITTLE;
 | 
			
		||||
			return;
 | 
			
		||||
		    }
 | 
			
		||||
		    (*i_ungetc)(0x00,f);
 | 
			
		||||
		}else (*i_ungetc)(c2,f);
 | 
			
		||||
		(*i_ungetc)(0x00,f);
 | 
			
		||||
	    }else (*i_ungetc)(c2,f);
 | 
			
		||||
	    if(!input_f){
 | 
			
		||||
		set_iconv(TRUE, w_iconv16);
 | 
			
		||||
	    }
 | 
			
		||||
	    input_endian = ENDIAN_LITTLE;
 | 
			
		||||
	    return;
 | 
			
		||||
	    if (iconv == w_iconv16) {
 | 
			
		||||
		input_endian = ENDIAN_LITTLE;
 | 
			
		||||
		return;
 | 
			
		||||
	    }
 | 
			
		||||
	    (*i_ungetc)(0xFE,f);
 | 
			
		||||
	}else (*i_ungetc)(c2,f);
 | 
			
		||||
	(*i_ungetc)(0xFF,f);
 | 
			
		||||
	break;
 | 
			
		||||
| 
						 | 
				
			
			@ -2557,21 +2590,21 @@ nkf_char kanji_convert(FILE *f)
 | 
			
		|||
				c0 <<= 8;
 | 
			
		||||
				if ((c3 = (*i_getc)(f)) != EOF) {
 | 
			
		||||
				    c0 |= c3;
 | 
			
		||||
				} else c1 = EOF;
 | 
			
		||||
			    } else c1 = EOF;
 | 
			
		||||
				} else c2 = EOF;
 | 
			
		||||
			    } else c2 = EOF;
 | 
			
		||||
			}
 | 
			
		||||
		    }
 | 
			
		||||
		    } else c2 = EOF;
 | 
			
		||||
		} else {
 | 
			
		||||
		    if ((c2 = (*i_getc)(f)) != EOF) {
 | 
			
		||||
			if (0xD8 <= c2 && c2 <= 0xDB) {
 | 
			
		||||
			    if ((c3 = (*i_getc)(f)) != EOF) {
 | 
			
		||||
				c3 <<= 8;
 | 
			
		||||
				if ((c0 = (*i_getc)(f)) != EOF) {
 | 
			
		||||
				    c0 <<= 8;
 | 
			
		||||
				    c0 |= c3;
 | 
			
		||||
				} else c1 = EOF;
 | 
			
		||||
			    } else c1 = EOF;
 | 
			
		||||
				} else c2 = EOF;
 | 
			
		||||
			    } else c2 = EOF;
 | 
			
		||||
			}
 | 
			
		||||
		    } else c1 = EOF;
 | 
			
		||||
		    } else c2 = EOF;
 | 
			
		||||
		}
 | 
			
		||||
		SEND;
 | 
			
		||||
            } else if(iconv == w_iconv32){
 | 
			
		||||
| 
						 | 
				
			
			@ -2595,7 +2628,7 @@ nkf_char kanji_convert(FILE *f)
 | 
			
		|||
		    }
 | 
			
		||||
		    c2 = 0;
 | 
			
		||||
		}else{
 | 
			
		||||
		    c1 = EOF;
 | 
			
		||||
		    c2 = EOF;
 | 
			
		||||
		}
 | 
			
		||||
		SEND;
 | 
			
		||||
            } else
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -306,6 +306,8 @@ rb_nkf_guess1(VALUE obj, VALUE src)
 | 
			
		|||
 *       "UTF-8"
 | 
			
		||||
 *     when NKF::UTF16
 | 
			
		||||
 *       "UTF-16"
 | 
			
		||||
 *     when NKF::UTF32
 | 
			
		||||
 *       "UTF-32"
 | 
			
		||||
 *     when NKF::UNKNOWN
 | 
			
		||||
 *       "UNKNOWN"
 | 
			
		||||
 *     when NKF::BINARY
 | 
			
		||||
| 
						 | 
				
			
			@ -345,6 +347,8 @@ rb_nkf_guess2(VALUE obj, VALUE src)
 | 
			
		|||
      code = _UTF8;
 | 
			
		||||
    } else if (strcmp(input_codename, "UTF-16") == 0) {
 | 
			
		||||
      code = _UTF16;
 | 
			
		||||
    } else if (strcmp(input_codename, "UTF-32") == 0) {
 | 
			
		||||
      code = _UTF32;
 | 
			
		||||
    } else if (strlen(input_codename) > 0) {
 | 
			
		||||
      code = _UNKNOWN;
 | 
			
		||||
    }
 | 
			
		||||
| 
						 | 
				
			
			@ -382,16 +386,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
 | 
			
		|||
 *
 | 
			
		||||
 *  Output is buffered (DEFAULT), Output is unbuffered.
 | 
			
		||||
 *
 | 
			
		||||
 *  === -j -s -e -w -w16
 | 
			
		||||
 *  === -j -s -e -w -w16 -w32
 | 
			
		||||
 *
 | 
			
		||||
 *  Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
 | 
			
		||||
 *  UTF-8N, UTF-16BE.
 | 
			
		||||
 *  UTF-8N, UTF-16BE, UTF-32BE.
 | 
			
		||||
 *  Without this option and compile option, ISO-2022-JP is assumed.
 | 
			
		||||
 *
 | 
			
		||||
 *  === -J -S -E -W -W16
 | 
			
		||||
 *  === -J -S -E -W -W16 -W32
 | 
			
		||||
 *
 | 
			
		||||
 *  Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
 | 
			
		||||
 *  UTF-8, UTF-16LE.
 | 
			
		||||
 *  UTF-8, UTF-16, UTF-32.
 | 
			
		||||
 *
 | 
			
		||||
 *  ==== -J
 | 
			
		||||
 *
 | 
			
		||||
| 
						 | 
				
			
			@ -574,6 +578,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
 | 
			
		|||
 *
 | 
			
		||||
 *  [UTF-16LE-BOM] UTF-16 Little Endian with BOM
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF-32] same as UTF-32BE
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF-32BE] UTF-32 Big Endian without BOM
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF-32BE-BOM] UTF-32 Big Endian with BOM
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF-32LE] UTF-32 Little Endian without BOM
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF-32LE-BOM] UTF-32 Little Endian with BOM
 | 
			
		||||
 *
 | 
			
		||||
 *  [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
 | 
			
		||||
 *
 | 
			
		||||
 *  === --fb-{skip, html, xml, perl, java, subchar}
 | 
			
		||||
| 
						 | 
				
			
			@ -587,10 +601,20 @@ rb_nkf_guess2(VALUE obj, VALUE src)
 | 
			
		|||
 *  nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
 | 
			
		||||
 *  1st byte of argument is the escape character and following bytes are target characters.
 | 
			
		||||
 *
 | 
			
		||||
 *  === --disable-cp932ext
 | 
			
		||||
 *  === --no-cp932ext
 | 
			
		||||
 *
 | 
			
		||||
 *  Handle the characters extended in CP932 as unassigned characters.
 | 
			
		||||
 *
 | 
			
		||||
 *  === --no-best-fit-chars
 | 
			
		||||
 *
 | 
			
		||||
 *  When Unicode to Encoded byte conversion,
 | 
			
		||||
 *  don't convert characters which are not round trip safe.
 | 
			
		||||
 *  When Unicode to Unicode conversion,
 | 
			
		||||
 *  with this and -x option, nkf can be used as UTF converter.
 | 
			
		||||
 *  (In other words, without this and -x option, nkf doesn't save some characters)
 | 
			
		||||
 *
 | 
			
		||||
 *  When nkf converts a string related to a path, you should use this option.
 | 
			
		||||
 *
 | 
			
		||||
 *  === --cap-input
 | 
			
		||||
 *
 | 
			
		||||
 *  Decode hex encoded characters.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue