1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/lib/xsd/charset.rb

179 lines
5.3 KiB
Ruby
Raw Normal View History

=begin
XSD4R - Charset handling library.
Copyright (C) 2001, 2003 NAKAMURA, Hiroshi.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PRATICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 675 Mass
Ave, Cambridge, MA 02139, USA.
=end
module XSD
module Charset
@encoding = $KCODE
class XSDError < StandardError; end
class CharsetError < XSDError; end
class UnknownCharsetError < CharsetError; end
class CharsetConversionError < CharsetError; end
public
###
## Maps
#
EncodingConvertMap = {}
def Charset.init
begin
require 'xsd/iconvcharset'
@encoding = 'UTF8'
EncodingConvertMap[['UTF8', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
EncodingConvertMap[['EUC' , 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("shift-jis", "euc-jp", str) }
* gc.c (Init_stack): stack region is far smaller than usual if pthread is used. * marshal.c (w_extended): singleton methods should not be checked when dumping via marshal_dump() or _dump(). [ruby-talk:85909] * file.c (getcwdofdrv): avoid using getcwd() directly, use my_getcwd() instead. * merged NeXT, OpenStep, Rhapsody ports patch from Eric Sunshine <sunshine@sunshineco.com>. [ruby-core:01596] * marshal.c (w_object): LINK check earlier than anything else, i.e. do not dump TYPE_IVAR for already dumped objects. (ruby-bugs PR#1220) * eval.c (rb_eval): call "inherited" only when a new class is generated; not on reopening. * eval.c (eval): prepend error position in evaluating string to * configure.in: revived NextStep, OpenStep, and Rhapsody ports which had become unbuildable; enhanced --enable-fat-binary option so that it accepts a list of desired architectures (rather than assuming a fixed list), or defaults to a platform-appropriate list if user does not provide an explicit list; made the default list of architectures for MAB (fat binary) more comprehensive; now uses -fno-common even when building the interpreter (in addition to using it for extensions), thus allowing the interpreter to be embedded into a plugin module of an external project (in addition to allowing embedding directly into an application); added checks for <netinet/in_systm.h> (needed by `socket' extension) and getcwd(); now ensures that -I/usr/local/include is employed when extensions' extconf.rb scripts invoke have_header() since extension checks on NextStep and OpenStep will fail without it if the desired resource resides in the /usr/local tree; fixed formatting of --help message. * Makefile.in: $(LIBRUBY_A) rule now deletes the archive before invoking $(AR) since `ar' on Apple/NeXT can not "update" MAB archives (see configure's --enable-fat-binary option); added rule for new missing/getcwd.c. * defines.h: fixed endian handling during MAB build (see configure's --enable-fat-binary option) to ensure that all portions of the project see the correct WORDS_BIGENDIAN value (some extension modules were getting the wrong endian setting); added missing constants GETPGRP_VOID, WNOHANG, WUNTRACED, X_OK, and type pid_t for NextStep and OpenStep; removed unnecessary and problematic HAVE_SYS_WAIT_H define in NeXT section. * dir.c: do not allow NAMLEN() macro to trust dirent::d_namlen on NextStep since, on some installations, this value always resolves uselessly to zero. * dln.c: added error reporting to NextStep extension loader since the previous behavior of failing silently was not useful; now ensures that NSLINKMODULE_OPTION_BINDNOW compatibility constant is defined for OpenStep and Rhapsody; no longer includes <mach-o/dyld.h> twice on Rhapsody since this header lacks multiple-include protection, which resulted in "redefinition" compilation errors. * main.c: also create hard reference to objc_msgSend() on NeXT platforms (in addition to Apple platforms). * lib/mkmf.rb: now exports XCFLAGS from configure script to extension makefiles so that extensions can be built MAB (see configure's --enable-fat-binary option); also utilize XCFLAGS in cc_command() (but not cpp_command() because MAB flags are incompatible with direct invocation of `cpp'). * ext/curses/extconf.rb: now additionally checks for presence of these curses functions which are not present on NextStep or Openstep: bkgd(), bkgdset(), color(), curs(), getbkgd(), init(), scrl(), set(), setscrreg(), wattroff(), wattron(), wattrset(), wbkgd(), wbkgdset(), wscrl(), wsetscrreg() * ext/curses/curses.c: added appropriate #ifdef's for additional set of curses functions now checked by extconf.rb; fixed curses_bkgd() and window_bkgd() to correctly return boolean result rather than numeric result; fixed window_getbkgd() to correctly signal an error by returning nil rather than -1. * ext/etc/etc.c: setup_passwd() and setup_group() now check for null pointers before invoking rb_tainted_str_new2() upon fields extracted from `struct passwd' and `struct group' since null pointers in some fields are common on NextStep/OpenStep (especially so for the `pw_comment' field) and rb_tainted_str_new2() throws an exception when it receives a null pointer. * ext/pty/pty.c: include "util.h" for strdup()/ruby_strdup() for platforms such as NextStep and OpenStep which lack strdup(). * ext/socket/getaddrinfo.c: cast first argument of getservbyname(), gethostbyaddr(), and gethostbyname() from (const char*) to non-const (char*) for older platforms such as NextStep and OpenStep. * ext/socket/socket.c: include "util.h" for strdup()/ruby_strdup() for platforms such as NextStep and OpenStep which lack strdup(); include <netinet/in_systm.h> if present for NextStep and OpenStep; cast first argument of gethostbyaddr() and getservbyname() from (const char*) to non-const (char*) for older platforms. * ext/syslog/syslog.c: include "util.h" for strdup()/ruby_strdup() for platforms such as NextStep and OpenStep which lack strdup(). git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@5002 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2003-11-21 23:00:03 -05:00
if /(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM
EncodingConvertMap[['UTF8', 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("cp932", "utf-8", str) }
EncodingConvertMap[['SJIS', 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "cp932", str) }
EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "cp932", str) }
else
EncodingConvertMap[['UTF8', 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("shift-jis", "utf-8", str) }
EncodingConvertMap[['SJIS', 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "shift-jis", str) }
EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "shift-jis", str) }
end
rescue LoadError
begin
require 'nkf'
EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| NKF.nkf('-sXm0', str) }
EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| NKF.nkf('-eXm0', str) }
rescue LoadError
end
begin
require 'uconv'
@encoding = 'UTF8'
EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
rescue LoadError
end
end
end
self.init
CharsetMap = {
'NONE' => 'us-ascii',
'EUC' => 'euc-jp',
'SJIS' => 'shift_jis',
'UTF8' => 'utf-8',
}
###
## handlers
#
def Charset.encoding
@encoding
end
def Charset.encoding_label
charset_label(@encoding)
end
def Charset.encoding_to_xml(str, charset)
encoding_conv(str, @encoding, charset_str(charset))
end
def Charset.encoding_from_xml(str, charset)
encoding_conv(str, charset_str(charset), @encoding)
end
def Charset.encoding_conv(str, enc_from, enc_to)
if enc_from == enc_to or enc_from == 'NONE' or enc_to == 'NONE'
str
elsif converter = EncodingConvertMap[[enc_from, enc_to]]
converter.call(str)
else
raise CharsetConversionError.new(
"Converter not found: #{ enc_from } -> #{ enc_to }")
end
end
def Charset.charset_label(encoding)
CharsetMap[encoding.upcase]
end
def Charset.charset_str(label)
CharsetMap.index(label.downcase)
end
# us_ascii = '[\x00-\x7F]'
us_ascii = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted.
USASCIIRegexp = Regexp.new("\\A#{ us_ascii }*\\z", nil, "NONE")
twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
character_euc = "(?:#{ us_ascii }|#{ twobytes_euc }|#{ threebytes_euc })"
EUCRegexp = Regexp.new("\\A#{ character_euc }*\\z", nil, "NONE")
# onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]' # XML 1.0 restricted.
twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
character_sjis = "(?:#{ onebyte_sjis }|#{ twobytes_sjis })"
SJISRegexp = Regexp.new("\\A#{ character_sjis }*\\z", nil, "NONE")
# 0xxxxxxx
# 110yyyyy 10xxxxxx
twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
# 1110zzzz 10yyyyyy 10xxxxxx
threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
# 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
character_utf8 = "(?:#{ us_ascii }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })"
UTF8Regexp = Regexp.new("\\A#{ character_utf8 }*\\z", nil, "NONE")
def Charset.is_us_ascii(str)
USASCIIRegexp =~ str
end
def Charset.is_utf8(str)
UTF8Regexp =~ str
end
def Charset.is_euc(str)
EUCRegexp =~ str
end
def Charset.is_sjis(str)
SJISRegexp =~ str
end
def Charset.is_ces(str, code = $KCODE)
case code
when 'NONE'
is_us_ascii(str)
when 'UTF8'
is_utf8(str)
when 'EUC'
is_euc(str)
when 'SJIS'
is_sjis(str)
else
raise UnknownCharsetError.new("Unknown charset: #{ code }")
end
end
end
end