1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/enc/trans/utf8_mac.trans
naruse d0a4f8ada9 * enc/trans/utf8_mac.trans: Add converter for UTF8-MAC.
* enc/trans/utf8_mac-tbl.rb: ditto.

* test/ruby/test_econv.rb: tests for above.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@23296 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2009-04-26 14:21:43 +00:00

257 lines
6.1 KiB
Text

#include "transcode_data.h"
<%
require 'utf8_mac-tbl'
def hexstr(str)
str.unpack("H*")[0]
end
transcode_tblgen("UTF-8", "UTF8-MAC", [
*MAC_DECOMPOSE_TBL.map{|k,v|[hexstr(k), hexstr(v)]},
["{00-7F}", :nomap],
["{c2-df}{80-bf}", :asis],
["e0{a0-bf}{80-bf}", :asis],
["{e1-ec}{80-bf}{80-bf}", :asis],
["ed{80-9f}{80-bf}", :asis],
["{ee-ef}{80-bf}{80-bf}", :asis],
["f0{90-bf}{80-bf}{80-bf}", :asis],
["{f1-f3}{80-bf}{80-bf}{80-bf}", :asis],
])
map = {}
map["{00-7f}"] = :func_so
map["{c2-df}{80-bf}"] = :func_so
map["e0{a0-bf}{80-bf}"] = :func_so
map["{e1-ec}{80-bf}{80-bf}"] = :func_so
map["ed{80-9f}{80-bf}"] = :func_so
map["{ee-ef}{80-bf}{80-bf}"] = :func_so
map["f0{90-bf}{80-bf}{80-bf}"] = :func_so
map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so
map["f4{80-8f}{80-bf}{80-bf}"] = :func_so
transcode_generate_node(ActionMap.parse(map), "from_UTF8_MAC")
map = Hash[*MAC_DECOMPOSE_TBL.select{|k,v|v.length == 3}.
map{|k,v|[hexstr(v), hexstr(k)]}.flatten]
transcode_generate_node(ActionMap.parse(map), "from_utf8_mac_nfc3")
map = Hash[*MAC_DECOMPOSE_TBL.select{|k,v|v.length == 2}.
map{|k,v|[hexstr(v), hexstr(k)]}.flatten]
transcode_generate_node(ActionMap.parse(map), "from_utf8_mac_nfc2")
%>
<%= transcode_generated_code %>
/*
 * Helpers for walking the generated lookup tables by hand.
 * NOTE: BL_BASE and BL_INFO (and everything built on them) expand to an
 * expression that reads a variable named `next_info`, so they can only be
 * used where such a variable is in scope.
 */
/* Address of entry `index` in the generated byte array. */
#define BYTE_ADDR(index) (<%= OUTPUT_PREFIX %>byte_array + (index))
/* Address of the word-array entry selected by INFO2WORDINDEX(index). */
#define WORD_ADDR(index) (<%= OUTPUT_PREFIX %>word_array + INFO2WORDINDEX(index))
/* Byte table and info table of the lookup node that next_info refers to. */
#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_info)))
#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_info)))
/* The first two bytes of the byte table hold the accepted byte range. */
#define BL_MIN_BYTE (BL_BASE[0])
#define BL_MAX_BYTE (BL_BASE[1])
/* Per-byte offsets follow the two range bytes, indexed from BL_MIN_BYTE. */
#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
/* Info-table entry for `byte`; caller must range-check `byte` first. */
#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
/* Capacity, in bytes, of the ring buffer kept in the converter state. */
#define STATUS_BUF_SIZE 16

/*
 * Converter state for UTF8-MAC -> UTF-8: a small ring buffer of pending
 * bytes plus a count of the characters they represent.
 */
struct from_utf8_mac_status {
    unsigned char buf[STATUS_BUF_SIZE]; /* ring storage for pending bytes */
    int beg;                            /* index of the oldest pending byte */
    int end;                            /* index one past the newest pending byte */
    int len;                            /* pending characters (buf_push adds one per call) */
};

/*
 * Number of buffered characters.  The argument is parenthesized so that
 * any pointer expression (e.g. `&state`) expands correctly.
 */
#define buf_length(sp) ((sp)->len)
/*
 * Number of bytes currently held in the ring buffer, i.e. the distance
 * from beg to end modulo the buffer capacity.
 */
int
buf_bytesize(struct from_utf8_mac_status *sp)
{
    return (sp->end - sp->beg + STATUS_BUF_SIZE) % STATUS_BUF_SIZE;
}
/*
 * Append the l bytes at p to the ring buffer and count them as one
 * character.  No check is made for overrunning the buffer capacity.
 */
void
buf_push(struct from_utf8_mac_status *sp, const unsigned char *p, ssize_t l)
{
    ssize_t i;

    for (i = 0; i < l; i++) {
        sp->buf[sp->end] = p[i];
        sp->end = (sp->end + 1) % STATUS_BUF_SIZE;
    }
    sp->len += 1;
}
/*
 * Remove and return the oldest byte of the ring buffer.  The character
 * count drops by one whenever the removed byte is not a UTF-8
 * continuation byte (10xxxxxx).
 */
unsigned char
buf_shift(struct from_utf8_mac_status *sp)
{
    unsigned char head = sp->buf[sp->beg];
    int is_continuation = (head & 0xC0) == 0x80;

    sp->beg = (sp->beg + 1) % STATUS_BUF_SIZE;
    if (!is_continuation) {
        sp->len -= 1;
    }
    return head;
}
/*
 * Discard bytes from the front of the ring buffer: at least one byte when
 * the buffer is non-empty, then further bytes for as long as the next one
 * is a UTF-8 continuation byte and the buffer is not exhausted.
 */
void
buf_shift_char(struct from_utf8_mac_status *sp)
{
    if (sp->beg == sp->end) {
        return;
    }
    do {
        buf_shift(sp);
    } while ((sp->buf[sp->beg] & 0xC0) == 0x80 && sp->beg != sp->end);
}
/* Reset the ring buffer to empty: no bytes, no characters. */
void
buf_clear(struct from_utf8_mac_status *sp)
{
    sp->len = 0;
    sp->end = 0;
    sp->beg = 0;
}
/*
 * Peek at the byte `pos` positions after the front of the ring buffer
 * without removing anything.
 */
unsigned char
buf_at(struct from_utf8_mac_status *sp, int pos)
{
    int slot = (sp->beg + pos) % STATUS_BUF_SIZE;

    return sp->buf[slot];
}
/*
 * Move bytes from the front of the ring buffer into o: at least one byte
 * when the buffer is non-empty, then further bytes while the next one is a
 * UTF-8 continuation byte and the buffer is not exhausted.
 * Returns the number of bytes written to o.
 */
int
buf_output_char(struct from_utf8_mac_status *sp, unsigned char *o)
{
    int written = 0;

    if (sp->beg == sp->end) {
        return written;
    }
    do {
        o[written] = buf_shift(sp);
        written++;
    } while ((sp->buf[sp->beg] & 0xC0) == 0x80 && sp->beg != sp->end);
    return written;
}
/*
 * Drain the whole ring buffer into o, oldest byte first.
 * Returns the number of bytes written to o.
 */
int
buf_output_all(struct from_utf8_mac_status *sp, unsigned char *o)
{
    int written;

    for (written = 0; sp->beg != sp->end; written++) {
        o[written] = buf_shift(sp);
    }
    return written;
}
/*
 * Walk a generated lookup tree over the bytes currently buffered in sp,
 * starting from the node given in next_info.
 *
 * Each buffered byte selects the next entry; the walk stops at the first
 * entry whose low two bits are non-zero (presumably a terminal action
 * rather than another lookup node -- see transcode_data.h) or when the
 * buffered bytes run out.  A byte outside the node's accepted range yields
 * INVALID.  Returns the last entry reached.
 *
 * The parameter must be named next_info: the BL_* macros refer to it.
 */
VALUE
get_info(VALUE next_info, struct from_utf8_mac_status *sp)
{
    int pos;

    for (pos = 0; pos < buf_bytesize(sp); pos++) {
        unsigned char next_byte = buf_at(sp, pos);

        if (BL_MIN_BYTE <= next_byte && next_byte <= BL_MAX_BYTE) {
            next_info = (VALUE)BL_ACTION(next_byte);
        }
        else {
            next_info = INVALID;
        }
        if ((next_info & 3) != 0) {
            break;
        }
    }
    return next_info;
}
/*
 * Try to replace the characters at the front of the buffer with a single
 * character looked up in a reverse table.
 *
 * mode: 3 looks the buffered bytes up in from_utf8_mac_nfc3, any other
 *       value in from_utf8_mac_nfc2.
 * sp:   converter state holding the buffered bytes.
 * o:    output area; must have room for up to three bytes.
 *
 * Returns the number of bytes written to o (2 or 3), or 0 when the lookup
 * did not end in a two- or three-byte result, in which case the buffer is
 * left untouched.  On success the buffer is cleared in mode 3; otherwise
 * two characters are dropped from its front.
 */
int
buf_apply(int mode, struct from_utf8_mac_status *sp, unsigned char *o)
{
    int n = 0;
    VALUE next_info = mode == 3 ? from_utf8_mac_nfc3 : from_utf8_mac_nfc2;
    unsigned int kind;

    next_info = get_info(next_info, sp);
    kind = (unsigned int)(next_info & 0x1F);
    switch (kind) {
      case THREEbt:
      case TWObt:
        /*
         * The payload always starts at byte 1: a two-byte result is
         * BT1,BT2 and a three-byte result is BT1,BT2,BT3.  (Falling
         * through from THREEbt into a BT2,BT3 tail emitted the wrong
         * bytes for two-byte results.)
         */
        o[n++] = getBT1(next_info);
        o[n++] = getBT2(next_info);
        if (kind == THREEbt) {
            o[n++] = getBT3(next_info);
        }
        if (mode == 3) {
            buf_clear(sp);
        }
        else {
            buf_shift_char(sp);
            buf_shift_char(sp);
        }
        break;
      default:
        return 0;
    }
    return n;
}
/*
 * State callback: treats statep as a struct from_utf8_mac_status and
 * empties its buffer.  Always returns 0.
 */
static int
from_utf8_mac_init(void *statep)
{
    buf_clear((struct from_utf8_mac_status *)statep);
    return 0;
}
/*
 * Flush everything still buffered in the converter state.
 *
 * First tries to compose the leading buffered characters via the
 * two-character table, then copies whatever remains verbatim.
 *
 * statep: converter state (struct from_utf8_mac_status).
 * o:      output area.
 * osize:  size of the output area (not consulted here).
 *
 * Returns the number of bytes written to o; 0 when nothing was buffered.
 */
static ssize_t
from_utf8_mac_finish(void *statep,
        unsigned char *o, size_t osize)
{
    struct from_utf8_mac_status *sp = statep;
    int n;

    if (buf_length(sp) == 0) return 0;
    /*
     * The two steps must run in this order, and the flush must append
     * after the composed bytes.  Doing both inside one `a + b` expression
     * left their order unspecified and made both write at o[0].
     */
    n = buf_apply(2, sp, o);
    n += buf_output_all(sp, o + n);
    return n;
}
/*
 * Per-character conversion callback for UTF8-MAC -> UTF-8.
 *
 * s/l is one input character (1 to 4 bytes); o/osize is the output area.
 * Returns the number of bytes written to o.
 *
 *  - 4-byte characters, and the 3-byte sequences E3 82 99 / E3 82 9A
 *    (U+3099 / U+309A), flush the pending buffer and are then copied to the
 *    output unchanged without being buffered.
 *  - A 1-byte character flushes the pending buffer and is then buffered.
 *  - Everything else is simply buffered.
 *
 * Once three characters are buffered, a three-character composition is
 * attempted, then a two-character one; failing both, the oldest buffered
 * character is emitted as is.
 */
static ssize_t
fun_so_from_utf8_mac(void *statep,
        const unsigned char *s, size_t l,
        unsigned char *o, size_t osize)
{
    struct from_utf8_mac_status *sp = statep;
    int n = 0;
    int emit_directly = 0;

    if (l == 4) {
        emit_directly = 1;
    }
    else if (l == 3) {
        emit_directly = s[0] == 0xE3 && s[1] == 0x82 &&
                        (s[2] == 0x99 || s[2] == 0x9A);
    }

    if (emit_directly) {
        size_t i;

        n = from_utf8_mac_finish(sp, o, osize);
        for (i = 0; i < l; i++) {
            o[n++] = s[i];
        }
        return n;
    }

    if (l == 1) {
        n = from_utf8_mac_finish(sp, o, osize);
    }

    buf_push(sp, s, l);
    if (buf_length(sp) < 3) {
        return n;
    }

    n = buf_apply(3, sp, o);
    if (n <= 0) {
        n = buf_apply(2, sp, o);
    }
    if (n > 0) {
        return n;
    }
    return buf_output_char(sp, o);
}
/*
 * Transcoder descriptor for UTF8-MAC -> UTF-8, registered by
 * Init_utf8_mac().  Conversion starts at the generated from_UTF8_MAC tree
 * and uses fun_so_from_utf8_mac together with from_utf8_mac_finish.
 * NOTE(review): the roles of the unlabeled initializers are inferred from
 * their position; confirm against the rb_transcoder declaration.
 */
static const rb_transcoder
rb_from_utf8_mac = {
"UTF8-MAC", "UTF-8", from_UTF8_MAC,
TRANSCODE_TABLE_INFO,
1, /* input_unit_length */
4, /* max_input */
10, /* max_output */
asciicompat_encoder, /* asciicompat_type */
/* State size, then from_utf8_mac_init passed twice (presumably as both the
   state-init and state-fini hooks -- TODO confirm). */
sizeof(struct from_utf8_mac_status), from_utf8_mac_init, from_utf8_mac_init,
/* Only the last of these four callback slots is used. */
NULL, NULL, NULL, fun_so_from_utf8_mac,
/* Called to flush bytes still buffered in the state. */
from_utf8_mac_finish
};
/*
 * Initialization function for this transcoder library: emits the
 * registration code produced by the table generator
 * (transcode_register_code), then registers the hand-written
 * UTF8-MAC -> UTF-8 transcoder defined in this file.
 */
void
Init_utf8_mac(void)
{
<%= transcode_register_code %>
rb_register_transcoder(&rb_from_utf8_mac);
}