mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
2cbc7b69b7
problem. StringValueCStr modifies its argument, and the modified argument must be preserved while the string that StringValueCStr returns is in use. Since the string is used by the caller, the modified argument should be held by the caller. Actually GC.stress = true def (o=Object.new).to_str() "universal"+"_newline" end "\u3042".encode(o, "")' causes a curious warning: rb_define_const: invalid name `' for constant git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19408 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
4012 lines
119 KiB
C
4012 lines
119 KiB
C
/**********************************************************************
|
|
|
|
transcode.c -
|
|
|
|
$Author$
|
|
created at: Tue Oct 30 16:10:22 JST 2007
|
|
|
|
Copyright (C) 2007 Martin Duerst
|
|
|
|
**********************************************************************/
|
|
|
|
#include "ruby/ruby.h"
|
|
#include "ruby/encoding.h"
|
|
#include "transcode_data.h"
|
|
#include <ctype.h>
|
|
|
|
/* Non-static VALUE slots (visible to other translation units). */
VALUE rb_eConversionUndefinedError;
VALUE rb_eInvalidByteSequenceError;
VALUE rb_eNoConverterError;

VALUE rb_cEncodingConverter;

/*
 * File-local VALUE slots.
 * NOTE(review): judging by the names these hold Symbols for option keys
 * and option values -- confirm against the initialization code.
 */
static VALUE sym_invalid;
static VALUE sym_undef;
static VALUE sym_ignore;
static VALUE sym_replace;
static VALUE sym_xml;
static VALUE sym_text;
static VALUE sym_attr;
static VALUE sym_universal_newline;
static VALUE sym_crlf_newline;
static VALUE sym_cr_newline;
static VALUE sym_partial_input;

/* NOTE(review): names mirror the rb_econv_result_t codes -- confirm. */
static VALUE sym_invalid_byte_sequence;
static VALUE sym_undefined_conversion;
static VALUE sym_destination_buffer_full;
static VALUE sym_source_buffer_empty;
static VALUE sym_finished;
static VALUE sym_after_output;
static VALUE sym_incomplete_input;
|
|
|
|
/* Forward declaration. */
static unsigned char *allocate_converted_string(
    const char *sname, const char *dname,
    const unsigned char *str, size_t len,
    unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
    size_t *dst_len_ptr);
|
|
|
|
/* dynamic structure, one per conversion (similar to iconv_t) */
|
|
/* may carry conversion state (e.g. for iso-2022-jp) */
|
|
typedef struct rb_transcoding {
|
|
const rb_transcoder *transcoder;
|
|
|
|
int flags;
|
|
|
|
int resume_position;
|
|
unsigned int next_table;
|
|
VALUE next_info;
|
|
unsigned char next_byte;
|
|
unsigned int output_index;
|
|
|
|
int recognized_len; /* already interpreted */
|
|
int readagain_len; /* not yet interpreted */
|
|
union {
|
|
unsigned char ary[8]; /* max_input <= sizeof(ary) */
|
|
unsigned char *ptr; /* length: max_input */
|
|
} readbuf; /* recognized_len + readagain_len used */
|
|
|
|
int writebuf_off;
|
|
int writebuf_len;
|
|
union {
|
|
unsigned char ary[8]; /* max_output <= sizeof(ary) */
|
|
unsigned char *ptr; /* length: max_output */
|
|
} writebuf;
|
|
|
|
union rb_transcoding_state_t { /* opaque data for stateful encoding */
|
|
void *ptr;
|
|
double dummy_for_alignment;
|
|
} state;
|
|
} rb_transcoding;
|
|
/*
 * Accessors for the embedded-or-heap buffers of rb_transcoding.
 * Each one picks the inline array when the transcoder's declared maximum
 * fits into it, and the separately allocated pointer otherwise.
 */
#define TRANSCODING_READBUF(tc) \
    ((tc)->transcoder->max_input <= sizeof((tc)->readbuf.ary) ? \
     (tc)->readbuf.ary : \
     (tc)->readbuf.ptr)

#define TRANSCODING_WRITEBUF(tc) \
    ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \
     (tc)->writebuf.ary : \
     (tc)->writebuf.ptr)

/* Capacity, in bytes, of the buffer TRANSCODING_WRITEBUF selects. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((tc)->transcoder->max_output <= sizeof((tc)->writebuf.ary) ? \
     sizeof((tc)->writebuf.ary) : \
     (tc)->transcoder->max_output)

/* Largest state_size that is stored inline in the state union. */
#define TRANSCODING_STATE_EMBED_MAX sizeof(union rb_transcoding_state_t)

/* Address of the transcoder-private state (inline or heap). */
#define TRANSCODING_STATE(tc) \
    ((tc)->transcoder->state_size <= sizeof((tc)->state) ? \
     (void *)&(tc)->state : \
     (tc)->state.ptr)
|
|
|
|
typedef struct {
|
|
struct rb_transcoding *tc;
|
|
unsigned char *out_buf_start;
|
|
unsigned char *out_data_start;
|
|
unsigned char *out_data_end;
|
|
unsigned char *out_buf_end;
|
|
rb_econv_result_t last_result;
|
|
} rb_econv_elem_t;
|
|
|
|
struct rb_econv_t {
|
|
int flags;
|
|
const char *source_encoding_name;
|
|
const char *destination_encoding_name;
|
|
|
|
int started;
|
|
|
|
const unsigned char *replacement_str;
|
|
size_t replacement_len;
|
|
const char *replacement_enc;
|
|
int replacement_allocated;
|
|
|
|
unsigned char *in_buf_start;
|
|
unsigned char *in_data_start;
|
|
unsigned char *in_data_end;
|
|
unsigned char *in_buf_end;
|
|
rb_econv_elem_t *elems;
|
|
int num_allocated;
|
|
int num_trans;
|
|
int num_finished;
|
|
struct rb_transcoding *last_tc;
|
|
|
|
/* last error */
|
|
struct {
|
|
rb_econv_result_t result;
|
|
struct rb_transcoding *error_tc;
|
|
const char *source_encoding;
|
|
const char *destination_encoding;
|
|
const unsigned char *error_bytes_start;
|
|
size_t error_bytes_len;
|
|
size_t readagain_len;
|
|
} last_error;
|
|
|
|
/* The following fields are only for Encoding::Converter.
|
|
* rb_econv_open set them NULL. */
|
|
rb_encoding *source_encoding;
|
|
rb_encoding *destination_encoding;
|
|
};
|
|
|
|
/*
|
|
* Dispatch data and logic
|
|
*/
|
|
|
|
#define DECORATOR_P(sname, dname) (*(sname) == '\0')
|
|
|
|
typedef struct {
|
|
const char *sname;
|
|
const char *dname;
|
|
const char *lib; /* maybe null. it means that don't load the library. */
|
|
const rb_transcoder *transcoder;
|
|
} transcoder_entry_t;
|
|
|
|
static st_table *transcoder_table;
|
|
|
|
static transcoder_entry_t *
|
|
make_transcoder_entry(const char *sname, const char *dname)
|
|
{
|
|
st_data_t val;
|
|
st_table *table2;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
|
|
val = (st_data_t)st_init_strcasetable();
|
|
st_add_direct(transcoder_table, (st_data_t)sname, val);
|
|
}
|
|
table2 = (st_table *)val;
|
|
if (!st_lookup(table2, (st_data_t)dname, &val)) {
|
|
transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
|
|
entry->sname = sname;
|
|
entry->dname = dname;
|
|
entry->lib = NULL;
|
|
entry->transcoder = NULL;
|
|
val = (st_data_t)entry;
|
|
st_add_direct(table2, (st_data_t)dname, val);
|
|
}
|
|
return (transcoder_entry_t *)val;
|
|
}
|
|
|
|
static transcoder_entry_t *
|
|
get_transcoder_entry(const char *sname, const char *dname)
|
|
{
|
|
st_data_t val;
|
|
st_table *table2;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
|
|
return NULL;
|
|
}
|
|
table2 = (st_table *)val;
|
|
if (!st_lookup(table2, (st_data_t)dname, &val)) {
|
|
return NULL;
|
|
}
|
|
return (transcoder_entry_t *)val;
|
|
}
|
|
|
|
void
|
|
rb_register_transcoder(const rb_transcoder *tr)
|
|
{
|
|
const char *const sname = tr->src_encoding;
|
|
const char *const dname = tr->dst_encoding;
|
|
|
|
transcoder_entry_t *entry;
|
|
|
|
entry = make_transcoder_entry(sname, dname);
|
|
if (entry->transcoder) {
|
|
rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
|
|
sname, dname);
|
|
}
|
|
|
|
entry->transcoder = tr;
|
|
}
|
|
|
|
static void
|
|
declare_transcoder(const char *sname, const char *dname, const char *lib)
|
|
{
|
|
transcoder_entry_t *entry;
|
|
|
|
entry = make_transcoder_entry(sname, dname);
|
|
entry->lib = lib;
|
|
}
|
|
|
|
#define MAX_TRANSCODER_LIBNAME_LEN 64
|
|
static const char transcoder_lib_prefix[] = "enc/trans/";
|
|
|
|
void
|
|
rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
|
|
{
|
|
if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
|
|
rb_raise(rb_eArgError, "invalid library name - %s",
|
|
lib ? lib : "(null)");
|
|
}
|
|
declare_transcoder(enc1, enc2, lib);
|
|
}
|
|
|
|
#define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
|
|
|
|
typedef struct search_path_queue_tag {
|
|
struct search_path_queue_tag *next;
|
|
const char *enc;
|
|
} search_path_queue_t;
|
|
|
|
typedef struct {
|
|
st_table *visited;
|
|
search_path_queue_t *queue;
|
|
search_path_queue_t **queue_last_ptr;
|
|
const char *base_enc;
|
|
} search_path_bfs_t;
|
|
|
|
static int
|
|
transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
|
|
{
|
|
const char *dname = (const char *)key;
|
|
search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
|
|
search_path_queue_t *q;
|
|
|
|
if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
q = ALLOC(search_path_queue_t);
|
|
q->enc = dname;
|
|
q->next = NULL;
|
|
*bfs->queue_last_ptr = q;
|
|
bfs->queue_last_ptr = &q->next;
|
|
|
|
st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
static int
|
|
transcode_search_path(const char *sname, const char *dname,
|
|
void (*callback)(const char *sname, const char *dname, int depth, void *arg),
|
|
void *arg)
|
|
{
|
|
search_path_bfs_t bfs;
|
|
search_path_queue_t *q;
|
|
st_data_t val;
|
|
st_table *table2;
|
|
int found;
|
|
int pathlen;
|
|
|
|
if (encoding_equal(sname, dname))
|
|
return -1;
|
|
|
|
q = ALLOC(search_path_queue_t);
|
|
q->enc = sname;
|
|
q->next = NULL;
|
|
bfs.queue_last_ptr = &q->next;
|
|
bfs.queue = q;
|
|
|
|
bfs.visited = st_init_strcasetable();
|
|
st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
|
|
|
|
while (bfs.queue) {
|
|
q = bfs.queue;
|
|
bfs.queue = q->next;
|
|
if (!bfs.queue)
|
|
bfs.queue_last_ptr = &bfs.queue;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
|
|
xfree(q);
|
|
continue;
|
|
}
|
|
table2 = (st_table *)val;
|
|
|
|
if (st_lookup(table2, (st_data_t)dname, &val)) {
|
|
st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
|
|
xfree(q);
|
|
found = 1;
|
|
goto cleanup;
|
|
}
|
|
|
|
bfs.base_enc = q->enc;
|
|
st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
|
|
bfs.base_enc = NULL;
|
|
|
|
xfree(q);
|
|
}
|
|
found = 0;
|
|
|
|
cleanup:
|
|
while (bfs.queue) {
|
|
q = bfs.queue;
|
|
bfs.queue = q->next;
|
|
xfree(q);
|
|
}
|
|
|
|
if (found) {
|
|
const char *enc = dname;
|
|
int depth;
|
|
pathlen = 0;
|
|
while (1) {
|
|
st_lookup(bfs.visited, (st_data_t)enc, &val);
|
|
if (!val)
|
|
break;
|
|
pathlen++;
|
|
enc = (const char *)val;
|
|
}
|
|
depth = pathlen;
|
|
enc = dname;
|
|
while (1) {
|
|
st_lookup(bfs.visited, (st_data_t)enc, &val);
|
|
if (!val)
|
|
break;
|
|
callback((const char *)val, enc, --depth, arg);
|
|
enc = (const char *)val;
|
|
}
|
|
}
|
|
|
|
st_free_table(bfs.visited);
|
|
|
|
if (found)
|
|
return pathlen;
|
|
else
|
|
return -1;
|
|
}
|
|
|
|
static const rb_transcoder *
|
|
load_transcoder_entry(transcoder_entry_t *entry)
|
|
{
|
|
if (entry->transcoder)
|
|
return entry->transcoder;
|
|
|
|
if (entry->lib) {
|
|
const char *lib = entry->lib;
|
|
int len = strlen(lib);
|
|
char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
|
|
|
|
entry->lib = NULL;
|
|
|
|
if (len > MAX_TRANSCODER_LIBNAME_LEN)
|
|
return NULL;
|
|
memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
|
|
memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
|
|
if (!rb_require(path))
|
|
return NULL;
|
|
}
|
|
|
|
if (entry->transcoder)
|
|
return entry->transcoder;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Pick the default replacement string for destination encoding encname.
 *
 * For "UTF-8" (compared with encoding_equal) this is the 3-byte sequence
 * EF BF BD expressed in UTF-8; for everything else it is "?" expressed in
 * US-ASCII.  The byte length is stored in *len_ret and the name of the
 * encoding the string is written in is stored in *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    if (!encoding_equal(encname, "UTF-8")) {
        *len_ret = 1;
        *repl_encname_ptr = "US-ASCII";
        return "?";
    }

    *len_ret = 3;
    *repl_encname_ptr = "UTF-8";
    return "\xEF\xBF\xBD";
}
|
|
|
|
/*
|
|
* Transcoding engine logic
|
|
*/
|
|
|
|
static const unsigned char *
|
|
transcode_char_start(rb_transcoding *tc,
|
|
const unsigned char *in_start,
|
|
const unsigned char *inchar_start,
|
|
const unsigned char *in_p,
|
|
size_t *char_len_ptr)
|
|
{
|
|
const unsigned char *ptr;
|
|
if (inchar_start - in_start < tc->recognized_len) {
|
|
MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
|
|
inchar_start, unsigned char, in_p - inchar_start);
|
|
ptr = TRANSCODING_READBUF(tc);
|
|
}
|
|
else {
|
|
ptr = inchar_start - tc->recognized_len;
|
|
}
|
|
*char_len_ptr = tc->recognized_len + (in_p - inchar_start);
|
|
return ptr;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
rb_transcoding *tc,
|
|
const int opt)
|
|
|
|
{
|
|
const rb_transcoder *tr = tc->transcoder;
|
|
int unitlen = tr->input_unit_length;
|
|
int readagain_len = 0;
|
|
|
|
const unsigned char *inchar_start;
|
|
const unsigned char *in_p;
|
|
|
|
unsigned char *out_p;
|
|
|
|
unsigned char empty_buf;
|
|
unsigned char *empty_ptr = &empty_buf;
|
|
|
|
if (!in_pos) {
|
|
in_pos = (const unsigned char **)&empty_ptr;
|
|
in_stop = empty_ptr;
|
|
}
|
|
|
|
if (!out_pos) {
|
|
out_pos = &empty_ptr;
|
|
out_stop = empty_ptr;
|
|
}
|
|
|
|
in_p = inchar_start = *in_pos;
|
|
|
|
out_p = *out_pos;
|
|
|
|
#define SUSPEND(ret, num) \
|
|
do { \
|
|
tc->resume_position = (num); \
|
|
if (0 < in_p - inchar_start) \
|
|
MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
|
|
inchar_start, unsigned char, in_p - inchar_start); \
|
|
*in_pos = in_p; \
|
|
*out_pos = out_p; \
|
|
tc->recognized_len += in_p - inchar_start; \
|
|
if (readagain_len) { \
|
|
tc->recognized_len -= readagain_len; \
|
|
tc->readagain_len = readagain_len; \
|
|
} \
|
|
return ret; \
|
|
resume_label ## num:; \
|
|
} while (0)
|
|
#define SUSPEND_OBUF(num) \
|
|
do { \
|
|
while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
|
|
} while (0)
|
|
|
|
#define SUSPEND_AFTER_OUTPUT(num) \
|
|
if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
|
|
SUSPEND(econv_after_output, num); \
|
|
}
|
|
|
|
#define next_table (tc->next_table)
|
|
#define next_info (tc->next_info)
|
|
#define next_byte (tc->next_byte)
|
|
#define writebuf_len (tc->writebuf_len)
|
|
#define writebuf_off (tc->writebuf_off)
|
|
|
|
switch (tc->resume_position) {
|
|
case 0: break;
|
|
case 1: goto resume_label1;
|
|
case 2: goto resume_label2;
|
|
case 3: goto resume_label3;
|
|
case 4: goto resume_label4;
|
|
case 5: goto resume_label5;
|
|
case 6: goto resume_label6;
|
|
case 7: goto resume_label7;
|
|
case 8: goto resume_label8;
|
|
case 9: goto resume_label9;
|
|
case 10: goto resume_label10;
|
|
case 11: goto resume_label11;
|
|
case 12: goto resume_label12;
|
|
case 13: goto resume_label13;
|
|
case 14: goto resume_label14;
|
|
case 15: goto resume_label15;
|
|
case 16: goto resume_label16;
|
|
case 17: goto resume_label17;
|
|
case 18: goto resume_label18;
|
|
case 19: goto resume_label19;
|
|
case 20: goto resume_label20;
|
|
case 21: goto resume_label21;
|
|
case 22: goto resume_label22;
|
|
case 23: goto resume_label23;
|
|
case 24: goto resume_label24;
|
|
case 25: goto resume_label25;
|
|
case 26: goto resume_label26;
|
|
case 27: goto resume_label27;
|
|
case 28: goto resume_label28;
|
|
}
|
|
|
|
while (1) {
|
|
inchar_start = in_p;
|
|
tc->recognized_len = 0;
|
|
next_table = tr->conv_tree_start;
|
|
|
|
SUSPEND_AFTER_OUTPUT(24);
|
|
|
|
if (in_stop <= in_p) {
|
|
if (!(opt & ECONV_PARTIAL_INPUT))
|
|
break;
|
|
SUSPEND(econv_source_buffer_empty, 7);
|
|
continue;
|
|
}
|
|
|
|
#define BYTE_ADDR(index) (tr->byte_array + (index))
|
|
#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
|
|
#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
|
|
#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
|
|
#define BL_MIN_BYTE (BL_BASE[0])
|
|
#define BL_MAX_BYTE (BL_BASE[1])
|
|
#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
|
|
#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
|
|
|
|
next_byte = (unsigned char)*in_p++;
|
|
follow_byte:
|
|
if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
|
|
next_info = INVALID;
|
|
else {
|
|
next_info = (VALUE)BL_ACTION(next_byte);
|
|
}
|
|
follow_info:
|
|
switch (next_info & 0x1F) {
|
|
case NOMAP: /* xxx: copy last byte only? */
|
|
SUSPEND_OBUF(3); *out_p++ = next_byte;
|
|
continue;
|
|
case 0x00: case 0x04: case 0x08: case 0x0C:
|
|
case 0x10: case 0x14: case 0x18: case 0x1C:
|
|
SUSPEND_AFTER_OUTPUT(25);
|
|
while (in_p >= in_stop) {
|
|
if (!(opt & ECONV_PARTIAL_INPUT))
|
|
goto incomplete;
|
|
SUSPEND(econv_source_buffer_empty, 5);
|
|
}
|
|
next_byte = (unsigned char)*in_p++;
|
|
next_table = next_info;
|
|
goto follow_byte;
|
|
case ZERObt: /* drop input */
|
|
continue;
|
|
case ONEbt:
|
|
SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
|
|
continue;
|
|
case TWObt:
|
|
SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
|
|
continue;
|
|
case THREEbt:
|
|
SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
|
|
SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
|
|
continue;
|
|
case FOURbt:
|
|
SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
|
|
SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
|
|
SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
|
|
continue;
|
|
case STR1:
|
|
tc->output_index = 0;
|
|
while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
|
|
SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
|
|
tc->output_index++;
|
|
}
|
|
continue;
|
|
case FUNii:
|
|
next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
|
|
goto follow_info;
|
|
case FUNsi:
|
|
{
|
|
const unsigned char *char_start;
|
|
size_t char_len;
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
|
|
goto follow_info;
|
|
}
|
|
case FUNio:
|
|
SUSPEND_OBUF(13);
|
|
if (tr->max_output <= out_stop - out_p)
|
|
out_p += tr->func_io(TRANSCODING_STATE(tc),
|
|
next_info, out_p, out_stop - out_p);
|
|
else {
|
|
writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
|
|
next_info,
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(20);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
break;
|
|
case FUNso:
|
|
{
|
|
const unsigned char *char_start;
|
|
size_t char_len;
|
|
SUSPEND_OBUF(14);
|
|
if (tr->max_output <= out_stop - out_p) {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
out_p += tr->func_so(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len,
|
|
out_p, out_stop - out_p);
|
|
}
|
|
else {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len,
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(22);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case INVALID:
|
|
if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
|
|
if (tc->recognized_len + (in_p - inchar_start) < unitlen)
|
|
SUSPEND_AFTER_OUTPUT(26);
|
|
while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
|
|
in_p = in_stop;
|
|
SUSPEND(econv_source_buffer_empty, 8);
|
|
}
|
|
if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
|
|
in_p = in_stop;
|
|
}
|
|
else {
|
|
in_p = inchar_start + (unitlen - tc->recognized_len);
|
|
}
|
|
}
|
|
else {
|
|
int invalid_len; /* including the last byte which causes invalid */
|
|
int discard_len;
|
|
invalid_len = tc->recognized_len + (in_p - inchar_start);
|
|
discard_len = ((invalid_len - 1) / unitlen) * unitlen;
|
|
readagain_len = invalid_len - discard_len;
|
|
}
|
|
goto invalid;
|
|
case UNDEF:
|
|
goto undef;
|
|
}
|
|
continue;
|
|
|
|
invalid:
|
|
SUSPEND(econv_invalid_byte_sequence, 1);
|
|
continue;
|
|
|
|
incomplete:
|
|
SUSPEND(econv_incomplete_input, 27);
|
|
continue;
|
|
|
|
undef:
|
|
SUSPEND(econv_undefined_conversion, 2);
|
|
continue;
|
|
}
|
|
|
|
/* cleanup */
|
|
if (tr->finish_func) {
|
|
SUSPEND_OBUF(4);
|
|
if (tr->max_output <= out_stop - out_p) {
|
|
out_p += tr->finish_func(TRANSCODING_STATE(tc),
|
|
out_p, out_stop - out_p);
|
|
}
|
|
else {
|
|
writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(23);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
}
|
|
while (1)
|
|
SUSPEND(econv_finished, 6);
|
|
#undef SUSPEND
|
|
#undef next_table
|
|
#undef next_info
|
|
#undef next_byte
|
|
#undef writebuf_len
|
|
#undef writebuf_off
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
rb_transcoding *tc,
|
|
const int opt)
|
|
{
|
|
if (tc->readagain_len) {
|
|
unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
|
|
const unsigned char *readagain_pos = readagain_buf;
|
|
const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
|
|
rb_econv_result_t res;
|
|
|
|
MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
|
|
unsigned char, tc->readagain_len);
|
|
tc->readagain_len = 0;
|
|
res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
|
|
if (res != econv_source_buffer_empty) {
|
|
MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
|
|
readagain_pos, unsigned char, readagain_stop - readagain_pos);
|
|
tc->readagain_len += readagain_stop - readagain_pos;
|
|
return res;
|
|
}
|
|
}
|
|
return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
|
|
}
|
|
|
|
static rb_transcoding *
|
|
rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
|
|
{
|
|
rb_transcoding *tc;
|
|
|
|
tc = ALLOC(rb_transcoding);
|
|
tc->transcoder = tr;
|
|
tc->flags = flags;
|
|
if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
|
|
tc->state.ptr = xmalloc(tr->state_size);
|
|
if (tr->state_init_func) {
|
|
(tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
|
|
}
|
|
tc->resume_position = 0;
|
|
tc->recognized_len = 0;
|
|
tc->readagain_len = 0;
|
|
tc->writebuf_len = 0;
|
|
tc->writebuf_off = 0;
|
|
if (sizeof(tc->readbuf.ary) < tr->max_input) {
|
|
tc->readbuf.ptr = xmalloc(tr->max_input);
|
|
}
|
|
if (sizeof(tc->writebuf.ary) < tr->max_output) {
|
|
tc->writebuf.ptr = xmalloc(tr->max_output);
|
|
}
|
|
return tc;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_transcoding_convert(rb_transcoding *tc,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
return transcode_restartable(
|
|
input_ptr, output_ptr,
|
|
input_stop, output_stop,
|
|
tc, flags);
|
|
}
|
|
|
|
static void
|
|
rb_transcoding_close(rb_transcoding *tc)
|
|
{
|
|
const rb_transcoder *tr = tc->transcoder;
|
|
if (tr->state_fini_func) {
|
|
(tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
|
|
}
|
|
if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
|
|
xfree(tc->state.ptr);
|
|
if (sizeof(tc->readbuf.ary) < tr->max_input)
|
|
xfree(tc->readbuf.ptr);
|
|
if (sizeof(tc->writebuf.ary) < tr->max_output)
|
|
xfree(tc->writebuf.ptr);
|
|
xfree(tc);
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_alloc(int n_hint)
|
|
{
|
|
rb_econv_t *ec;
|
|
|
|
if (n_hint <= 0)
|
|
n_hint = 1;
|
|
|
|
ec = ALLOC(rb_econv_t);
|
|
ec->flags = 0;
|
|
ec->source_encoding_name = NULL;
|
|
ec->destination_encoding_name = NULL;
|
|
ec->started = 0;
|
|
ec->replacement_str = NULL;
|
|
ec->replacement_len = 0;
|
|
ec->replacement_enc = NULL;
|
|
ec->replacement_allocated = 0;
|
|
ec->in_buf_start = NULL;
|
|
ec->in_data_start = NULL;
|
|
ec->in_data_end = NULL;
|
|
ec->in_buf_end = NULL;
|
|
ec->num_allocated = n_hint;
|
|
ec->num_trans = 0;
|
|
ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
|
|
ec->num_finished = 0;
|
|
ec->last_tc = NULL;
|
|
ec->last_error.result = econv_source_buffer_empty;
|
|
ec->last_error.error_tc = NULL;
|
|
ec->last_error.source_encoding = NULL;
|
|
ec->last_error.destination_encoding = NULL;
|
|
ec->last_error.error_bytes_start = NULL;
|
|
ec->last_error.error_bytes_len = 0;
|
|
ec->last_error.readagain_len = 0;
|
|
ec->source_encoding = NULL;
|
|
ec->destination_encoding = NULL;
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
|
|
{
|
|
int n, j;
|
|
int bufsize = 4096;
|
|
unsigned char *p;
|
|
|
|
if (ec->num_trans == ec->num_allocated) {
|
|
n = ec->num_allocated * 2;
|
|
REALLOC_N(ec->elems, rb_econv_elem_t, n);
|
|
ec->num_allocated = n;
|
|
}
|
|
|
|
p = xmalloc(bufsize);
|
|
|
|
MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
|
|
|
|
ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
|
|
ec->elems[i].out_buf_start = p;
|
|
ec->elems[i].out_buf_end = p + bufsize;
|
|
ec->elems[i].out_data_start = p;
|
|
ec->elems[i].out_data_end = p;
|
|
ec->elems[i].last_result = econv_source_buffer_empty;
|
|
|
|
ec->num_trans++;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
|
|
for (j = ec->num_trans-1; i <= j; j--) {
|
|
rb_transcoding *tc = ec->elems[j].tc;
|
|
const rb_transcoder *tr2 = tc->transcoder;
|
|
if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
|
|
ec->last_tc = tc;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
|
|
{
|
|
rb_econv_t *ec;
|
|
int i, ret;
|
|
|
|
for (i = 0; i < n; i++) {
|
|
const rb_transcoder *tr;
|
|
tr = load_transcoder_entry(entries[i]);
|
|
if (!tr)
|
|
return NULL;
|
|
}
|
|
|
|
ec = rb_econv_alloc(n);
|
|
|
|
for (i = 0; i < n; i++) {
|
|
const rb_transcoder *tr = load_transcoder_entry(entries[i]);
|
|
ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
|
|
if (ret == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return ec;
|
|
}
|
|
|
|
struct trans_open_t {
|
|
transcoder_entry_t **entries;
|
|
int num_additional;
|
|
};
|
|
|
|
static void
|
|
trans_open_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
struct trans_open_t *toarg = arg;
|
|
|
|
if (!toarg->entries) {
|
|
toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
|
|
}
|
|
toarg->entries[depth] = get_transcoder_entry(sname, dname);
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_open0(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
transcoder_entry_t **entries = NULL;
|
|
int num_trans;
|
|
rb_econv_t *ec;
|
|
|
|
rb_encoding *senc, *denc;
|
|
int sidx, didx;
|
|
|
|
senc = NULL;
|
|
if (*sname) {
|
|
sidx = rb_enc_find_index(sname);
|
|
if (0 <= sidx) {
|
|
senc = rb_enc_from_index(sidx);
|
|
}
|
|
}
|
|
|
|
denc = NULL;
|
|
if (*dname) {
|
|
didx = rb_enc_find_index(dname);
|
|
if (0 <= didx) {
|
|
denc = rb_enc_from_index(didx);
|
|
}
|
|
}
|
|
|
|
if (*sname == '\0' && *dname == '\0') {
|
|
num_trans = 0;
|
|
entries = NULL;
|
|
}
|
|
else {
|
|
struct trans_open_t toarg;
|
|
toarg.entries = NULL;
|
|
toarg.num_additional = 0;
|
|
num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
|
|
entries = toarg.entries;
|
|
if (num_trans < 0) {
|
|
xfree(entries);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
|
|
xfree(entries);
|
|
if (!ec)
|
|
return NULL;
|
|
|
|
ec->flags = ecflags;
|
|
ec->source_encoding_name = sname;
|
|
ec->destination_encoding_name = dname;
|
|
|
|
return ec;
|
|
}
|
|
|
|
#define MAX_ECFLAGS_DECORATORS 32
|
|
|
|
static int
|
|
decorator_names(int ecflags, const char **decorators_ret)
|
|
{
|
|
int num_decorators;
|
|
|
|
if ((ecflags & ECONV_CRLF_NEWLINE_DECORATOR) &&
|
|
(ecflags & ECONV_CR_NEWLINE_DECORATOR))
|
|
return -1;
|
|
|
|
if ((ecflags & (ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR)) &&
|
|
(ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR))
|
|
return -1;
|
|
|
|
if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
|
|
(ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
|
|
return -1;
|
|
|
|
num_decorators = 0;
|
|
|
|
if (ecflags & ECONV_XML_TEXT_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_text_escape";
|
|
if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_attr_content_escape";
|
|
if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_attr_quote";
|
|
|
|
if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "crlf_newline";
|
|
if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "cr_newline";
|
|
if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "universal_newline";
|
|
|
|
return num_decorators;
|
|
}
|
|
|
|
rb_econv_t *
|
|
rb_econv_open(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
rb_econv_t *ec;
|
|
int num_decorators;
|
|
const char *decorators[MAX_ECFLAGS_DECORATORS];
|
|
int i;
|
|
|
|
num_decorators = decorator_names(ecflags, decorators);
|
|
if (num_decorators == -1)
|
|
return NULL;
|
|
|
|
ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
|
|
if (!ec)
|
|
return NULL;
|
|
|
|
for (i = 0; i < num_decorators; i++)
|
|
if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
|
|
ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
|
|
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
trans_sweep(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags,
|
|
int start)
|
|
{
|
|
int try;
|
|
int i, f;
|
|
|
|
const unsigned char **ipp, *is, *iold;
|
|
unsigned char **opp, *os, *oold;
|
|
rb_econv_result_t res;
|
|
|
|
try = 1;
|
|
while (try) {
|
|
try = 0;
|
|
for (i = start; i < ec->num_trans; i++) {
|
|
rb_econv_elem_t *te = &ec->elems[i];
|
|
|
|
if (i == 0) {
|
|
ipp = input_ptr;
|
|
is = input_stop;
|
|
}
|
|
else {
|
|
rb_econv_elem_t *prev_te = &ec->elems[i-1];
|
|
ipp = (const unsigned char **)&prev_te->out_data_start;
|
|
is = prev_te->out_data_end;
|
|
}
|
|
|
|
if (i == ec->num_trans-1) {
|
|
opp = output_ptr;
|
|
os = output_stop;
|
|
}
|
|
else {
|
|
if (te->out_buf_start != te->out_data_start) {
|
|
int len = te->out_data_end - te->out_data_start;
|
|
int off = te->out_data_start - te->out_buf_start;
|
|
MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
|
|
te->out_data_start = te->out_buf_start;
|
|
te->out_data_end -= off;
|
|
}
|
|
opp = &te->out_data_end;
|
|
os = te->out_buf_end;
|
|
}
|
|
|
|
f = flags;
|
|
if (ec->num_finished != i)
|
|
f |= ECONV_PARTIAL_INPUT;
|
|
if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
|
|
start = 1;
|
|
flags &= ~ECONV_AFTER_OUTPUT;
|
|
}
|
|
if (i != 0)
|
|
f &= ~ECONV_AFTER_OUTPUT;
|
|
iold = *ipp;
|
|
oold = *opp;
|
|
te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
|
|
if (iold != *ipp || oold != *opp)
|
|
try = 1;
|
|
|
|
switch (res) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
case econv_after_output:
|
|
return i;
|
|
|
|
case econv_destination_buffer_full:
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
case econv_finished:
|
|
ec->num_finished = i+1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_trans_conv(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags,
|
|
int *result_position_ptr)
|
|
{
|
|
int i;
|
|
int needreport_index;
|
|
int sweep_start;
|
|
|
|
unsigned char empty_buf;
|
|
unsigned char *empty_ptr = &empty_buf;
|
|
|
|
if (!input_ptr) {
|
|
input_ptr = (const unsigned char **)&empty_ptr;
|
|
input_stop = empty_ptr;
|
|
}
|
|
|
|
if (!output_ptr) {
|
|
output_ptr = &empty_ptr;
|
|
output_stop = empty_ptr;
|
|
}
|
|
|
|
if (ec->elems[0].last_result == econv_after_output)
|
|
ec->elems[0].last_result = econv_source_buffer_empty;
|
|
|
|
needreport_index = -1;
|
|
for (i = ec->num_trans-1; 0 <= i; i--) {
|
|
switch (ec->elems[i].last_result) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
case econv_after_output:
|
|
case econv_finished:
|
|
sweep_start = i+1;
|
|
needreport_index = i;
|
|
goto found_needreport;
|
|
|
|
case econv_destination_buffer_full:
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
default:
|
|
rb_bug("unexpected transcode last result");
|
|
}
|
|
}
|
|
|
|
/* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
|
|
|
|
if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
|
|
(flags & ECONV_AFTER_OUTPUT)) {
|
|
rb_econv_result_t res;
|
|
|
|
res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
|
|
(flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
|
|
result_position_ptr);
|
|
|
|
if (res == econv_source_buffer_empty)
|
|
return econv_after_output;
|
|
return res;
|
|
}
|
|
|
|
sweep_start = 0;
|
|
|
|
found_needreport:
|
|
|
|
do {
|
|
needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
|
|
sweep_start = needreport_index + 1;
|
|
} while (needreport_index != -1 && needreport_index != ec->num_trans-1);
|
|
|
|
for (i = ec->num_trans-1; 0 <= i; i--) {
|
|
if (ec->elems[i].last_result != econv_source_buffer_empty) {
|
|
rb_econv_result_t res = ec->elems[i].last_result;
|
|
if (res == econv_invalid_byte_sequence ||
|
|
res == econv_incomplete_input ||
|
|
res == econv_undefined_conversion ||
|
|
res == econv_after_output) {
|
|
ec->elems[i].last_result = econv_source_buffer_empty;
|
|
}
|
|
if (result_position_ptr)
|
|
*result_position_ptr = i;
|
|
return res;
|
|
}
|
|
}
|
|
if (result_position_ptr)
|
|
*result_position_ptr = -1;
|
|
return econv_source_buffer_empty;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_econv_convert0(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
rb_econv_result_t res;
|
|
int result_position;
|
|
int has_output = 0;
|
|
|
|
memset(&ec->last_error, 0, sizeof(ec->last_error));
|
|
|
|
if (ec->num_trans == 0) {
|
|
size_t len;
|
|
if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
|
|
if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
|
|
len = output_stop - *output_ptr;
|
|
memcpy(*output_ptr, ec->in_data_start, len);
|
|
*output_ptr = output_stop;
|
|
ec->in_data_start += len;
|
|
res = econv_destination_buffer_full;
|
|
goto gotresult;
|
|
}
|
|
len = ec->in_data_end - ec->in_data_start;
|
|
memcpy(*output_ptr, ec->in_data_start, len);
|
|
*output_ptr += len;
|
|
ec->in_data_start = ec->in_data_end = ec->in_buf_start;
|
|
if (flags & ECONV_AFTER_OUTPUT) {
|
|
res = econv_after_output;
|
|
goto gotresult;
|
|
}
|
|
}
|
|
if (output_stop - *output_ptr < input_stop - *input_ptr) {
|
|
len = output_stop - *output_ptr;
|
|
}
|
|
else {
|
|
len = input_stop - *input_ptr;
|
|
}
|
|
if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
|
|
*(*output_ptr)++ = *(*input_ptr)++;
|
|
res = econv_after_output;
|
|
goto gotresult;
|
|
}
|
|
memcpy(*output_ptr, *input_ptr, len);
|
|
*output_ptr += len;
|
|
*input_ptr += len;
|
|
if (*input_ptr != input_stop)
|
|
res = econv_destination_buffer_full;
|
|
else if (flags & ECONV_PARTIAL_INPUT)
|
|
res = econv_source_buffer_empty;
|
|
else
|
|
res = econv_finished;
|
|
goto gotresult;
|
|
}
|
|
|
|
if (ec->elems[ec->num_trans-1].out_data_start) {
|
|
unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
|
|
unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
|
|
if (data_start != data_end) {
|
|
size_t len;
|
|
if (output_stop - *output_ptr < data_end - data_start) {
|
|
len = output_stop - *output_ptr;
|
|
memcpy(*output_ptr, data_start, len);
|
|
*output_ptr = output_stop;
|
|
ec->elems[ec->num_trans-1].out_data_start += len;
|
|
res = econv_destination_buffer_full;
|
|
goto gotresult;
|
|
}
|
|
len = data_end - data_start;
|
|
memcpy(*output_ptr, data_start, len);
|
|
*output_ptr += len;
|
|
ec->elems[ec->num_trans-1].out_data_start =
|
|
ec->elems[ec->num_trans-1].out_data_end =
|
|
ec->elems[ec->num_trans-1].out_buf_start;
|
|
has_output = 1;
|
|
}
|
|
}
|
|
|
|
if (ec->in_buf_start &&
|
|
ec->in_data_start != ec->in_data_end) {
|
|
res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
|
|
(flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
|
|
if (res != econv_source_buffer_empty)
|
|
goto gotresult;
|
|
}
|
|
|
|
if (has_output &&
|
|
(flags & ECONV_AFTER_OUTPUT) &&
|
|
*input_ptr != input_stop) {
|
|
input_stop = *input_ptr;
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
if (res == econv_source_buffer_empty)
|
|
res = econv_after_output;
|
|
}
|
|
else if ((flags & ECONV_AFTER_OUTPUT) ||
|
|
ec->num_trans == 1) {
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
}
|
|
else {
|
|
flags |= ECONV_AFTER_OUTPUT;
|
|
do {
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
} while (res == econv_after_output);
|
|
}
|
|
|
|
gotresult:
|
|
ec->last_error.result = res;
|
|
if (res == econv_invalid_byte_sequence ||
|
|
res == econv_incomplete_input ||
|
|
res == econv_undefined_conversion) {
|
|
rb_transcoding *error_tc = ec->elems[result_position].tc;
|
|
ec->last_error.error_tc = error_tc;
|
|
ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
|
|
ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
|
|
ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
|
|
ec->last_error.error_bytes_len = error_tc->recognized_len;
|
|
ec->last_error.readagain_len = error_tc->readagain_len;
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static int output_replacement_character(rb_econv_t *ec);
|
|
|
|
static int
|
|
output_hex_charref(rb_econv_t *ec)
|
|
{
|
|
int ret;
|
|
unsigned char utfbuf[1024];
|
|
const unsigned char *utf;
|
|
size_t utf_len;
|
|
int utf_allocated = 0;
|
|
char charef_buf[16];
|
|
const unsigned char *p;
|
|
|
|
if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
|
|
utf = ec->last_error.error_bytes_start;
|
|
utf_len = ec->last_error.error_bytes_len;
|
|
}
|
|
else {
|
|
utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
|
|
ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
|
|
utfbuf, sizeof(utfbuf),
|
|
&utf_len);
|
|
if (!utf)
|
|
return -1;
|
|
if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
|
|
utf_allocated = 1;
|
|
}
|
|
|
|
if (utf_len % 4 != 0)
|
|
goto fail;
|
|
|
|
p = utf;
|
|
while (4 <= utf_len) {
|
|
unsigned int u = 0;
|
|
u += p[0] << 24;
|
|
u += p[1] << 16;
|
|
u += p[2] << 8;
|
|
u += p[3];
|
|
snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
|
|
|
|
ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
|
|
if (ret == -1)
|
|
goto fail;
|
|
|
|
p += 4;
|
|
utf_len -= 4;
|
|
}
|
|
|
|
if (utf_allocated)
|
|
xfree((void *)utf);
|
|
return 0;
|
|
|
|
fail:
|
|
if (utf_allocated)
|
|
xfree((void *)utf);
|
|
return -1;
|
|
}
|
|
|
|
rb_econv_result_t
|
|
rb_econv_convert(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
rb_econv_result_t ret;
|
|
|
|
unsigned char empty_buf;
|
|
unsigned char *empty_ptr = &empty_buf;
|
|
|
|
ec->started = 1;
|
|
|
|
if (!input_ptr) {
|
|
input_ptr = (const unsigned char **)&empty_ptr;
|
|
input_stop = empty_ptr;
|
|
}
|
|
|
|
if (!output_ptr) {
|
|
output_ptr = &empty_ptr;
|
|
output_stop = empty_ptr;
|
|
}
|
|
|
|
resume:
|
|
ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
|
|
|
|
if (ret == econv_invalid_byte_sequence ||
|
|
ret == econv_incomplete_input) {
|
|
/* deal with invalid byte sequence */
|
|
/* todo: add more alternative behaviors */
|
|
switch (ec->flags & ECONV_INVALID_MASK) {
|
|
case ECONV_INVALID_REPLACE:
|
|
if (output_replacement_character(ec) == 0)
|
|
goto resume;
|
|
}
|
|
}
|
|
|
|
if (ret == econv_undefined_conversion) {
|
|
/* valid character in source encoding
|
|
* but no related character(s) in destination encoding */
|
|
/* todo: add more alternative behaviors */
|
|
switch (ec->flags & ECONV_UNDEF_MASK) {
|
|
case ECONV_UNDEF_REPLACE:
|
|
if (output_replacement_character(ec) == 0)
|
|
goto resume;
|
|
break;
|
|
|
|
case ECONV_UNDEF_HEX_CHARREF:
|
|
if (output_hex_charref(ec) == 0)
|
|
goto resume;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
const char *
|
|
rb_econv_encoding_to_insert_output(rb_econv_t *ec)
|
|
{
|
|
rb_transcoding *tc = ec->last_tc;
|
|
const rb_transcoder *tr;
|
|
|
|
if (tc == NULL)
|
|
return "";
|
|
|
|
tr = tc->transcoder;
|
|
|
|
if (tr->asciicompat_type == asciicompat_encoder)
|
|
return tr->src_encoding;
|
|
return tr->dst_encoding;
|
|
}
|
|
|
|
static unsigned char *
|
|
allocate_converted_string(const char *sname, const char *dname,
|
|
const unsigned char *str, size_t len,
|
|
unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
|
|
size_t *dst_len_ptr)
|
|
{
|
|
unsigned char *dst_str;
|
|
size_t dst_len;
|
|
size_t dst_bufsize;
|
|
|
|
rb_econv_t *ec;
|
|
rb_econv_result_t res;
|
|
|
|
const unsigned char *sp;
|
|
unsigned char *dp;
|
|
|
|
if (caller_dst_buf)
|
|
dst_bufsize = caller_dst_bufsize;
|
|
else if (len == 0)
|
|
dst_bufsize = 1;
|
|
else
|
|
dst_bufsize = len;
|
|
|
|
ec = rb_econv_open(sname, dname, 0);
|
|
if (ec == NULL)
|
|
return NULL;
|
|
if (caller_dst_buf)
|
|
dst_str = caller_dst_buf;
|
|
else
|
|
dst_str = xmalloc(dst_bufsize);
|
|
dst_len = 0;
|
|
sp = str;
|
|
dp = dst_str+dst_len;
|
|
res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
|
|
dst_len = dp - dst_str;
|
|
while (res == econv_destination_buffer_full) {
|
|
if (SIZE_MAX/2 < dst_bufsize) {
|
|
goto fail;
|
|
}
|
|
dst_bufsize *= 2;
|
|
if (dst_str == caller_dst_buf) {
|
|
unsigned char *tmp;
|
|
tmp = xmalloc(dst_bufsize);
|
|
memcpy(tmp, dst_str, dst_bufsize/2);
|
|
dst_str = tmp;
|
|
}
|
|
else {
|
|
dst_str = xrealloc(dst_str, dst_bufsize);
|
|
}
|
|
dp = dst_str+dst_len;
|
|
res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
|
|
dst_len = dp - dst_str;
|
|
}
|
|
if (res != econv_finished) {
|
|
goto fail;
|
|
}
|
|
rb_econv_close(ec);
|
|
*dst_len_ptr = dst_len;
|
|
return dst_str;
|
|
|
|
fail:
|
|
if (dst_str != caller_dst_buf)
|
|
xfree(dst_str);
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
|
|
/* result: 0:success -1:failure */
|
|
int
|
|
rb_econv_insert_output(rb_econv_t *ec,
|
|
const unsigned char *str, size_t len, const char *str_encoding)
|
|
{
|
|
const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
|
|
unsigned char insert_buf[4096];
|
|
const unsigned char *insert_str = NULL;
|
|
size_t insert_len;
|
|
|
|
int last_trans_index;
|
|
rb_transcoding *tc;
|
|
|
|
unsigned char **buf_start_p;
|
|
unsigned char **data_start_p;
|
|
unsigned char **data_end_p;
|
|
unsigned char **buf_end_p;
|
|
|
|
size_t need;
|
|
|
|
ec->started = 1;
|
|
|
|
if (len == 0)
|
|
return 0;
|
|
|
|
if (encoding_equal(insert_encoding, str_encoding)) {
|
|
insert_str = str;
|
|
insert_len = len;
|
|
}
|
|
else {
|
|
insert_str = allocate_converted_string(str_encoding, insert_encoding,
|
|
str, len, insert_buf, sizeof(insert_buf), &insert_len);
|
|
if (insert_str == NULL)
|
|
return -1;
|
|
}
|
|
|
|
need = insert_len;
|
|
|
|
last_trans_index = ec->num_trans-1;
|
|
if (ec->num_trans == 0) {
|
|
tc = NULL;
|
|
buf_start_p = &ec->in_buf_start;
|
|
data_start_p = &ec->in_data_start;
|
|
data_end_p = &ec->in_data_end;
|
|
buf_end_p = &ec->in_buf_end;
|
|
}
|
|
else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
|
|
tc = ec->elems[last_trans_index].tc;
|
|
need += tc->readagain_len;
|
|
if (need < insert_len)
|
|
goto fail;
|
|
if (last_trans_index == 0) {
|
|
buf_start_p = &ec->in_buf_start;
|
|
data_start_p = &ec->in_data_start;
|
|
data_end_p = &ec->in_data_end;
|
|
buf_end_p = &ec->in_buf_end;
|
|
}
|
|
else {
|
|
rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
|
|
buf_start_p = &ee->out_buf_start;
|
|
data_start_p = &ee->out_data_start;
|
|
data_end_p = &ee->out_data_end;
|
|
buf_end_p = &ee->out_buf_end;
|
|
}
|
|
}
|
|
else {
|
|
rb_econv_elem_t *ee = &ec->elems[last_trans_index];
|
|
buf_start_p = &ee->out_buf_start;
|
|
data_start_p = &ee->out_data_start;
|
|
data_end_p = &ee->out_data_end;
|
|
buf_end_p = &ee->out_buf_end;
|
|
tc = ec->elems[last_trans_index].tc;
|
|
}
|
|
|
|
if (*buf_start_p == NULL) {
|
|
unsigned char *buf = xmalloc(need);
|
|
*buf_start_p = buf;
|
|
*data_start_p = buf;
|
|
*data_end_p = buf;
|
|
*buf_end_p = buf+need;
|
|
}
|
|
else if (*buf_end_p - *data_end_p < need) {
|
|
MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
|
|
*data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
|
|
*data_start_p = *buf_start_p;
|
|
if (*buf_end_p - *data_end_p < need) {
|
|
unsigned char *buf;
|
|
size_t s = (*data_end_p - *buf_start_p) + need;
|
|
if (s < need)
|
|
goto fail;
|
|
buf = xrealloc(*buf_start_p, s);
|
|
*data_start_p = buf;
|
|
*data_end_p = buf + (*data_end_p - *buf_start_p);
|
|
*buf_start_p = buf;
|
|
*buf_end_p = buf + s;
|
|
}
|
|
}
|
|
|
|
memcpy(*data_end_p, insert_str, insert_len);
|
|
*data_end_p += insert_len;
|
|
if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
|
|
memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
|
|
*data_end_p += tc->readagain_len;
|
|
tc->readagain_len = 0;
|
|
}
|
|
|
|
if (insert_str != str && insert_str != insert_buf)
|
|
xfree((void*)insert_str);
|
|
return 0;
|
|
|
|
fail:
|
|
if (insert_str != str && insert_str != insert_buf)
|
|
xfree((void*)insert_str);
|
|
return -1;
|
|
}
|
|
|
|
void
|
|
rb_econv_close(rb_econv_t *ec)
|
|
{
|
|
int i;
|
|
|
|
if (ec->replacement_allocated) {
|
|
xfree((void *)ec->replacement_str);
|
|
}
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
rb_transcoding_close(ec->elems[i].tc);
|
|
if (ec->elems[i].out_buf_start)
|
|
xfree(ec->elems[i].out_buf_start);
|
|
}
|
|
xfree(ec->in_buf_start);
|
|
xfree(ec->elems);
|
|
xfree(ec);
|
|
}
|
|
|
|
int
|
|
rb_econv_putbackable(rb_econv_t *ec)
|
|
{
|
|
if (ec->num_trans == 0)
|
|
return 0;
|
|
return ec->elems[0].tc->readagain_len;
|
|
}
|
|
|
|
void
|
|
rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
|
|
{
|
|
rb_transcoding *tc;
|
|
if (ec->num_trans == 0 || n == 0)
|
|
return;
|
|
tc = ec->elems[0].tc;
|
|
memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
|
|
tc->readagain_len -= n;
|
|
}
|
|
|
|
/* argument and result carrier for asciicompat_encoding_i */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;      /* out: destination encoding found, or NULL */
    const char *ascii_incompat_name;    /* in: encoding name that was looked up */
};
|
|
|
|
static int
|
|
asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
|
|
{
|
|
struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
|
|
transcoder_entry_t *entry = (transcoder_entry_t *)val;
|
|
const rb_transcoder *tr;
|
|
|
|
if (DECORATOR_P(entry->sname, entry->dname))
|
|
return ST_CONTINUE;
|
|
tr = load_transcoder_entry(entry);
|
|
if (tr && tr->asciicompat_type == asciicompat_decoder) {
|
|
data->ascii_compat_name = tr->dst_encoding;
|
|
return ST_STOP;
|
|
}
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
const char *
|
|
rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
|
|
{
|
|
st_data_t v;
|
|
st_table *table2;
|
|
struct asciicompat_encoding_t data;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
|
|
return NULL;
|
|
table2 = (st_table *)v;
|
|
|
|
/*
|
|
* Assumption:
|
|
* There are at most one transcoder for
|
|
* converting from ASCII incompatible encoding.
|
|
*
|
|
* For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
|
|
*/
|
|
if (table2->num_entries != 1)
|
|
return NULL;
|
|
|
|
data.ascii_incompat_name = ascii_incompat_name;
|
|
data.ascii_compat_name = NULL;
|
|
st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
|
|
return data.ascii_compat_name;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
|
|
{
|
|
unsigned const char *ss, *sp, *se;
|
|
unsigned char *ds, *dp, *de;
|
|
rb_econv_result_t res;
|
|
int max_output;
|
|
|
|
if (NIL_P(dst)) {
|
|
dst = rb_str_buf_new(len);
|
|
if (ec->destination_encoding)
|
|
rb_enc_associate(dst, ec->destination_encoding);
|
|
}
|
|
|
|
if (ec->last_tc)
|
|
max_output = ec->last_tc->transcoder->max_output;
|
|
else
|
|
max_output = 1;
|
|
|
|
res = econv_destination_buffer_full;
|
|
while (res == econv_destination_buffer_full) {
|
|
long dlen = RSTRING_LEN(dst);
|
|
if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
|
|
unsigned long new_capa = (unsigned long)dlen + len + max_output;
|
|
if (LONG_MAX < new_capa)
|
|
rb_raise(rb_eArgError, "too long string");
|
|
rb_str_resize(dst, new_capa);
|
|
rb_str_set_len(dst, dlen);
|
|
}
|
|
ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
|
|
se = ss + len;
|
|
ds = (unsigned char *)RSTRING_PTR(dst);
|
|
de = ds + rb_str_capacity(dst);
|
|
dp = ds += dlen;
|
|
res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
|
|
off += sp - ss;
|
|
len -= sp - ss;
|
|
rb_str_set_len(dst, dlen + (dp - ds));
|
|
rb_econv_check_error(ec);
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
|
|
}
|
|
|
|
static int
|
|
rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
|
|
{
|
|
transcoder_entry_t *entry;
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->started != 0)
|
|
return -1;
|
|
|
|
entry = get_transcoder_entry(sname, dname);
|
|
if (!entry)
|
|
return -1;
|
|
|
|
tr = load_transcoder_entry(entry);
|
|
if (!entry)
|
|
return -1;
|
|
|
|
return rb_econv_add_transcoder_at(ec, tr, n);
|
|
}
|
|
|
|
static int
|
|
rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
|
|
{
|
|
return rb_econv_add_converter(ec, "", decorator_name, n);
|
|
}
|
|
|
|
int
|
|
rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
|
|
{
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->num_trans == 0)
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
|
|
tr = ec->elems[0].tc->transcoder;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_decoder)
|
|
return rb_econv_decorate_at(ec, decorator_name, 1);
|
|
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
}
|
|
|
|
int
|
|
rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
|
|
{
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->num_trans == 0)
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
|
|
tr = ec->elems[ec->num_trans-1].tc->transcoder;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_encoder)
|
|
return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
|
|
|
|
return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
|
|
}
|
|
|
|
void
|
|
rb_econv_binmode(rb_econv_t *ec)
|
|
{
|
|
const rb_transcoder *trs[3];
|
|
int n, i, j;
|
|
transcoder_entry_t *entry;
|
|
int num_trans;
|
|
|
|
n = 0;
|
|
if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
|
|
entry = get_transcoder_entry("", "universal_newline");
|
|
if (entry->transcoder)
|
|
trs[n++] = entry->transcoder;
|
|
}
|
|
if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
|
|
entry = get_transcoder_entry("", "crlf_newline");
|
|
if (entry->transcoder)
|
|
trs[n++] = entry->transcoder;
|
|
}
|
|
if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
|
|
entry = get_transcoder_entry("", "cr_newline");
|
|
if (entry->transcoder)
|
|
trs[n++] = entry->transcoder;
|
|
}
|
|
|
|
num_trans = ec->num_trans;
|
|
j = 0;
|
|
for (i = 0; i < num_trans; i++) {
|
|
int k;
|
|
for (k = 0; k < n; k++)
|
|
if (trs[k] == ec->elems[i].tc->transcoder)
|
|
break;
|
|
if (k == n) {
|
|
ec->elems[j] = ec->elems[i];
|
|
j++;
|
|
}
|
|
else {
|
|
rb_transcoding_close(ec->elems[i].tc);
|
|
xfree(ec->elems[i].out_buf_start);
|
|
ec->num_trans--;
|
|
}
|
|
}
|
|
|
|
ec->flags &= ~(ECONV_UNIVERSAL_NEWLINE_DECORATOR|ECONV_CRLF_NEWLINE_DECORATOR|ECONV_CR_NEWLINE_DECORATOR);
|
|
|
|
}
|
|
|
|
static VALUE
|
|
econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
|
|
{
|
|
int has_description = 0;
|
|
|
|
if (NIL_P(mesg))
|
|
mesg = rb_str_new(NULL, 0);
|
|
|
|
if (*sname != '\0' || *dname != '\0') {
|
|
if (*sname == '\0')
|
|
rb_str_cat2(mesg, dname);
|
|
else if (*dname == '\0')
|
|
rb_str_cat2(mesg, sname);
|
|
else
|
|
rb_str_catf(mesg, "%s to %s", sname, dname);
|
|
has_description = 1;
|
|
}
|
|
|
|
if (ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
|
|
ECONV_CRLF_NEWLINE_DECORATOR|
|
|
ECONV_CR_NEWLINE_DECORATOR|
|
|
ECONV_XML_TEXT_DECORATOR|
|
|
ECONV_XML_ATTR_CONTENT_DECORATOR|
|
|
ECONV_XML_ATTR_QUOTE_DECORATOR)) {
|
|
const char *pre = "";
|
|
if (has_description)
|
|
rb_str_cat2(mesg, " with ");
|
|
if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "universal_newline");
|
|
}
|
|
if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "crlf_newline");
|
|
}
|
|
if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "cr_newline");
|
|
}
|
|
if (ecflags & ECONV_XML_TEXT_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_text");
|
|
}
|
|
if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_attr_content");
|
|
}
|
|
if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_attr_quote");
|
|
}
|
|
has_description = 1;
|
|
}
|
|
if (!has_description) {
|
|
rb_str_cat2(mesg, "no-conversion");
|
|
}
|
|
|
|
return mesg;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
VALUE mesg, exc;
|
|
mesg = rb_str_new_cstr("code converter not found (");
|
|
econv_description(sname, dname, ecflags, mesg);
|
|
rb_str_cat2(mesg, ")");
|
|
exc = rb_exc_new3(rb_eNoConverterError, mesg);
|
|
return exc;
|
|
}
|
|
|
|
static VALUE
|
|
make_econv_exception(rb_econv_t *ec)
|
|
{
|
|
VALUE mesg, exc;
|
|
if (ec->last_error.result == econv_invalid_byte_sequence ||
|
|
ec->last_error.result == econv_incomplete_input) {
|
|
const char *err = (const char *)ec->last_error.error_bytes_start;
|
|
size_t error_len = ec->last_error.error_bytes_len;
|
|
VALUE bytes = rb_str_new(err, error_len);
|
|
VALUE dumped = rb_str_dump(bytes);
|
|
size_t readagain_len = ec->last_error.readagain_len;
|
|
VALUE bytes2 = Qnil;
|
|
VALUE dumped2;
|
|
int idx;
|
|
if (ec->last_error.result == econv_incomplete_input) {
|
|
mesg = rb_sprintf("incomplete %s on %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
else if (readagain_len) {
|
|
bytes2 = rb_str_new(err+error_len, readagain_len);
|
|
dumped2 = rb_str_dump(bytes2);
|
|
mesg = rb_sprintf("%s followed by %s on %s",
|
|
StringValueCStr(dumped),
|
|
StringValueCStr(dumped2),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
else {
|
|
mesg = rb_sprintf("%s on %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
|
|
exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
|
|
rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
|
|
rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
|
|
rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
|
|
|
|
set_encs:
|
|
rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
|
|
rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
|
|
idx = rb_enc_find_index(ec->last_error.source_encoding);
|
|
if (0 <= idx)
|
|
rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
|
|
idx = rb_enc_find_index(ec->last_error.destination_encoding);
|
|
if (0 <= idx)
|
|
rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
|
|
return exc;
|
|
}
|
|
if (ec->last_error.result == econv_undefined_conversion) {
|
|
VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
|
|
ec->last_error.error_bytes_len);
|
|
VALUE dumped;
|
|
int idx;
|
|
dumped = rb_str_dump(bytes);
|
|
mesg = rb_sprintf("%s from %s to %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding,
|
|
ec->last_error.destination_encoding);
|
|
exc = rb_exc_new3(rb_eConversionUndefinedError, mesg);
|
|
idx = rb_enc_find_index(ec->last_error.source_encoding);
|
|
if (0 <= idx)
|
|
rb_enc_associate_index(bytes, idx);
|
|
rb_ivar_set(exc, rb_intern("error_char"), bytes);
|
|
goto set_encs;
|
|
}
|
|
return Qnil;
|
|
}
|
|
|
|
static void
|
|
more_output_buffer(
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, int, int),
|
|
int max_output,
|
|
unsigned char **out_start_ptr,
|
|
unsigned char **out_pos,
|
|
unsigned char **out_stop_ptr)
|
|
{
|
|
size_t len = (*out_pos - *out_start_ptr);
|
|
size_t new_len = (len + max_output) * 2;
|
|
*out_start_ptr = resize_destination(destination, len, new_len);
|
|
*out_pos = *out_start_ptr + len;
|
|
*out_stop_ptr = *out_start_ptr + new_len;
|
|
}
|
|
|
|
static int
|
|
make_replacement(rb_econv_t *ec)
|
|
{
|
|
rb_transcoding *tc;
|
|
const rb_transcoder *tr;
|
|
rb_encoding *enc;
|
|
const unsigned char *replacement;
|
|
const char *repl_enc;
|
|
const char *ins_enc;
|
|
size_t len;
|
|
|
|
if (ec->replacement_str)
|
|
return 0;
|
|
|
|
ins_enc = rb_econv_encoding_to_insert_output(ec);
|
|
|
|
tc = ec->last_tc;
|
|
if (*ins_enc) {
|
|
tr = tc->transcoder;
|
|
enc = rb_enc_find(tr->dst_encoding);
|
|
replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
|
|
}
|
|
else {
|
|
replacement = (unsigned char *)"?";
|
|
len = 1;
|
|
repl_enc = "";
|
|
}
|
|
|
|
ec->replacement_str = replacement;
|
|
ec->replacement_len = len;
|
|
ec->replacement_enc = repl_enc;
|
|
ec->replacement_allocated = 0;
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
rb_econv_set_replacement(rb_econv_t *ec,
|
|
const unsigned char *str, size_t len, const char *encname)
|
|
{
|
|
unsigned char *str2;
|
|
size_t len2;
|
|
const char *encname2;
|
|
|
|
encname2 = rb_econv_encoding_to_insert_output(ec);
|
|
|
|
if (encoding_equal(encname, encname2)) {
|
|
str2 = xmalloc(len);
|
|
MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
|
|
len2 = len;
|
|
encname2 = encname;
|
|
}
|
|
else {
|
|
str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
|
|
if (!str2)
|
|
return -1;
|
|
}
|
|
|
|
if (ec->replacement_allocated) {
|
|
xfree((void *)ec->replacement_str);
|
|
}
|
|
ec->replacement_allocated = 1;
|
|
ec->replacement_str = str2;
|
|
ec->replacement_len = len2;
|
|
ec->replacement_enc = encname2;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
output_replacement_character(rb_econv_t *ec)
|
|
{
|
|
int ret;
|
|
|
|
if (make_replacement(ec) == -1)
|
|
return -1;
|
|
|
|
ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
|
|
if (ret == -1)
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if 1
|
|
static void
|
|
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, int, int),
|
|
const char *src_encoding,
|
|
const char *dst_encoding,
|
|
int ecflags,
|
|
VALUE ecopts)
|
|
{
|
|
rb_econv_t *ec;
|
|
rb_transcoding *last_tc;
|
|
rb_econv_result_t ret;
|
|
unsigned char *out_start = *out_pos;
|
|
int max_output;
|
|
VALUE exc;
|
|
|
|
ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
|
|
if (!ec)
|
|
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
|
|
|
|
last_tc = ec->last_tc;
|
|
max_output = last_tc ? last_tc->transcoder->max_output : 1;
|
|
|
|
resume:
|
|
ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
|
|
|
|
if (ret == econv_invalid_byte_sequence ||
|
|
ret == econv_incomplete_input ||
|
|
ret == econv_undefined_conversion) {
|
|
exc = make_econv_exception(ec);
|
|
rb_econv_close(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret == econv_destination_buffer_full) {
|
|
more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
|
|
goto resume;
|
|
}
|
|
|
|
rb_econv_close(ec);
|
|
return;
|
|
}
|
|
#else
|
|
/* sample transcode_loop implementation in byte-by-byte stream style */
|
|
static void
|
|
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, int, int),
|
|
const char *src_encoding,
|
|
const char *dst_encoding,
|
|
int ecflags,
|
|
VALUE ecopts)
|
|
{
|
|
rb_econv_t *ec;
|
|
rb_transcoding *last_tc;
|
|
rb_econv_result_t ret;
|
|
unsigned char *out_start = *out_pos;
|
|
const unsigned char *ptr;
|
|
int max_output;
|
|
VALUE exc;
|
|
|
|
ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
|
|
if (!ec)
|
|
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
|
|
|
|
last_tc = ec->last_tc;
|
|
max_output = last_tc ? last_tc->transcoder->max_output : 1;
|
|
|
|
ret = econv_source_buffer_empty;
|
|
ptr = *in_pos;
|
|
while (ret != econv_finished) {
|
|
unsigned char input_byte;
|
|
const unsigned char *p = &input_byte;
|
|
|
|
if (ret == econv_source_buffer_empty) {
|
|
if (ptr < in_stop) {
|
|
input_byte = *ptr;
|
|
ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
|
|
}
|
|
else {
|
|
ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
|
|
}
|
|
}
|
|
else {
|
|
ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
|
|
}
|
|
if (&input_byte != p)
|
|
ptr += p - &input_byte;
|
|
switch (ret) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
exc = make_econv_exception(ec);
|
|
rb_econv_close(ec);
|
|
rb_exc_raise(exc);
|
|
break;
|
|
|
|
case econv_destination_buffer_full:
|
|
more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
|
|
break;
|
|
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
case econv_finished:
|
|
break;
|
|
}
|
|
}
|
|
rb_econv_close(ec);
|
|
*in_pos = in_stop;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
|
|
/*
|
|
* String-specific code
|
|
*/
|
|
|
|
static unsigned char *
|
|
str_transcoding_resize(VALUE destination, int len, int new_len)
|
|
{
|
|
rb_str_resize(destination, new_len);
|
|
return (unsigned char *)RSTRING_PTR(destination);
|
|
}
|
|
|
|
static int
|
|
econv_opts(VALUE opt)
|
|
{
|
|
VALUE v;
|
|
int ecflags = 0;
|
|
|
|
v = rb_hash_aref(opt, sym_invalid);
|
|
if (NIL_P(v)) {
|
|
}
|
|
else if (v==sym_replace) {
|
|
ecflags |= ECONV_INVALID_REPLACE;
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unknown value for invalid character option");
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_undef);
|
|
if (NIL_P(v)) {
|
|
}
|
|
else if (v==sym_replace) {
|
|
ecflags |= ECONV_UNDEF_REPLACE;
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unknown value for undefined character option");
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_xml);
|
|
if (!NIL_P(v)) {
|
|
if (v==sym_text) {
|
|
ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
|
|
}
|
|
else if (v==sym_attr) {
|
|
ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
|
|
}
|
|
else if (TYPE(v) == T_SYMBOL) {
|
|
rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unexpected value for xml option");
|
|
}
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_universal_newline);
|
|
if (RTEST(v))
|
|
ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
|
|
|
|
v = rb_hash_aref(opt, sym_crlf_newline);
|
|
if (RTEST(v))
|
|
ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
|
|
|
|
v = rb_hash_aref(opt, sym_cr_newline);
|
|
if (RTEST(v))
|
|
ecflags |= ECONV_CR_NEWLINE_DECORATOR;
|
|
|
|
return ecflags;
|
|
}
|
|
|
|
int
|
|
rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
|
|
{
|
|
int ecflags;
|
|
VALUE newhash = Qnil;
|
|
if (NIL_P(opthash)) {
|
|
*opts = Qnil;
|
|
return 0;
|
|
}
|
|
ecflags = econv_opts(opthash);
|
|
|
|
if ((ecflags & ECONV_INVALID_MASK) == ECONV_INVALID_REPLACE ||
|
|
(ecflags & ECONV_UNDEF_MASK) == ECONV_UNDEF_REPLACE) {
|
|
VALUE v = rb_hash_aref(opthash, sym_replace);
|
|
if (!NIL_P(v)) {
|
|
StringValue(v);
|
|
if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
|
|
VALUE dumped = rb_str_dump(v);
|
|
rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
|
|
StringValueCStr(dumped),
|
|
rb_enc_name(rb_enc_get(v)));
|
|
}
|
|
v = rb_str_new_frozen(v);
|
|
newhash = rb_hash_new();
|
|
rb_hash_aset(newhash, sym_replace, v);
|
|
}
|
|
}
|
|
if (!NIL_P(newhash))
|
|
rb_hash_freeze(newhash);
|
|
*opts = newhash;
|
|
|
|
return ecflags;
|
|
}
|
|
|
|
rb_econv_t *
|
|
rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
|
|
{
|
|
rb_econv_t *ec;
|
|
VALUE replacement;
|
|
|
|
if (NIL_P(opthash)) {
|
|
replacement = Qnil;
|
|
}
|
|
else {
|
|
if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
|
|
rb_bug("rb_econv_open_opts called with invalid opthash");
|
|
replacement = rb_hash_aref(opthash, sym_replace);
|
|
}
|
|
|
|
ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
|
|
if (!ec)
|
|
return ec;
|
|
|
|
if (!NIL_P(replacement)) {
|
|
int ret;
|
|
rb_encoding *enc = rb_enc_get(replacement);
|
|
|
|
ret = rb_econv_set_replacement(ec,
|
|
(const unsigned char *)RSTRING_PTR(replacement),
|
|
RSTRING_LEN(replacement),
|
|
rb_enc_name(enc));
|
|
if (ret == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
}
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
|
|
{
|
|
rb_encoding *enc;
|
|
const char *n;
|
|
int encidx;
|
|
VALUE encval;
|
|
|
|
if ((encidx = rb_to_encoding_index(encval = *arg)) < 0) {
|
|
enc = NULL;
|
|
encidx = 0;
|
|
n = StringValueCStr(*arg);
|
|
}
|
|
else {
|
|
enc = rb_enc_from_index(encidx);
|
|
n = rb_enc_name(enc);
|
|
}
|
|
|
|
*name_p = n;
|
|
*enc_p = enc;
|
|
|
|
return encidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
|
|
const char **sname_p, rb_encoding **senc_p,
|
|
const char **dname_p, rb_encoding **denc_p)
|
|
{
|
|
rb_encoding *senc, *denc;
|
|
const char *sname, *dname;
|
|
int sencidx, dencidx;
|
|
|
|
dencidx = enc_arg(arg1, &dname, &denc);
|
|
|
|
if (NIL_P(*arg2)) {
|
|
sencidx = rb_enc_get_index(str);
|
|
senc = rb_enc_from_index(sencidx);
|
|
sname = rb_enc_name(senc);
|
|
}
|
|
else {
|
|
sencidx = enc_arg(arg2, &sname, &senc);
|
|
}
|
|
|
|
*sname_p = sname;
|
|
*senc_p = senc;
|
|
*dname_p = dname;
|
|
*denc_p = denc;
|
|
return dencidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
|
|
{
|
|
VALUE dest;
|
|
VALUE str = *self;
|
|
volatile VALUE arg1, arg2;
|
|
long blen, slen;
|
|
unsigned char *buf, *bp, *sp;
|
|
const unsigned char *fromp;
|
|
rb_encoding *senc, *denc;
|
|
const char *sname, *dname;
|
|
int dencidx;
|
|
|
|
if (argc < 1 || argc > 2) {
|
|
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
|
|
}
|
|
|
|
arg1 = argv[0];
|
|
arg2 = argc==1 ? Qnil : argv[1];
|
|
dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
|
|
|
|
if ((ecflags & (ECONV_UNIVERSAL_NEWLINE_DECORATOR|
|
|
ECONV_CRLF_NEWLINE_DECORATOR|
|
|
ECONV_CR_NEWLINE_DECORATOR|
|
|
ECONV_XML_TEXT_DECORATOR|
|
|
ECONV_XML_ATTR_CONTENT_DECORATOR|
|
|
ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
|
|
if (senc && senc == denc) {
|
|
return -1;
|
|
}
|
|
if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
|
|
if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
|
|
return dencidx;
|
|
}
|
|
}
|
|
if (encoding_equal(sname, dname)) {
|
|
return -1;
|
|
}
|
|
}
|
|
else {
|
|
if (encoding_equal(sname, dname)) {
|
|
sname = "";
|
|
dname = "";
|
|
}
|
|
}
|
|
|
|
fromp = sp = (unsigned char *)RSTRING_PTR(str);
|
|
slen = RSTRING_LEN(str);
|
|
blen = slen + 30; /* len + margin */
|
|
dest = rb_str_tmp_new(blen);
|
|
bp = (unsigned char *)RSTRING_PTR(dest);
|
|
|
|
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
|
|
if (fromp != sp+slen) {
|
|
rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
|
|
}
|
|
buf = (unsigned char *)RSTRING_PTR(dest);
|
|
*bp = '\0';
|
|
rb_str_set_len(dest, bp - buf);
|
|
|
|
/* set encoding */
|
|
if (!denc) {
|
|
dencidx = rb_define_dummy_encoding(dname);
|
|
}
|
|
*self = dest;
|
|
|
|
return dencidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode(int argc, VALUE *argv, VALUE *self)
|
|
{
|
|
VALUE opt;
|
|
int ecflags = 0;
|
|
VALUE ecopts = Qnil;
|
|
|
|
if (0 < argc) {
|
|
opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
|
|
if (!NIL_P(opt)) {
|
|
argc--;
|
|
ecflags = rb_econv_prepare_opts(opt, &ecopts);
|
|
}
|
|
}
|
|
return str_transcode0(argc, argv, self, ecflags, ecopts);
|
|
}
|
|
|
|
static inline VALUE
|
|
str_encode_associate(VALUE str, int encidx)
|
|
{
|
|
int cr = 0;
|
|
|
|
rb_enc_associate_index(str, encidx);
|
|
|
|
/* transcoded string never be broken. */
|
|
if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
|
|
rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
|
|
}
|
|
else {
|
|
cr = ENC_CODERANGE_VALID;
|
|
}
|
|
ENC_CODERANGE_SET(str, cr);
|
|
return str;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* str.encode!(encoding [, options] ) => str
|
|
* str.encode!(dst_encoding, src_encoding [, options] ) => str
|
|
*
|
|
* The first form transcodes the contents of <i>str</i> from
|
|
* str.encoding to +encoding+.
|
|
* The second form transcodes the contents of <i>str</i> from
|
|
* src_encoding to dst_encoding.
|
|
* The options Hash gives details for conversion. See String#encode
|
|
* for details.
|
|
* Returns the string even if no changes were made.
|
|
*/
|
|
|
|
static VALUE
|
|
str_encode_bang(int argc, VALUE *argv, VALUE str)
|
|
{
|
|
VALUE newstr = str;
|
|
int encidx = str_transcode(argc, argv, &newstr);
|
|
|
|
if (encidx < 0) return str;
|
|
rb_str_shared_replace(str, newstr);
|
|
return str_encode_associate(str, encidx);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* str.encode(encoding [, options] ) => str
|
|
* str.encode(dst_encoding, src_encoding [, options] ) => str
|
|
*
|
|
* The first form returns a copy of <i>str</i> transcoded
|
|
* to encoding +encoding+.
|
|
* The second form returns a copy of <i>str</i> transcoded
|
|
* from src_encoding to dst_encoding.
|
|
* The options Hash gives details for conversion. Details
|
|
* to be added.
|
|
*/
|
|
|
|
static VALUE
|
|
str_encode(int argc, VALUE *argv, VALUE str)
|
|
{
|
|
VALUE newstr = str;
|
|
int encidx = str_transcode(argc, argv, &newstr);
|
|
|
|
if (encidx < 0) return rb_str_dup(str);
|
|
if (newstr == str) {
|
|
newstr = rb_str_dup(str);
|
|
}
|
|
else {
|
|
RBASIC(newstr)->klass = rb_obj_class(str);
|
|
}
|
|
return str_encode_associate(newstr, encidx);
|
|
}
|
|
|
|
VALUE
|
|
rb_str_transcode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
|
|
{
|
|
int argc = 1;
|
|
VALUE *argv = &to;
|
|
VALUE newstr = str;
|
|
int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
|
|
|
|
if (encidx < 0) return rb_str_dup(str);
|
|
RBASIC(newstr)->klass = rb_obj_class(str);
|
|
return str_encode_associate(newstr, encidx);
|
|
}
|
|
|
|
static void
|
|
econv_free(rb_econv_t *ec)
|
|
{
|
|
rb_econv_close(ec);
|
|
}
|
|
|
|
static VALUE
|
|
econv_s_allocate(VALUE klass)
|
|
{
|
|
return Data_Wrap_Struct(klass, NULL, econv_free, NULL);
|
|
}
|
|
|
|
/* Defines a dummy encoding called `name` and returns its rb_encoding. */
static rb_encoding *
make_dummy_encoding(const char *name)
{
    return rb_enc_from_index(rb_define_dummy_encoding(name));
}
|
|
|
|
/*
 * Looks up the encoding called `name`; when rb_enc_find() yields nothing,
 * a dummy encoding of that name is defined and returned instead.
 */
static rb_encoding *
make_encoding(const char *name)
{
    rb_encoding *found = rb_enc_find(name);

    return found ? found : make_dummy_encoding(name);
}
|
|
|
|
static VALUE
|
|
make_encobj(const char *name)
|
|
{
|
|
return rb_enc_from_encoding(make_encoding(name));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.asciicompat_encoding(string) => encoding or nil
|
|
* Encoding::Converter.asciicompat_encoding(encoding) => encoding or nil
|
|
*
|
|
* returns the corresponding ASCII compatible encoding.
|
|
*
|
|
* It returns nil if the argument is an ASCII compatible encoding.
|
|
*
|
|
* "corresponding ASCII compatible encoding" is a ASCII compatible encoding which
|
|
* represents same characters in the given ASCII incompatible encoding.
|
|
*
|
|
* So, no conversion undefined error occur between the ASCII compatible and incompatible encoding.
|
|
*
|
|
* Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
* Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
* Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
|
|
{
|
|
const char *arg_name, *result_name;
|
|
rb_encoding *arg_enc, *result_enc;
|
|
|
|
enc_arg(&arg, &arg_name, &arg_enc);
|
|
|
|
result_name = rb_econv_asciicompat_encoding(arg_name);
|
|
|
|
if (result_name == NULL)
|
|
return Qnil;
|
|
|
|
result_enc = make_encoding(result_name);
|
|
|
|
return rb_enc_from_encoding(result_enc);
|
|
}
|
|
|
|
static void
|
|
econv_args(int argc, VALUE *argv,
|
|
volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
|
|
const char **sname_p, const char **dname_p,
|
|
rb_encoding **senc_p, rb_encoding **denc_p,
|
|
int *ecflags_p,
|
|
VALUE *ecopts_p)
|
|
{
|
|
VALUE opt, opthash, flags_v, ecopts;
|
|
int sidx, didx;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
int ecflags;
|
|
|
|
rb_scan_args(argc, argv, "21", snamev_p, dnamev_p, &opt);
|
|
|
|
if (NIL_P(opt)) {
|
|
ecflags = 0;
|
|
ecopts = Qnil;
|
|
}
|
|
else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
|
|
ecflags = NUM2INT(flags_v);
|
|
ecopts = Qnil;
|
|
}
|
|
else {
|
|
opthash = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
|
|
ecflags = rb_econv_prepare_opts(opthash, &ecopts);
|
|
}
|
|
|
|
senc = NULL;
|
|
sidx = rb_to_encoding_index(*snamev_p);
|
|
if (0 <= sidx) {
|
|
senc = rb_enc_from_index(sidx);
|
|
}
|
|
else {
|
|
StringValue(*snamev_p);
|
|
}
|
|
|
|
denc = NULL;
|
|
didx = rb_to_encoding_index(*dnamev_p);
|
|
if (0 <= didx) {
|
|
denc = rb_enc_from_index(didx);
|
|
}
|
|
else {
|
|
StringValue(*dnamev_p);
|
|
}
|
|
|
|
sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
|
|
dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
|
|
|
|
*sname_p = sname;
|
|
*dname_p = dname;
|
|
*senc_p = senc;
|
|
*denc_p = denc;
|
|
*ecflags_p = ecflags;
|
|
*ecopts_p = ecopts;
|
|
}
|
|
|
|
static int
|
|
decorate_convpath(VALUE convpath, int ecflags)
|
|
{
|
|
int num_decorators;
|
|
const char *decorators[MAX_ECFLAGS_DECORATORS];
|
|
int i;
|
|
int n, len;
|
|
|
|
num_decorators = decorator_names(ecflags, decorators);
|
|
if (num_decorators == -1)
|
|
return -1;
|
|
|
|
len = n = RARRAY_LEN(convpath);
|
|
if (n != 0) {
|
|
VALUE pair = RARRAY_PTR(convpath)[n-1];
|
|
const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
|
|
const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
|
|
transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
|
|
const rb_transcoder *tr = load_transcoder_entry(entry);
|
|
if (!tr)
|
|
return -1;
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_encoder) {
|
|
n--;
|
|
rb_ary_store(convpath, len + num_decorators - 1, pair);
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < num_decorators; i++)
|
|
rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
VALUE *ary_p = arg;
|
|
VALUE v;
|
|
|
|
if (*ary_p == Qnil) {
|
|
*ary_p = rb_ary_new();
|
|
}
|
|
|
|
if (DECORATOR_P(sname, dname)) {
|
|
v = rb_str_new_cstr(dname);
|
|
}
|
|
else {
|
|
v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
|
|
}
|
|
rb_ary_store(*ary_p, depth, v);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
|
|
* Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
|
|
*
|
|
* returns the conversion path.
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
|
|
* # "universal_newline"]
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # "universal_newline",
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
|
|
*/
|
|
static VALUE
|
|
econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
|
|
{
|
|
volatile VALUE snamev, dnamev;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
int ecflags;
|
|
VALUE ecopts;
|
|
VALUE convpath;
|
|
|
|
econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
|
|
|
|
convpath = Qnil;
|
|
transcode_search_path(sname, dname, search_convpath_i, &convpath);
|
|
|
|
if (NIL_P(convpath))
|
|
rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
|
|
|
|
if (decorate_convpath(convpath, ecflags) == -1)
|
|
rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
|
|
|
|
return convpath;
|
|
}
|
|
|
|
struct rb_econv_init_by_convpath_t {
|
|
rb_econv_t *ec;
|
|
int index;
|
|
int ret;
|
|
};
|
|
|
|
void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
|
|
int ret;
|
|
|
|
if (a->ret == -1)
|
|
return;
|
|
|
|
ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
|
|
|
|
a->ret = ret;
|
|
return;
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_init_by_convpath(VALUE self, VALUE convpath,
|
|
const char **sname_p, const char **dname_p,
|
|
rb_encoding **senc_p, rb_encoding**denc_p)
|
|
{
|
|
rb_econv_t *ec;
|
|
long i;
|
|
int ret, first=1;
|
|
VALUE elt;
|
|
rb_encoding *senc, *denc;
|
|
const char *sname, *dname;
|
|
|
|
ec = rb_econv_alloc(RARRAY_LEN(convpath));
|
|
DATA_PTR(self) = ec;
|
|
|
|
for (i = 0; i < RARRAY_LEN(convpath); i++) {
|
|
volatile VALUE snamev, dnamev;
|
|
VALUE pair;
|
|
elt = rb_ary_entry(convpath, i);
|
|
if (!NIL_P(pair = rb_check_array_type(elt))) {
|
|
if (RARRAY_LEN(pair) != 2)
|
|
rb_raise(rb_eArgError, "not a 2-element array in convpath");
|
|
snamev = rb_ary_entry(pair, 0);
|
|
enc_arg(&snamev, &sname, &senc);
|
|
dnamev = rb_ary_entry(pair, 1);
|
|
enc_arg(&dnamev, &dname, &denc);
|
|
}
|
|
else {
|
|
sname = "";
|
|
dname = StringValueCStr(elt);
|
|
}
|
|
if (DECORATOR_P(sname, dname)) {
|
|
ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
|
|
if (ret == -1)
|
|
rb_raise(rb_eArgError, "decoration failed: %s", dname);
|
|
}
|
|
else {
|
|
int j = ec->num_trans;
|
|
struct rb_econv_init_by_convpath_t arg;
|
|
arg.ec = ec;
|
|
arg.index = ec->num_trans;
|
|
arg.ret = 0;
|
|
ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
|
|
if (ret == -1 || arg.ret == -1)
|
|
rb_raise(rb_eArgError, "conversion add failed: %s to %s", sname, dname);
|
|
if (first) {
|
|
first = 0;
|
|
*senc_p = senc;
|
|
*sname_p = ec->elems[j].tc->transcoder->src_encoding;
|
|
}
|
|
*denc_p = denc;
|
|
*dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
|
|
}
|
|
}
|
|
|
|
if (first) {
|
|
*senc_p = NULL;
|
|
*denc_p = NULL;
|
|
*sname_p = "";
|
|
*dname_p = "";
|
|
}
|
|
|
|
ec->source_encoding_name = *sname_p;
|
|
ec->destination_encoding_name = *dname_p;
|
|
|
|
return ec;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.new(source_encoding, destination_encoding)
|
|
* Encoding::Converter.new(source_encoding, destination_encoding, opt)
|
|
* Encoding::Converter.new(convpath)
|
|
*
|
|
* possible options elements:
|
|
* hash form:
|
|
* :invalid => nil # error on invalid byte sequence (default)
|
|
* :invalid => :replace # replace invalid byte sequence
|
|
* :undef => nil # error on undefined conversion (default)
|
|
* :undef => :replace # replace undefined conversion
|
|
* :replace => string # replacement string ("?" or "\uFFFD" if not specified)
|
|
* :universal_newline => true # decorator for converting CRLF and CR to LF
|
|
* :crlf_newline => true # decorator for converting LF to CRLF
|
|
* :cr_newline => true # decorator for converting LF to CR
|
|
* integer form:
|
|
* Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
|
|
* Encoding::Converter::CRLF_NEWLINE_DECORATOR
|
|
* Encoding::Converter::CR_NEWLINE_DECORATOR
|
|
*
|
|
* Encoding::Converter.new creates an instance of Encoding::Converter.
|
|
*
|
|
* source_encoding and destination_encoding should be a string or
|
|
* Encoding object.
|
|
*
|
|
* opt should be nil, a hash or an integer.
|
|
*
|
|
* convpath should be an array.
|
|
* convpath should contain
|
|
* - two-element array which contains encoding or encoding name, or
|
|
* - a string of decorator name.
|
|
*
|
|
* example:
|
|
* # UTF-16BE to UTF-8
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
|
|
*
|
|
* # Usually, decorators such as newline conversion are inserted at last.
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
|
|
* p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
|
|
* # "universal_newline"]
|
|
*
|
|
* # But, if the last encoding is ASCII incompatible,
|
|
* # decorators are inserted before the last conversion.
|
|
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
|
|
* p ec.convpath #=> ["crlf_newline",
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
|
|
*
|
|
* # conversion path can be specified directly.
|
|
* ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
|
|
* p ec.convpath #=> ["universal_newline",
|
|
* # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
|
|
*/
|
|
static VALUE
|
|
econv_init(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
VALUE ecopts;
|
|
volatile VALUE snamev, dnamev;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
rb_econv_t *ec;
|
|
int ecflags;
|
|
VALUE convpath;
|
|
|
|
if (DATA_PTR(self)) {
|
|
rb_raise(rb_eTypeError, "already initialized");
|
|
}
|
|
|
|
if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
|
|
ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
|
|
ecflags = 0;
|
|
ecopts = Qnil;
|
|
}
|
|
else {
|
|
econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
|
|
ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
|
|
}
|
|
|
|
if (!ec) {
|
|
rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
|
|
}
|
|
|
|
if (!DECORATOR_P(sname, dname)) {
|
|
if (!senc)
|
|
senc = make_dummy_encoding(sname);
|
|
if (!denc)
|
|
denc = make_dummy_encoding(dname);
|
|
}
|
|
|
|
ec->source_encoding = senc;
|
|
ec->destination_encoding = denc;
|
|
|
|
DATA_PTR(self) = ec;
|
|
|
|
return self;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.inspect -> string
|
|
*
|
|
* Returns a printable version of <i>ec</i>
|
|
*
|
|
* ec = Encoding::Converter.new("iso-8859-1", "utf-8")
|
|
* puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_inspect(VALUE self)
|
|
{
|
|
const char *cname = rb_obj_classname(self);
|
|
rb_econv_t *ec = DATA_PTR(self);
|
|
|
|
if (!ec)
|
|
return rb_sprintf("#<%s: uninitialized>", cname);
|
|
else {
|
|
const char *sname = ec->source_encoding_name;
|
|
const char *dname = ec->destination_encoding_name;
|
|
VALUE str;
|
|
str = rb_sprintf("#<%s: ", cname);
|
|
econv_description(sname, dname, ec->flags, str);
|
|
rb_str_cat2(str, ">");
|
|
return str;
|
|
}
|
|
}
|
|
|
|
#define IS_ECONV(obj) (RDATA(obj)->dfree == (RUBY_DATA_FUNC)econv_free)
|
|
|
|
static rb_econv_t *
|
|
check_econv(VALUE self)
|
|
{
|
|
Check_Type(self, T_DATA);
|
|
if (!IS_ECONV(self)) {
|
|
rb_raise(rb_eTypeError, "wrong argument type %s (expected Encoding::Converter)",
|
|
rb_class2name(CLASS_OF(self)));
|
|
}
|
|
if (!DATA_PTR(self)) {
|
|
rb_raise(rb_eTypeError, "uninitialized encoding converter");
|
|
}
|
|
return DATA_PTR(self);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.source_encoding -> encoding
|
|
*
|
|
* returns the source encoding as an Encoding object.
|
|
*/
|
|
static VALUE
|
|
econv_source_encoding(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
if (!ec->source_encoding)
|
|
return Qnil;
|
|
return rb_enc_from_encoding(ec->source_encoding);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.destination_encoding -> encoding
|
|
*
|
|
* returns the destination encoding as an Encoding object.
|
|
*/
|
|
static VALUE
|
|
econv_destination_encoding(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
if (!ec->destination_encoding)
|
|
return Qnil;
|
|
return rb_enc_from_encoding(ec->destination_encoding);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.convpath -> ary
|
|
*
|
|
* returns the conversion path of ec.
|
|
*
|
|
* The result is an array of conversions.
|
|
*
|
|
* ec = Encoding::Converter.new("ISo-8859-1", "EUC-JP", crlf_newline: true)
|
|
* p ec.convpath
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
|
|
* # "crlf_newline"]
|
|
*
|
|
* A element of the array is a pair of encodings or a string.
|
|
* The pair means encoding conversion.
|
|
* The string means decorator.
|
|
*
|
|
* In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
|
|
* a converter from ISO-8859-1 to UTF-8.
|
|
* "crlf_newline" means newline converter from LF to CRLF.
|
|
*/
|
|
static VALUE
|
|
econv_convpath(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE result;
|
|
int i;
|
|
|
|
result = rb_ary_new();
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
const rb_transcoder *tr = ec->elems[i].tc->transcoder;
|
|
VALUE v;
|
|
if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
|
|
v = rb_str_new_cstr(tr->dst_encoding);
|
|
else
|
|
v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
|
|
rb_ary_push(result, v);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static VALUE
|
|
econv_result_to_symbol(rb_econv_result_t res)
|
|
{
|
|
switch (res) {
|
|
case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
|
|
case econv_incomplete_input: return sym_incomplete_input;
|
|
case econv_undefined_conversion: return sym_undefined_conversion;
|
|
case econv_destination_buffer_full: return sym_destination_buffer_full;
|
|
case econv_source_buffer_empty: return sym_source_buffer_empty;
|
|
case econv_finished: return sym_finished;
|
|
case econv_after_output: return sym_after_output;
|
|
default: return INT2NUM(res); /* should not be reached */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.primitive_convert(source_buffer, destination_buffer) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
|
|
*
|
|
* possible opt elements:
|
|
* hash form:
|
|
* :partial_input => true # source buffer may be part of larger source
|
|
* :after_output => true # stop conversion after output before input
|
|
* integer form:
|
|
* Encoding::Converter::PARTIAL_INPUT
|
|
* Encoding::Converter::AFTER_OUTPUT
|
|
*
|
|
* possible results:
|
|
* :invalid_byte_sequence
|
|
* :incomplete_input
|
|
* :undefined_conversion
|
|
* :after_output
|
|
* :destination_buffer_full
|
|
* :source_buffer_empty
|
|
* :finished
|
|
*
|
|
* primitive_convert converts source_buffer into destination_buffer.
|
|
*
|
|
* source_buffer should be a string or nil.
|
|
* nil means an empty string.
|
|
*
|
|
* destination_buffer should be a string.
|
|
*
|
|
* destination_byteoffset should be an integer or nil.
|
|
* nil means the end of destination_buffer.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* destination_bytesize should be an integer or nil.
|
|
* nil means unlimited.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* opt should be nil, a hash or an integer.
|
|
* nil means no flags.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* primitive_convert converts the content of source_buffer from beginning
|
|
* and store the result into destination_buffer.
|
|
*
|
|
* destination_byteoffset and destination_bytesize specify the region which
|
|
* the converted result is stored.
|
|
* destination_byteoffset specifies the start position in destination_buffer in bytes.
|
|
* If destination_byteoffset is nil,
|
|
* destination_buffer.bytesize is used for appending the result.
|
|
* destination_bytesize specifies maximum number of bytes.
|
|
* If destination_bytesize is nil,
|
|
* destination size is unlimited.
|
|
* After conversion, destination_buffer is resized to
|
|
* destination_byteoffset + actually produced number of bytes.
|
|
* Also destination_buffer's encoding is set to destination_encoding.
|
|
*
|
|
* primitive_convert drops the converted part of source_buffer.
|
|
* the dropped part is converted in destination_buffer or
|
|
* buffered in Encoding::Converter object.
|
|
*
|
|
* primitive_convert stops conversion when one of following condition met.
|
|
* - invalid byte sequence found in source buffer (:invalid_byte_sequence)
|
|
* - unexpected end of source buffer (:incomplete_input)
|
|
* this occur only when :partial_input is not specified.
|
|
* - character not representable in output encoding (:undefined_conversion)
|
|
* - after some output is generated, before input is done (:after_output)
|
|
* this occur only when :after_output is specified.
|
|
* - destination buffer is full (:destination_buffer_full)
|
|
* this occur only when destination_bytesize is non-nil.
|
|
* - source buffer is empty (:source_buffer_empty)
|
|
* this occur only when :partial_input is specified.
|
|
* - conversion is finished (:finished)
|
|
*
|
|
* example:
|
|
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
|
|
* ret = ec.primitive_convert(src="pi", dst="", nil, 100)
* p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
*
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
* ret = ec.primitive_convert(src="pi", dst="", nil, 1)
* p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
* ret = ec.primitive_convert(src, dst="", nil, 1)
* p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
* ret = ec.primitive_convert(src, dst="", nil, 1)
* p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
* ret = ec.primitive_convert(src, dst="", nil, 1)
|
|
* p [ret, src, dst] #=> [:finished, "", "i"]
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
|
|
rb_econv_t *ec = check_econv(self);
|
|
rb_econv_result_t res;
|
|
const unsigned char *ip, *is;
|
|
unsigned char *op, *os;
|
|
long output_byteoffset, output_bytesize;
|
|
unsigned long output_byteend;
|
|
int flags;
|
|
|
|
rb_scan_args(argc, argv, "23", &input, &output, &output_byteoffset_v, &output_bytesize_v, &opt);
|
|
|
|
if (NIL_P(output_byteoffset_v))
|
|
output_byteoffset = 0; /* dummy */
|
|
else
|
|
output_byteoffset = NUM2LONG(output_byteoffset_v);
|
|
|
|
if (NIL_P(output_bytesize_v))
|
|
output_bytesize = 0; /* dummy */
|
|
else
|
|
output_bytesize = NUM2LONG(output_bytesize_v);
|
|
|
|
if (NIL_P(opt)) {
|
|
flags = 0;
|
|
}
|
|
else if (!NIL_P(flags_v = rb_check_to_integer(opt, "to_int"))) {
|
|
flags = NUM2INT(flags_v);
|
|
}
|
|
else {
|
|
VALUE v;
|
|
opt = rb_convert_type(opt, T_HASH, "Hash", "to_hash");
|
|
flags = 0;
|
|
v = rb_hash_aref(opt, sym_partial_input);
|
|
if (RTEST(v))
|
|
flags |= ECONV_PARTIAL_INPUT;
|
|
v = rb_hash_aref(opt, sym_after_output);
|
|
if (RTEST(v))
|
|
flags |= ECONV_AFTER_OUTPUT;
|
|
}
|
|
|
|
StringValue(output);
|
|
if (!NIL_P(input))
|
|
StringValue(input);
|
|
rb_str_modify(output);
|
|
|
|
if (NIL_P(output_bytesize_v)) {
|
|
output_bytesize = RSTRING_EMBED_LEN_MAX;
|
|
if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
|
|
output_bytesize = RSTRING_LEN(input);
|
|
}
|
|
|
|
retry:
|
|
|
|
if (NIL_P(output_byteoffset_v))
|
|
output_byteoffset = RSTRING_LEN(output);
|
|
|
|
if (output_byteoffset < 0)
|
|
rb_raise(rb_eArgError, "negative output_byteoffset");
|
|
|
|
if (RSTRING_LEN(output) < output_byteoffset)
|
|
rb_raise(rb_eArgError, "output_byteoffset too big");
|
|
|
|
if (output_bytesize < 0)
|
|
rb_raise(rb_eArgError, "negative output_bytesize");
|
|
|
|
output_byteend = (unsigned long)output_byteoffset +
|
|
(unsigned long)output_bytesize;
|
|
|
|
if (output_byteend < (unsigned long)output_byteoffset ||
|
|
LONG_MAX < output_byteend)
|
|
rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
|
|
|
|
if (rb_str_capacity(output) < output_byteend)
|
|
rb_str_resize(output, output_byteend);
|
|
|
|
if (NIL_P(input)) {
|
|
ip = is = NULL;
|
|
}
|
|
else {
|
|
ip = (const unsigned char *)RSTRING_PTR(input);
|
|
is = ip + RSTRING_LEN(input);
|
|
}
|
|
|
|
op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
|
|
os = op + output_bytesize;
|
|
|
|
res = rb_econv_convert(ec, &ip, is, &op, os, flags);
|
|
rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
|
|
if (!NIL_P(input))
|
|
rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
|
|
|
|
if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
|
|
if (LONG_MAX / 2 < output_bytesize)
|
|
rb_raise(rb_eArgError, "too long conversion result");
|
|
output_bytesize *= 2;
|
|
output_byteoffset_v = Qnil;
|
|
goto retry;
|
|
}
|
|
|
|
if (ec->destination_encoding) {
|
|
rb_enc_associate(output, ec->destination_encoding);
|
|
}
|
|
|
|
return econv_result_to_symbol(res);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.convert(source_string) -> destination_string
|
|
*
|
|
* convert source_string and return destination_string.
|
|
*
|
|
* source_string is assumed as a part of source.
|
|
* i.e. :partial_input=>true is specified internally.
|
|
* finish method should be used at last.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "euc-jp")
|
|
* puts ec.convert("\u3042").dump #=> "\xA4\xA2"
|
|
* puts ec.finish.dump #=> ""
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "utf-8")
|
|
* puts ec.convert("\xA4").dump #=> ""
|
|
* puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
|
|
* puts ec.finish.dump #=> ""
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
|
|
* puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
|
|
* puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
|
|
* puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
|
|
*
|
|
* If a conversion error occur,
|
|
* Encoding::ConversionUndefinedError or
|
|
* Encoding::InvalidByteSequenceError is raised.
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_convert(VALUE self, VALUE source_string)
|
|
{
|
|
VALUE ret, dst;
|
|
VALUE av[5];
|
|
int ac;
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
StringValue(source_string);
|
|
|
|
dst = rb_str_new(NULL, 0);
|
|
|
|
av[0] = rb_str_dup(source_string);
|
|
av[1] = dst;
|
|
av[2] = Qnil;
|
|
av[3] = Qnil;
|
|
av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
|
|
ac = 5;
|
|
|
|
ret = econv_primitive_convert(ac, av, self);
|
|
|
|
if (ret == sym_invalid_byte_sequence ||
|
|
ret == sym_undefined_conversion ||
|
|
ret == sym_incomplete_input) {
|
|
VALUE exc = make_econv_exception(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret == sym_finished) {
|
|
rb_raise(rb_eArgError, "converter already finished");
|
|
}
|
|
|
|
if (ret != sym_source_buffer_empty) {
|
|
rb_bug("unexpected result of econv_primitive_convert");
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.finish -> string
|
|
*
|
|
* finishes the converter.
|
|
* It returns the last part of converted string.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* p ec.convert("\u3042") #=> "\e$B$\""
|
|
* p ec.finish #=> "\e(B"
|
|
*/
|
|
static VALUE
|
|
econv_finish(VALUE self)
|
|
{
|
|
VALUE ret, dst;
|
|
VALUE av[5];
|
|
int ac;
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
dst = rb_str_new(NULL, 0);
|
|
|
|
av[0] = Qnil;
|
|
av[1] = dst;
|
|
av[2] = Qnil;
|
|
av[3] = Qnil;
|
|
av[4] = INT2NUM(0);
|
|
ac = 5;
|
|
|
|
ret = econv_primitive_convert(ac, av, self);
|
|
|
|
if (ret == sym_invalid_byte_sequence ||
|
|
ret == sym_undefined_conversion ||
|
|
ret == sym_incomplete_input) {
|
|
VALUE exc = make_econv_exception(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret != sym_finished) {
|
|
rb_bug("unexpected result of econv_primitive_convert");
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.primitive_errinfo -> array
|
|
*
|
|
* primitive_errinfo returns detailed information about the last error result
|
|
* as a 5-element array:
|
|
*
|
|
* [result, enc1, enc2, error_bytes, readagain_bytes]
|
|
*
|
|
* result is the last result of primitive_convert.
|
|
*
|
|
* Other elements are only meaningful when result is
|
|
* :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
|
|
*
|
|
* enc1 and enc2 indicates a conversion step as pair of strings.
|
|
* For example, a converter from EUC-JP to ISO-8859-1 converts
|
|
* a string as EUC-JP -> UTF-8 -> ISO-8859-1.
|
|
* So [enc1, enc2] is ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
|
|
*
|
|
* error_bytes and readagain_bytes indicates the byte sequences which causes the error.
|
|
* error_bytes is discarded portion.
|
|
* readagain_bytes is buffered portion which is read again on next conversion.
|
|
*
|
|
* Example:
|
|
*
|
|
* # \xff is invalid as EUC-JP.
|
|
* ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
|
|
* ec.primitive_convert(src="\xff", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
|
|
*
|
|
* # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
|
|
* # Since this error occurs in UTF-8 to ISO-8859-1 conversion,
|
|
* # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
|
|
*
|
|
* # partial character is invalid
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
|
|
*
|
|
* # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
|
|
* # partial characters.
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:source_buffer_empty, nil, nil, nil, nil]
|
|
*
|
|
* # \xd8\x00\x00@ is invalid as UTF-16BE because
|
|
* # no low surrogate after high surrogate (\xd8\x00).
|
|
* # It is detected by 3rd byte (\00) which is part of next character.
|
|
* # So the high surrogate (\xd8\x00) is discarded and
|
|
* # the 3rd byte is read again later.
|
|
* # Since the byte is buffered in ec, it is dropped from src.
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
|
|
* ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
|
|
* p src
|
|
* #=> "@"
|
|
*
|
|
* # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
|
|
* # The problem is detected by 4th byte.
|
|
* ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
|
|
* ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
|
|
* p src
|
|
* #=> ""
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_primitive_errinfo(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
VALUE ary;
|
|
|
|
ary = rb_ary_new2(5);
|
|
|
|
rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
|
|
rb_ary_store(ary, 4, Qnil);
|
|
|
|
if (ec->last_error.source_encoding)
|
|
rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
|
|
|
|
if (ec->last_error.destination_encoding)
|
|
rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
|
|
|
|
if (ec->last_error.error_bytes_start) {
|
|
rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
|
|
rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
|
|
}
|
|
|
|
return ary;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.insert_output(string) -> nil
|
|
*
|
|
* inserts string into the encoding converter.
|
|
* The string will be converted into the destination encoding and
|
|
* output on later conversions.
|
|
*
|
|
* If the destination encoding is stateful,
|
|
* string is converted according to the state and update the state.
|
|
*
|
|
* This method should be used only when a conversion error occurs.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-8859-1")
|
|
* src = "HIRAGANA LETTER A is \u{3042}."
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :undefined_conversion
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
|
|
* ec.insert_output("<err>")
|
|
* p ec.primitive_convert(src, dst) #=> :finished
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :undefined_conversion
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
|
|
* ec.insert_output "?" # state change required to output "?".
|
|
* p ec.primitive_convert(src, dst) #=> :finished
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_insert_output(VALUE self, VALUE string)
|
|
{
|
|
const char *insert_enc;
|
|
|
|
int ret;
|
|
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
StringValue(string);
|
|
insert_enc = rb_econv_encoding_to_insert_output(ec);
|
|
string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
|
|
|
|
ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
|
|
if (ret == -1) {
|
|
rb_raise(rb_eArgError, "too big string");
|
|
}
|
|
|
|
return Qnil;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.putback => string
|
|
* ec.putback(max_numbytes) => string
|
|
*
|
|
* put back the bytes which will be converted.
|
|
*
|
|
* The bytes are caused by invalid_byte_sequence error.
|
|
* When invalid_byte_sequence error, some bytes are discarded and
|
|
* some bytes are buffered to be converted later.
|
|
* The latter bytes can be put back.
|
|
* It can be observed by
|
|
* Encoding::InvalidByteSequenceError#readagain_bytes and
|
|
* Encoding::Converter#primitive_errinfo.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
|
|
* src = "\x00\xd8\x61\x00"
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
|
|
* p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
|
|
* p ec.putback #=> "a\x00"
|
|
* p ec.putback #=> "" # no more bytes to put back
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_putback(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
int n;
|
|
int putbackable;
|
|
VALUE str, max;
|
|
|
|
rb_scan_args(argc, argv, "01", &max);
|
|
|
|
if (NIL_P(max))
|
|
n = rb_econv_putbackable(ec);
|
|
else {
|
|
n = NUM2INT(max);
|
|
putbackable = rb_econv_putbackable(ec);
|
|
if (putbackable < n)
|
|
n = putbackable;
|
|
}
|
|
|
|
str = rb_str_new(NULL, n);
|
|
rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
|
|
|
|
if (ec->source_encoding) {
|
|
rb_enc_associate(str, ec->source_encoding);
|
|
}
|
|
|
|
return str;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.last_error -> exception or nil
|
|
*
|
|
* returns an exception object for the last conversion.
|
|
* It returns nil if the last conversion is not an error.
|
|
*
|
|
* "error" means that
|
|
* Encoding::InvalidByteSequenceError and Encoding::ConversionUndefinedError for
|
|
* Encoding::Converter#convert and
|
|
* :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
|
|
* Encoding::Converter#primitive_convert.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-8859-1")
|
|
* p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
|
|
* p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
|
|
* p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
|
|
* p ec.last_error #=> nil
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_last_error(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE exc;
|
|
|
|
exc = make_econv_exception(ec);
|
|
if (NIL_P(exc))
|
|
return Qnil;
|
|
return exc;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.replacement -> string
|
|
*
|
|
* returns the replacement string.
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "us-ascii")
|
|
* p ec.replacement #=> "?"
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "utf-8")
|
|
* p ec.replacement #=> "\uFFFD"
|
|
*/
|
|
static VALUE
|
|
econv_get_replacement(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
int ret;
|
|
rb_encoding *enc;
|
|
|
|
ret = make_replacement(ec);
|
|
if (ret == -1) {
|
|
rb_raise(rb_eConversionUndefinedError, "replacement character setup failed");
|
|
}
|
|
|
|
enc = rb_enc_find(ec->replacement_enc);
|
|
return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.replacement = string
|
|
*
|
|
* sets the replacement string.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
|
|
* ec.replacement = "<undef>"
|
|
* p ec.convert("a \u3042 b") #=> "a <undef> b"
|
|
*/
|
|
static VALUE
|
|
econv_set_replacement(VALUE self, VALUE arg)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE string = arg;
|
|
int ret;
|
|
rb_encoding *enc;
|
|
|
|
StringValue(string);
|
|
enc = rb_enc_get(string);
|
|
|
|
ret = rb_econv_set_replacement(ec,
|
|
(const unsigned char *)RSTRING_PTR(string),
|
|
RSTRING_LEN(string),
|
|
rb_enc_name(enc));
|
|
|
|
if (ret == -1) {
|
|
/* xxx: rb_eInvalidByteSequenceError? */
|
|
rb_raise(rb_eConversionUndefinedError, "replacement character setup failed");
|
|
}
|
|
|
|
return arg;
|
|
}
|
|
|
|
void
|
|
rb_econv_check_error(rb_econv_t *ec)
|
|
{
|
|
VALUE exc;
|
|
|
|
exc = make_econv_exception(ec);
|
|
if (NIL_P(exc))
|
|
return;
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.source_encoding_name -> string
|
|
*
|
|
* returns the source encoding name as a string.
|
|
*/
|
|
static VALUE
|
|
ecerr_source_encoding_name(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("source_encoding_name"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.source_encoding -> encoding
|
|
*
|
|
* returns the source encoding as an encoding object.
|
|
*
|
|
* Note that the result may not be equal to the source encoding of
|
|
* the encoding converter if the conversion has multiple steps.
|
|
*
|
|
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
|
|
* begin
|
|
* ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
|
|
* rescue Encoding::ConversionUndefinedError
|
|
* p $!.source_encoding #=> #<Encoding:UTF-8>
|
|
* p $!.destination_encoding #=> #<Encoding:EUC-JP>
|
|
* p $!.source_encoding_name #=> "UTF-8"
|
|
* p $!.destination_encoding_name #=> "EUC-JP"
|
|
* end
|
|
*
|
|
*/
|
|
static VALUE
|
|
ecerr_source_encoding(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("source_encoding"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.destination_encoding_name -> string
|
|
*
|
|
* returns the destination encoding name as a string.
|
|
*/
|
|
static VALUE
|
|
ecerr_destination_encoding_name(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("destination_encoding_name"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.destination_encoding -> encoding
|
|
*
|
|
* returns the destination encoding as an encoding object.
|
|
*/
|
|
static VALUE
|
|
ecerr_destination_encoding(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("destination_encoding"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.error_char -> string
|
|
*
|
|
* returns the one-character string which causes Encoding::ConversionUndefinedError.
|
|
*
|
|
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
|
|
* begin
|
|
* ec.convert("\xa0")
|
|
* rescue Encoding::ConversionUndefinedError
|
|
* puts $!.error_char.dump #=> "\xC2\xA0"
|
|
* p $!.error_char.encoding #=> #<Encoding:UTF-8>
|
|
* end
|
|
*
|
|
*/
|
|
static VALUE
|
|
ecerr_error_char(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("error_char"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.error_bytes -> string
|
|
*
|
|
* returns the discarded bytes when Encoding::InvalidByteSequenceError occur.
|
|
*
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* begin
|
|
* ec.convert("abc\xA1\xFFdef")
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
|
|
* puts $!.error_bytes.dump #=> "\xA1"
|
|
* puts $!.readagain_bytes.dump #=> "\xFF"
|
|
* end
|
|
*/
|
|
static VALUE
|
|
ecerr_error_bytes(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("error_bytes"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.readagain_bytes -> string
|
|
*
|
|
* returns the bytes to be read again when Encoding::InvalidByteSequenceError occur.
|
|
*/
|
|
static VALUE
|
|
ecerr_readagain_bytes(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("readagain_bytes"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.incomplete_input? -> true or false
|
|
*
|
|
* returns true if the invalid byte sequence error is caused by
|
|
* premature end of string.
|
|
*
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
*
|
|
* begin
|
|
* ec.convert("abc\xA1z")
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
|
|
* p $!.incomplete_input? #=> false
|
|
* end
|
|
*
|
|
* begin
|
|
* ec.convert("abc\xA1")
|
|
* ec.finish
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
|
|
* p $!.incomplete_input? #=> true
|
|
* end
|
|
*/
|
|
static VALUE
|
|
ecerr_incomplete_input(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("incomplete_input"));
|
|
}
|
|
|
|
extern void Init_newline(void);
|
|
|
|
void
|
|
Init_transcode(void)
|
|
{
|
|
rb_eConversionUndefinedError = rb_define_class_under(rb_cEncoding, "ConversionUndefinedError", rb_eStandardError);
|
|
rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eStandardError);
|
|
rb_eNoConverterError = rb_define_class_under(rb_cEncoding, "NoConverterError", rb_eStandardError);
|
|
|
|
transcoder_table = st_init_strcasetable();
|
|
|
|
sym_invalid = ID2SYM(rb_intern("invalid"));
|
|
sym_undef = ID2SYM(rb_intern("undef"));
|
|
sym_ignore = ID2SYM(rb_intern("ignore"));
|
|
sym_replace = ID2SYM(rb_intern("replace"));
|
|
sym_xml = ID2SYM(rb_intern("xml"));
|
|
sym_text = ID2SYM(rb_intern("text"));
|
|
sym_attr = ID2SYM(rb_intern("attr"));
|
|
|
|
sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
|
|
sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
|
|
sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
|
|
sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
|
|
sym_finished = ID2SYM(rb_intern("finished"));
|
|
sym_after_output = ID2SYM(rb_intern("after_output"));
|
|
sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
|
|
sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
|
|
sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
|
|
sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
|
|
sym_partial_input = ID2SYM(rb_intern("partial_input"));
|
|
|
|
rb_define_method(rb_cString, "encode", str_encode, -1);
|
|
rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
|
|
|
|
rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
|
|
rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
|
|
rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
|
|
rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
|
|
rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
|
|
rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
|
|
rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
|
|
rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
|
|
rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
|
|
rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
|
|
rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
|
|
rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
|
|
rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
|
|
rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
|
|
rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
|
|
rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
|
|
rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
|
|
rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
|
|
|
|
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
|
|
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
|
|
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
|
|
rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
|
|
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
|
|
rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
|
|
rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
|
|
rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
|
|
rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
|
|
rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
|
|
|
|
rb_define_method(rb_eConversionUndefinedError, "source_encoding_name", ecerr_source_encoding_name, 0);
|
|
rb_define_method(rb_eConversionUndefinedError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
|
|
rb_define_method(rb_eConversionUndefinedError, "source_encoding", ecerr_source_encoding, 0);
|
|
rb_define_method(rb_eConversionUndefinedError, "destination_encoding", ecerr_destination_encoding, 0);
|
|
rb_define_method(rb_eConversionUndefinedError, "error_char", ecerr_error_char, 0);
|
|
|
|
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
|
|
|
|
Init_newline();
|
|
}
|