mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
ff30358d13
RARRAY_AREF has been a macro for reasons. We might not be able to change that for public APIs, but why not relax the situation internally to make it an inline function.
4583 lines
137 KiB
C
4583 lines
137 KiB
C
/**********************************************************************
|
|
|
|
transcode.c -
|
|
|
|
$Author$
|
|
created at: Tue Oct 30 16:10:22 JST 2007
|
|
|
|
Copyright (C) 2007 Martin Duerst
|
|
|
|
**********************************************************************/
|
|
|
|
#include "ruby/internal/config.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include "internal.h"
|
|
#include "internal/array.h"
|
|
#include "internal/inits.h"
|
|
#include "internal/object.h"
|
|
#include "internal/string.h"
|
|
#include "internal/transcode.h"
|
|
#include "ruby/encoding.h"
|
|
|
|
#include "transcode_data.h"
|
|
#include "id.h"
|
|
|
|
#define ENABLE_ECONV_NEWLINE_OPTION 1
|
|
|
|
/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
|
|
static VALUE rb_eUndefinedConversionError;
|
|
static VALUE rb_eInvalidByteSequenceError;
|
|
static VALUE rb_eConverterNotFoundError;
|
|
|
|
VALUE rb_cEncodingConverter;
|
|
|
|
static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
|
|
static VALUE sym_xml, sym_text, sym_attr;
|
|
static VALUE sym_universal_newline;
|
|
static VALUE sym_crlf_newline;
|
|
static VALUE sym_cr_newline;
|
|
#ifdef ENABLE_ECONV_NEWLINE_OPTION
|
|
static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
|
|
#endif
|
|
static VALUE sym_partial_input;
|
|
|
|
static VALUE sym_invalid_byte_sequence;
|
|
static VALUE sym_undefined_conversion;
|
|
static VALUE sym_destination_buffer_full;
|
|
static VALUE sym_source_buffer_empty;
|
|
static VALUE sym_finished;
|
|
static VALUE sym_after_output;
|
|
static VALUE sym_incomplete_input;
|
|
|
|
static unsigned char *
|
|
allocate_converted_string(const char *sname, const char *dname,
|
|
const unsigned char *str, size_t len,
|
|
unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
|
|
size_t *dst_len_ptr);
|
|
|
|
/* dynamic structure, one per conversion (similar to iconv_t) */
|
|
/* may carry conversion state (e.g. for iso-2022-jp) */
|
|
typedef struct rb_transcoding {
|
|
const rb_transcoder *transcoder;
|
|
|
|
int flags;
|
|
|
|
int resume_position;
|
|
unsigned int next_table;
|
|
VALUE next_info;
|
|
unsigned char next_byte;
|
|
unsigned int output_index;
|
|
|
|
ssize_t recognized_len; /* already interpreted */
|
|
ssize_t readagain_len; /* not yet interpreted */
|
|
union {
|
|
unsigned char ary[8]; /* max_input <= sizeof(ary) */
|
|
unsigned char *ptr; /* length: max_input */
|
|
} readbuf; /* recognized_len + readagain_len used */
|
|
|
|
ssize_t writebuf_off;
|
|
ssize_t writebuf_len;
|
|
union {
|
|
unsigned char ary[8]; /* max_output <= sizeof(ary) */
|
|
unsigned char *ptr; /* length: max_output */
|
|
} writebuf;
|
|
|
|
union rb_transcoding_state_t { /* opaque data for stateful encoding */
|
|
void *ptr;
|
|
char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
|
|
double dummy_for_alignment;
|
|
} state;
|
|
} rb_transcoding;
|
|
/*
 * Accessors for the embedded-or-heap unions of rb_transcoding.
 * Each one picks the heap pointer when the transcoder's limit exceeds the
 * embedded array, and the embedded array otherwise.
 */
#define TRANSCODING_READBUF(tc) \
    ((int)sizeof((tc)->readbuf.ary) < (tc)->transcoder->max_input ? \
     (tc)->readbuf.ptr : \
     (tc)->readbuf.ary)
#define TRANSCODING_WRITEBUF(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (tc)->writebuf.ptr : \
     (tc)->writebuf.ary)
/* Capacity (in bytes) of the buffer TRANSCODING_WRITEBUF() yields. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (size_t)(tc)->transcoder->max_output : \
     sizeof((tc)->writebuf.ary))
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
#define TRANSCODING_STATE(tc) \
    (TRANSCODING_STATE_EMBED_MAX < (tc)->transcoder->state_size ? \
     (tc)->state.ptr : \
     (tc)->state.ary)
|
|
|
|
typedef struct {
|
|
struct rb_transcoding *tc;
|
|
unsigned char *out_buf_start;
|
|
unsigned char *out_data_start;
|
|
unsigned char *out_data_end;
|
|
unsigned char *out_buf_end;
|
|
rb_econv_result_t last_result;
|
|
} rb_econv_elem_t;
|
|
|
|
struct rb_econv_t {
|
|
int flags;
|
|
int started; /* bool */
|
|
|
|
const char *source_encoding_name;
|
|
const char *destination_encoding_name;
|
|
|
|
const unsigned char *replacement_str;
|
|
size_t replacement_len;
|
|
const char *replacement_enc;
|
|
|
|
unsigned char *in_buf_start;
|
|
unsigned char *in_data_start;
|
|
unsigned char *in_data_end;
|
|
unsigned char *in_buf_end;
|
|
rb_econv_elem_t *elems;
|
|
int replacement_allocated; /* bool */
|
|
int num_allocated;
|
|
int num_trans;
|
|
int num_finished;
|
|
struct rb_transcoding *last_tc;
|
|
|
|
/* last error */
|
|
struct {
|
|
rb_econv_result_t result;
|
|
struct rb_transcoding *error_tc;
|
|
const char *source_encoding;
|
|
const char *destination_encoding;
|
|
const unsigned char *error_bytes_start;
|
|
size_t error_bytes_len;
|
|
size_t readagain_len;
|
|
} last_error;
|
|
|
|
/* The following fields are only for Encoding::Converter.
|
|
* rb_econv_open set them NULL. */
|
|
rb_encoding *source_encoding;
|
|
rb_encoding *destination_encoding;
|
|
};
|
|
|
|
/*
|
|
* Dispatch data and logic
|
|
*/
|
|
|
|
#define DECORATOR_P(sname, dname) (*(sname) == '\0')
|
|
|
|
typedef struct {
|
|
const char *sname;
|
|
const char *dname;
|
|
const char *lib; /* null means no need to load a library */
|
|
const rb_transcoder *transcoder;
|
|
} transcoder_entry_t;
|
|
|
|
static st_table *transcoder_table;
|
|
|
|
static transcoder_entry_t *
|
|
make_transcoder_entry(const char *sname, const char *dname)
|
|
{
|
|
st_data_t val;
|
|
st_table *table2;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
|
|
val = (st_data_t)st_init_strcasetable();
|
|
st_add_direct(transcoder_table, (st_data_t)sname, val);
|
|
}
|
|
table2 = (st_table *)val;
|
|
if (!st_lookup(table2, (st_data_t)dname, &val)) {
|
|
transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
|
|
entry->sname = sname;
|
|
entry->dname = dname;
|
|
entry->lib = NULL;
|
|
entry->transcoder = NULL;
|
|
val = (st_data_t)entry;
|
|
st_add_direct(table2, (st_data_t)dname, val);
|
|
}
|
|
return (transcoder_entry_t *)val;
|
|
}
|
|
|
|
static transcoder_entry_t *
|
|
get_transcoder_entry(const char *sname, const char *dname)
|
|
{
|
|
st_data_t val;
|
|
st_table *table2;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
|
|
return NULL;
|
|
}
|
|
table2 = (st_table *)val;
|
|
if (!st_lookup(table2, (st_data_t)dname, &val)) {
|
|
return NULL;
|
|
}
|
|
return (transcoder_entry_t *)val;
|
|
}
|
|
|
|
void
|
|
rb_register_transcoder(const rb_transcoder *tr)
|
|
{
|
|
const char *const sname = tr->src_encoding;
|
|
const char *const dname = tr->dst_encoding;
|
|
|
|
transcoder_entry_t *entry;
|
|
|
|
entry = make_transcoder_entry(sname, dname);
|
|
if (entry->transcoder) {
|
|
rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
|
|
sname, dname);
|
|
}
|
|
|
|
entry->transcoder = tr;
|
|
}
|
|
|
|
static void
|
|
declare_transcoder(const char *sname, const char *dname, const char *lib)
|
|
{
|
|
transcoder_entry_t *entry;
|
|
|
|
entry = make_transcoder_entry(sname, dname);
|
|
entry->lib = lib;
|
|
}
|
|
|
|
static const char transcoder_lib_prefix[] = "enc/trans/";
|
|
|
|
void
|
|
rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
|
|
{
|
|
if (!lib) {
|
|
rb_raise(rb_eArgError, "invalid library name - (null)");
|
|
}
|
|
declare_transcoder(enc1, enc2, lib);
|
|
}
|
|
|
|
#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
|
|
|
|
typedef struct search_path_queue_tag {
|
|
struct search_path_queue_tag *next;
|
|
const char *enc;
|
|
} search_path_queue_t;
|
|
|
|
typedef struct {
|
|
st_table *visited;
|
|
search_path_queue_t *queue;
|
|
search_path_queue_t **queue_last_ptr;
|
|
const char *base_enc;
|
|
} search_path_bfs_t;
|
|
|
|
static int
|
|
transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
|
|
{
|
|
const char *dname = (const char *)key;
|
|
search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
|
|
search_path_queue_t *q;
|
|
|
|
if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
q = ALLOC(search_path_queue_t);
|
|
q->enc = dname;
|
|
q->next = NULL;
|
|
*bfs->queue_last_ptr = q;
|
|
bfs->queue_last_ptr = &q->next;
|
|
|
|
st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
static int
|
|
transcode_search_path(const char *sname, const char *dname,
|
|
void (*callback)(const char *sname, const char *dname, int depth, void *arg),
|
|
void *arg)
|
|
{
|
|
search_path_bfs_t bfs;
|
|
search_path_queue_t *q;
|
|
st_data_t val;
|
|
st_table *table2;
|
|
int found;
|
|
int pathlen = -1;
|
|
|
|
if (encoding_equal(sname, dname))
|
|
return -1;
|
|
|
|
q = ALLOC(search_path_queue_t);
|
|
q->enc = sname;
|
|
q->next = NULL;
|
|
bfs.queue_last_ptr = &q->next;
|
|
bfs.queue = q;
|
|
|
|
bfs.visited = st_init_strcasetable();
|
|
st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
|
|
|
|
while (bfs.queue) {
|
|
q = bfs.queue;
|
|
bfs.queue = q->next;
|
|
if (!bfs.queue)
|
|
bfs.queue_last_ptr = &bfs.queue;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
|
|
xfree(q);
|
|
continue;
|
|
}
|
|
table2 = (st_table *)val;
|
|
|
|
if (st_lookup(table2, (st_data_t)dname, &val)) {
|
|
st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
|
|
xfree(q);
|
|
found = 1;
|
|
goto cleanup;
|
|
}
|
|
|
|
bfs.base_enc = q->enc;
|
|
st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
|
|
bfs.base_enc = NULL;
|
|
|
|
xfree(q);
|
|
}
|
|
found = 0;
|
|
|
|
cleanup:
|
|
while (bfs.queue) {
|
|
q = bfs.queue;
|
|
bfs.queue = q->next;
|
|
xfree(q);
|
|
}
|
|
|
|
if (found) {
|
|
const char *enc = dname;
|
|
int depth;
|
|
pathlen = 0;
|
|
while (1) {
|
|
st_lookup(bfs.visited, (st_data_t)enc, &val);
|
|
if (!val)
|
|
break;
|
|
pathlen++;
|
|
enc = (const char *)val;
|
|
}
|
|
depth = pathlen;
|
|
enc = dname;
|
|
while (1) {
|
|
st_lookup(bfs.visited, (st_data_t)enc, &val);
|
|
if (!val)
|
|
break;
|
|
callback((const char *)val, enc, --depth, arg);
|
|
enc = (const char *)val;
|
|
}
|
|
}
|
|
|
|
st_free_table(bfs.visited);
|
|
|
|
return pathlen; /* is -1 if not found */
|
|
}
|
|
|
|
static const rb_transcoder *
|
|
load_transcoder_entry(transcoder_entry_t *entry)
|
|
{
|
|
if (entry->transcoder)
|
|
return entry->transcoder;
|
|
|
|
if (entry->lib) {
|
|
const char *const lib = entry->lib;
|
|
const size_t len = strlen(lib);
|
|
const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
|
|
const VALUE fn = rb_str_new(0, total_len);
|
|
char *const path = RSTRING_PTR(fn);
|
|
|
|
memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
|
|
memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
|
|
rb_str_set_len(fn, total_len);
|
|
OBJ_FREEZE(fn);
|
|
rb_require_string(fn);
|
|
}
|
|
|
|
if (entry->transcoder)
|
|
return entry->transcoder;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * Default replacement for the encoding named encname:
 * the three bytes EF BF BD (tagged "UTF-8") when encname is UTF-8,
 * otherwise "?" (tagged "US-ASCII").
 * The byte length goes to *len_ret and the tag to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
|
|
|
|
/*
|
|
* Transcoding engine logic
|
|
*/
|
|
|
|
static const unsigned char *
|
|
transcode_char_start(rb_transcoding *tc,
|
|
const unsigned char *in_start,
|
|
const unsigned char *inchar_start,
|
|
const unsigned char *in_p,
|
|
size_t *char_len_ptr)
|
|
{
|
|
const unsigned char *ptr;
|
|
if (inchar_start - in_start < tc->recognized_len) {
|
|
MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
|
|
inchar_start, unsigned char, in_p - inchar_start);
|
|
ptr = TRANSCODING_READBUF(tc);
|
|
}
|
|
else {
|
|
ptr = inchar_start - tc->recognized_len;
|
|
}
|
|
*char_len_ptr = tc->recognized_len + (in_p - inchar_start);
|
|
return ptr;
|
|
}
|
|
|
|
/*
 * Core conversion loop, written so that it can stop and later continue.
 *
 * Bytes are read from *in_pos up to in_stop and written from *out_pos up to
 * out_stop, following the lookup tables and callbacks of tc->transcoder.
 * Whenever the function has to return early it goes through SUSPEND, which
 * stores a resume number in tc->resume_position; the next call jumps
 * straight to the matching resume_label via the switch below.
 *
 * On every return *in_pos and *out_pos hold the positions reached, and the
 * return value is the rb_econv_result_t that was passed to SUSPEND.
 * opt is tested for ECONV_PARTIAL_INPUT and ECONV_AFTER_OUTPUT.
 */
static rb_econv_result_t
|
|
transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
rb_transcoding *tc,
|
|
const int opt)
|
|
{
|
|
const rb_transcoder *tr = tc->transcoder;
|
|
int unitlen = tr->input_unit_length;
|
|
ssize_t readagain_len = 0;
|
|
|
|
const unsigned char *inchar_start;
|
|
const unsigned char *in_p;
|
|
|
|
unsigned char *out_p;
|
|
|
|
in_p = inchar_start = *in_pos;
|
|
|
|
out_p = *out_pos;
|
|
|
|
/*
 * SUSPEND(ret, num): remember num as the resume position, append the bytes
 * in [inchar_start, in_p) to the read buffer after the recognized bytes,
 * publish in_p/out_p through *in_pos/*out_pos, move readagain_len bytes
 * from the recognized count to tc->readagain_len, and return ret.
 * The label it defines is where the next call continues.
 */
#define SUSPEND(ret, num) \
|
|
do { \
|
|
tc->resume_position = (num); \
|
|
if (0 < in_p - inchar_start) \
|
|
MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
|
|
inchar_start, unsigned char, in_p - inchar_start); \
|
|
*in_pos = in_p; \
|
|
*out_pos = out_p; \
|
|
tc->recognized_len += in_p - inchar_start; \
|
|
if (readagain_len) { \
|
|
tc->recognized_len -= readagain_len; \
|
|
tc->readagain_len = readagain_len; \
|
|
} \
|
|
return (ret); \
|
|
resume_label ## num:; \
|
|
} while (0)
|
|
/* SUSPEND_OBUF(num): suspend with econv_destination_buffer_full for as
 * long as there is no room for one more output byte. */
#define SUSPEND_OBUF(num) \
|
|
do { \
|
|
while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
|
|
} while (0)
|
|
|
|
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and
 * output was produced during this call, suspend with econv_after_output. */
#define SUSPEND_AFTER_OUTPUT(num) \
|
|
if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
|
|
SUSPEND(econv_after_output, num); \
|
|
}
|
|
|
|
/* These values must survive a suspension, so they live in tc, not in locals. */
#define next_table (tc->next_table)
|
|
#define next_info (tc->next_info)
|
|
#define next_byte (tc->next_byte)
|
|
#define writebuf_len (tc->writebuf_len)
|
|
#define writebuf_off (tc->writebuf_off)
|
|
|
|
/* Dispatch to the point where the previous call suspended (0 = fresh start). */
switch (tc->resume_position) {
|
|
case 0: break;
|
|
case 1: goto resume_label1;
|
|
case 2: goto resume_label2;
|
|
case 3: goto resume_label3;
|
|
case 4: goto resume_label4;
|
|
case 5: goto resume_label5;
|
|
case 6: goto resume_label6;
|
|
case 7: goto resume_label7;
|
|
case 8: goto resume_label8;
|
|
case 9: goto resume_label9;
|
|
case 10: goto resume_label10;
|
|
case 11: goto resume_label11;
|
|
case 12: goto resume_label12;
|
|
case 13: goto resume_label13;
|
|
case 14: goto resume_label14;
|
|
case 15: goto resume_label15;
|
|
case 16: goto resume_label16;
|
|
case 17: goto resume_label17;
|
|
case 18: goto resume_label18;
|
|
case 19: goto resume_label19;
|
|
case 20: goto resume_label20;
|
|
case 21: goto resume_label21;
|
|
case 22: goto resume_label22;
|
|
case 23: goto resume_label23;
|
|
case 24: goto resume_label24;
|
|
case 25: goto resume_label25;
|
|
case 26: goto resume_label26;
|
|
case 27: goto resume_label27;
|
|
case 28: goto resume_label28;
|
|
case 29: goto resume_label29;
|
|
case 30: goto resume_label30;
|
|
case 31: goto resume_label31;
|
|
case 32: goto resume_label32;
|
|
case 33: goto resume_label33;
|
|
case 34: goto resume_label34;
|
|
}
|
|
|
|
/* One iteration per input character: reset the per-character state, then
 * walk the lookup tables byte by byte. */
while (1) {
|
|
inchar_start = in_p;
|
|
tc->recognized_len = 0;
|
|
next_table = tr->conv_tree_start;
|
|
|
|
SUSPEND_AFTER_OUTPUT(24);
|
|
|
|
if (in_stop <= in_p) {
|
|
if (!(opt & ECONV_PARTIAL_INPUT))
|
|
break;
|
|
SUSPEND(econv_source_buffer_empty, 7);
|
|
continue;
|
|
}
|
|
|
|
#define BYTE_ADDR(index) (tr->byte_array + (index))
|
|
#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
|
|
#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
|
|
#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
|
|
#define BL_MIN_BYTE (BL_BASE[0])
|
|
#define BL_MAX_BYTE (BL_BASE[1])
|
|
#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
|
|
#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
|
|
|
|
next_byte = (unsigned char)*in_p++;
|
|
follow_byte:
|
|
if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
|
|
next_info = INVALID;
|
|
else {
|
|
next_info = (VALUE)BL_ACTION(next_byte);
|
|
}
|
|
follow_info:
|
|
switch (next_info & 0x1F) {
|
|
case NOMAP:
|
|
{
|
|
const unsigned char *p = inchar_start;
|
|
writebuf_off = 0;
|
|
while (p < in_p) {
|
|
TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
|
|
}
|
|
writebuf_len = writebuf_off;
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(3);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
continue;
|
|
case 0x00: case 0x04: case 0x08: case 0x0C:
|
|
case 0x10: case 0x14: case 0x18: case 0x1C:
|
|
SUSPEND_AFTER_OUTPUT(25);
|
|
while (in_p >= in_stop) {
|
|
if (!(opt & ECONV_PARTIAL_INPUT))
|
|
goto incomplete;
|
|
SUSPEND(econv_source_buffer_empty, 5);
|
|
}
|
|
next_byte = (unsigned char)*in_p++;
|
|
next_table = (unsigned int)next_info;
|
|
goto follow_byte;
|
|
case ZERObt: /* drop input */
|
|
continue;
|
|
case ONEbt:
|
|
SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
|
|
continue;
|
|
case TWObt:
|
|
SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
|
|
continue;
|
|
case THREEbt:
|
|
SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
|
|
SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
|
|
continue;
|
|
case FOURbt:
|
|
SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
|
|
SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
|
|
SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
|
|
SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
|
|
continue;
|
|
case GB4bt:
|
|
SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
|
|
SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
|
|
SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
|
|
SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
|
|
continue;
|
|
case STR1:
|
|
tc->output_index = 0;
|
|
while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
|
|
SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
|
|
tc->output_index++;
|
|
}
|
|
continue;
|
|
case FUNii:
|
|
next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
|
|
goto follow_info;
|
|
case FUNsi:
|
|
{
|
|
const unsigned char *char_start;
|
|
size_t char_len;
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
|
|
goto follow_info;
|
|
}
|
|
case FUNio:
|
|
SUSPEND_OBUF(13);
|
|
if (tr->max_output <= out_stop - out_p)
|
|
out_p += tr->func_io(TRANSCODING_STATE(tc),
|
|
next_info, out_p, out_stop - out_p);
|
|
else {
|
|
writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
|
|
next_info,
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(20);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
break;
|
|
case FUNso:
|
|
{
|
|
const unsigned char *char_start;
|
|
size_t char_len;
|
|
SUSPEND_OBUF(14);
|
|
if (tr->max_output <= out_stop - out_p) {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
out_p += tr->func_so(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len,
|
|
out_p, out_stop - out_p);
|
|
}
|
|
else {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len,
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(22);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case FUNsio:
|
|
{
|
|
const unsigned char *char_start;
|
|
size_t char_len;
|
|
SUSPEND_OBUF(33);
|
|
if (tr->max_output <= out_stop - out_p) {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
out_p += tr->func_sio(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len, next_info,
|
|
out_p, out_stop - out_p);
|
|
}
|
|
else {
|
|
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
|
|
writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
|
|
char_start, (size_t)char_len, next_info,
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(34);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case INVALID:
|
|
if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
|
|
if (tc->recognized_len + (in_p - inchar_start) < unitlen)
|
|
SUSPEND_AFTER_OUTPUT(26);
|
|
while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
|
|
in_p = in_stop;
|
|
SUSPEND(econv_source_buffer_empty, 8);
|
|
}
|
|
if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
|
|
in_p = in_stop;
|
|
}
|
|
else {
|
|
in_p = inchar_start + (unitlen - tc->recognized_len);
|
|
}
|
|
}
|
|
else {
|
|
ssize_t invalid_len; /* including the last byte which causes invalid */
|
|
ssize_t discard_len;
|
|
invalid_len = tc->recognized_len + (in_p - inchar_start);
|
|
discard_len = ((invalid_len - 1) / unitlen) * unitlen;
|
|
readagain_len = invalid_len - discard_len;
|
|
}
|
|
goto invalid;
|
|
case UNDEF:
|
|
goto undef;
|
|
default:
|
|
rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
|
|
}
|
|
continue;
|
|
|
|
invalid:
|
|
SUSPEND(econv_invalid_byte_sequence, 1);
|
|
continue;
|
|
|
|
incomplete:
|
|
SUSPEND(econv_incomplete_input, 27);
|
|
continue;
|
|
|
|
undef:
|
|
SUSPEND(econv_undefined_conversion, 2);
|
|
continue;
|
|
}
|
|
|
|
/* cleanup */
|
|
if (tr->finish_func) {
|
|
SUSPEND_OBUF(4);
|
|
if (tr->max_output <= out_stop - out_p) {
|
|
out_p += tr->finish_func(TRANSCODING_STATE(tc),
|
|
out_p, out_stop - out_p);
|
|
}
|
|
else {
|
|
writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
|
|
TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
|
|
writebuf_off = 0;
|
|
while (writebuf_off < writebuf_len) {
|
|
SUSPEND_OBUF(23);
|
|
*out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
|
|
}
|
|
}
|
|
}
|
|
/* Once finished, every further call resumes here and reports
 * econv_finished again. */
while (1)
|
|
SUSPEND(econv_finished, 6);
|
|
#undef SUSPEND
|
|
#undef next_table
|
|
#undef next_info
|
|
#undef next_byte
|
|
#undef writebuf_len
|
|
#undef writebuf_off
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
rb_transcoding *tc,
|
|
const int opt)
|
|
{
|
|
if (tc->readagain_len) {
|
|
unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
|
|
const unsigned char *readagain_pos = readagain_buf;
|
|
const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
|
|
rb_econv_result_t res;
|
|
|
|
MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
|
|
unsigned char, tc->readagain_len);
|
|
tc->readagain_len = 0;
|
|
res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
|
|
if (res != econv_source_buffer_empty) {
|
|
MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
|
|
readagain_pos, unsigned char, readagain_stop - readagain_pos);
|
|
tc->readagain_len += readagain_stop - readagain_pos;
|
|
return res;
|
|
}
|
|
}
|
|
return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
|
|
}
|
|
|
|
static rb_transcoding *
|
|
rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
|
|
{
|
|
rb_transcoding *tc;
|
|
|
|
tc = ALLOC(rb_transcoding);
|
|
tc->transcoder = tr;
|
|
tc->flags = flags;
|
|
if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
|
|
tc->state.ptr = xmalloc(tr->state_size);
|
|
if (tr->state_init_func) {
|
|
(tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
|
|
}
|
|
tc->resume_position = 0;
|
|
tc->recognized_len = 0;
|
|
tc->readagain_len = 0;
|
|
tc->writebuf_len = 0;
|
|
tc->writebuf_off = 0;
|
|
if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
|
|
tc->readbuf.ptr = xmalloc(tr->max_input);
|
|
}
|
|
if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
|
|
tc->writebuf.ptr = xmalloc(tr->max_output);
|
|
}
|
|
return tc;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_transcoding_convert(rb_transcoding *tc,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
return transcode_restartable(
|
|
input_ptr, output_ptr,
|
|
input_stop, output_stop,
|
|
tc, flags);
|
|
}
|
|
|
|
static void
|
|
rb_transcoding_close(rb_transcoding *tc)
|
|
{
|
|
const rb_transcoder *tr = tc->transcoder;
|
|
if (tr->state_fini_func) {
|
|
(tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
|
|
}
|
|
if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
|
|
xfree(tc->state.ptr);
|
|
if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
|
|
xfree(tc->readbuf.ptr);
|
|
if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
|
|
xfree(tc->writebuf.ptr);
|
|
xfree(tc);
|
|
}
|
|
|
|
static size_t
|
|
rb_transcoding_memsize(rb_transcoding *tc)
|
|
{
|
|
size_t size = sizeof(rb_transcoding);
|
|
const rb_transcoder *tr = tc->transcoder;
|
|
|
|
if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
|
|
size += tr->state_size;
|
|
}
|
|
if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
|
|
size += tr->max_input;
|
|
}
|
|
if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
|
|
size += tr->max_output;
|
|
}
|
|
return size;
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_alloc(int n_hint)
|
|
{
|
|
rb_econv_t *ec;
|
|
|
|
if (n_hint <= 0)
|
|
n_hint = 1;
|
|
|
|
ec = ALLOC(rb_econv_t);
|
|
ec->flags = 0;
|
|
ec->source_encoding_name = NULL;
|
|
ec->destination_encoding_name = NULL;
|
|
ec->started = 0;
|
|
ec->replacement_str = NULL;
|
|
ec->replacement_len = 0;
|
|
ec->replacement_enc = NULL;
|
|
ec->replacement_allocated = 0;
|
|
ec->in_buf_start = NULL;
|
|
ec->in_data_start = NULL;
|
|
ec->in_data_end = NULL;
|
|
ec->in_buf_end = NULL;
|
|
ec->num_allocated = n_hint;
|
|
ec->num_trans = 0;
|
|
ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
|
|
ec->num_finished = 0;
|
|
ec->last_tc = NULL;
|
|
ec->last_error.result = econv_source_buffer_empty;
|
|
ec->last_error.error_tc = NULL;
|
|
ec->last_error.source_encoding = NULL;
|
|
ec->last_error.destination_encoding = NULL;
|
|
ec->last_error.error_bytes_start = NULL;
|
|
ec->last_error.error_bytes_len = 0;
|
|
ec->last_error.readagain_len = 0;
|
|
ec->source_encoding = NULL;
|
|
ec->destination_encoding = NULL;
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
|
|
{
|
|
int n, j;
|
|
int bufsize = 4096;
|
|
unsigned char *p;
|
|
|
|
if (ec->num_trans == ec->num_allocated) {
|
|
n = ec->num_allocated * 2;
|
|
REALLOC_N(ec->elems, rb_econv_elem_t, n);
|
|
ec->num_allocated = n;
|
|
}
|
|
|
|
p = xmalloc(bufsize);
|
|
|
|
MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
|
|
|
|
ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
|
|
ec->elems[i].out_buf_start = p;
|
|
ec->elems[i].out_buf_end = p + bufsize;
|
|
ec->elems[i].out_data_start = p;
|
|
ec->elems[i].out_data_end = p;
|
|
ec->elems[i].last_result = econv_source_buffer_empty;
|
|
|
|
ec->num_trans++;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
|
|
for (j = ec->num_trans-1; i <= j; j--) {
|
|
rb_transcoding *tc = ec->elems[j].tc;
|
|
const rb_transcoder *tr2 = tc->transcoder;
|
|
if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
|
|
ec->last_tc = tc;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
|
|
{
|
|
rb_econv_t *ec;
|
|
int i, ret;
|
|
|
|
for (i = 0; i < n; i++) {
|
|
const rb_transcoder *tr;
|
|
tr = load_transcoder_entry(entries[i]);
|
|
if (!tr)
|
|
return NULL;
|
|
}
|
|
|
|
ec = rb_econv_alloc(n);
|
|
|
|
for (i = 0; i < n; i++) {
|
|
const rb_transcoder *tr = load_transcoder_entry(entries[i]);
|
|
ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
|
|
if (ret == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return ec;
|
|
}
|
|
|
|
struct trans_open_t {
|
|
transcoder_entry_t **entries;
|
|
int num_additional;
|
|
};
|
|
|
|
static void
|
|
trans_open_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
struct trans_open_t *toarg = arg;
|
|
|
|
if (!toarg->entries) {
|
|
toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
|
|
}
|
|
toarg->entries[depth] = get_transcoder_entry(sname, dname);
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_open0(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
transcoder_entry_t **entries = NULL;
|
|
int num_trans;
|
|
rb_econv_t *ec;
|
|
|
|
/* Just check if sname and dname are defined */
|
|
/* (This check is needed?) */
|
|
if (*sname) rb_enc_find_index(sname);
|
|
if (*dname) rb_enc_find_index(dname);
|
|
|
|
if (*sname == '\0' && *dname == '\0') {
|
|
num_trans = 0;
|
|
entries = NULL;
|
|
sname = dname = "";
|
|
}
|
|
else {
|
|
struct trans_open_t toarg;
|
|
toarg.entries = NULL;
|
|
toarg.num_additional = 0;
|
|
num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
|
|
entries = toarg.entries;
|
|
if (num_trans < 0) {
|
|
xfree(entries);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
|
|
xfree(entries);
|
|
if (!ec)
|
|
return NULL;
|
|
|
|
ec->flags = ecflags;
|
|
ec->source_encoding_name = sname;
|
|
ec->destination_encoding_name = dname;
|
|
|
|
return ec;
|
|
}
|
|
|
|
#define MAX_ECFLAGS_DECORATORS 32
|
|
|
|
static int
|
|
decorator_names(int ecflags, const char **decorators_ret)
|
|
{
|
|
int num_decorators;
|
|
|
|
switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
|
|
case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
|
|
case ECONV_CRLF_NEWLINE_DECORATOR:
|
|
case ECONV_CR_NEWLINE_DECORATOR:
|
|
case 0:
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
|
|
if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
|
|
(ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
|
|
return -1;
|
|
|
|
num_decorators = 0;
|
|
|
|
if (ecflags & ECONV_XML_TEXT_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_text_escape";
|
|
if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_attr_content_escape";
|
|
if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "xml_attr_quote";
|
|
|
|
if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "crlf_newline";
|
|
if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "cr_newline";
|
|
if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
|
|
decorators_ret[num_decorators++] = "universal_newline";
|
|
|
|
return num_decorators;
|
|
}
|
|
|
|
rb_econv_t *
|
|
rb_econv_open(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
rb_econv_t *ec;
|
|
int num_decorators;
|
|
const char *decorators[MAX_ECFLAGS_DECORATORS];
|
|
int i;
|
|
|
|
num_decorators = decorator_names(ecflags, decorators);
|
|
if (num_decorators == -1)
|
|
return NULL;
|
|
|
|
ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
|
|
if (!ec)
|
|
return NULL;
|
|
|
|
for (i = 0; i < num_decorators; i++)
|
|
if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
|
|
ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
|
|
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
trans_sweep(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags,
|
|
int start)
|
|
{
|
|
int try;
|
|
int i, f;
|
|
|
|
const unsigned char **ipp, *is, *iold;
|
|
unsigned char **opp, *os, *oold;
|
|
rb_econv_result_t res;
|
|
|
|
try = 1;
|
|
while (try) {
|
|
try = 0;
|
|
for (i = start; i < ec->num_trans; i++) {
|
|
rb_econv_elem_t *te = &ec->elems[i];
|
|
|
|
if (i == 0) {
|
|
ipp = input_ptr;
|
|
is = input_stop;
|
|
}
|
|
else {
|
|
rb_econv_elem_t *prev_te = &ec->elems[i-1];
|
|
ipp = (const unsigned char **)&prev_te->out_data_start;
|
|
is = prev_te->out_data_end;
|
|
}
|
|
|
|
if (i == ec->num_trans-1) {
|
|
opp = output_ptr;
|
|
os = output_stop;
|
|
}
|
|
else {
|
|
if (te->out_buf_start != te->out_data_start) {
|
|
ssize_t len = te->out_data_end - te->out_data_start;
|
|
ssize_t off = te->out_data_start - te->out_buf_start;
|
|
MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
|
|
te->out_data_start = te->out_buf_start;
|
|
te->out_data_end -= off;
|
|
}
|
|
opp = &te->out_data_end;
|
|
os = te->out_buf_end;
|
|
}
|
|
|
|
f = flags;
|
|
if (ec->num_finished != i)
|
|
f |= ECONV_PARTIAL_INPUT;
|
|
if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
|
|
start = 1;
|
|
flags &= ~ECONV_AFTER_OUTPUT;
|
|
}
|
|
if (i != 0)
|
|
f &= ~ECONV_AFTER_OUTPUT;
|
|
iold = *ipp;
|
|
oold = *opp;
|
|
te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
|
|
if (iold != *ipp || oold != *opp)
|
|
try = 1;
|
|
|
|
switch (res) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
case econv_after_output:
|
|
return i;
|
|
|
|
case econv_destination_buffer_full:
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
case econv_finished:
|
|
ec->num_finished = i+1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_trans_conv(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags,
|
|
int *result_position_ptr)
|
|
{
|
|
int i;
|
|
int needreport_index;
|
|
int sweep_start;
|
|
|
|
unsigned char empty_buf;
|
|
unsigned char *empty_ptr = &empty_buf;
|
|
|
|
if (!input_ptr) {
|
|
input_ptr = (const unsigned char **)&empty_ptr;
|
|
input_stop = empty_ptr;
|
|
}
|
|
|
|
if (!output_ptr) {
|
|
output_ptr = &empty_ptr;
|
|
output_stop = empty_ptr;
|
|
}
|
|
|
|
if (ec->elems[0].last_result == econv_after_output)
|
|
ec->elems[0].last_result = econv_source_buffer_empty;
|
|
|
|
for (i = ec->num_trans-1; 0 <= i; i--) {
|
|
switch (ec->elems[i].last_result) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
case econv_after_output:
|
|
case econv_finished:
|
|
sweep_start = i+1;
|
|
goto found_needreport;
|
|
|
|
case econv_destination_buffer_full:
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
default:
|
|
rb_bug("unexpected transcode last result");
|
|
}
|
|
}
|
|
|
|
/* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
|
|
|
|
if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
|
|
(flags & ECONV_AFTER_OUTPUT)) {
|
|
rb_econv_result_t res;
|
|
|
|
res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
|
|
(flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
|
|
result_position_ptr);
|
|
|
|
if (res == econv_source_buffer_empty)
|
|
return econv_after_output;
|
|
return res;
|
|
}
|
|
|
|
sweep_start = 0;
|
|
|
|
found_needreport:
|
|
|
|
do {
|
|
needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
|
|
sweep_start = needreport_index + 1;
|
|
} while (needreport_index != -1 && needreport_index != ec->num_trans-1);
|
|
|
|
for (i = ec->num_trans-1; 0 <= i; i--) {
|
|
if (ec->elems[i].last_result != econv_source_buffer_empty) {
|
|
rb_econv_result_t res = ec->elems[i].last_result;
|
|
if (res == econv_invalid_byte_sequence ||
|
|
res == econv_incomplete_input ||
|
|
res == econv_undefined_conversion ||
|
|
res == econv_after_output) {
|
|
ec->elems[i].last_result = econv_source_buffer_empty;
|
|
}
|
|
if (result_position_ptr)
|
|
*result_position_ptr = i;
|
|
return res;
|
|
}
|
|
}
|
|
if (result_position_ptr)
|
|
*result_position_ptr = -1;
|
|
return econv_source_buffer_empty;
|
|
}
|
|
|
|
static rb_econv_result_t
|
|
rb_econv_convert0(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
rb_econv_result_t res;
|
|
int result_position;
|
|
int has_output = 0;
|
|
|
|
memset(&ec->last_error, 0, sizeof(ec->last_error));
|
|
|
|
if (ec->num_trans == 0) {
|
|
size_t len;
|
|
if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
|
|
if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
|
|
len = output_stop - *output_ptr;
|
|
memcpy(*output_ptr, ec->in_data_start, len);
|
|
*output_ptr = output_stop;
|
|
ec->in_data_start += len;
|
|
res = econv_destination_buffer_full;
|
|
goto gotresult;
|
|
}
|
|
len = ec->in_data_end - ec->in_data_start;
|
|
memcpy(*output_ptr, ec->in_data_start, len);
|
|
*output_ptr += len;
|
|
ec->in_data_start = ec->in_data_end = ec->in_buf_start;
|
|
if (flags & ECONV_AFTER_OUTPUT) {
|
|
res = econv_after_output;
|
|
goto gotresult;
|
|
}
|
|
}
|
|
if (output_stop - *output_ptr < input_stop - *input_ptr) {
|
|
len = output_stop - *output_ptr;
|
|
}
|
|
else {
|
|
len = input_stop - *input_ptr;
|
|
}
|
|
if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
|
|
*(*output_ptr)++ = *(*input_ptr)++;
|
|
res = econv_after_output;
|
|
goto gotresult;
|
|
}
|
|
memcpy(*output_ptr, *input_ptr, len);
|
|
*output_ptr += len;
|
|
*input_ptr += len;
|
|
if (*input_ptr != input_stop)
|
|
res = econv_destination_buffer_full;
|
|
else if (flags & ECONV_PARTIAL_INPUT)
|
|
res = econv_source_buffer_empty;
|
|
else
|
|
res = econv_finished;
|
|
goto gotresult;
|
|
}
|
|
|
|
if (ec->elems[ec->num_trans-1].out_data_start) {
|
|
unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
|
|
unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
|
|
if (data_start != data_end) {
|
|
size_t len;
|
|
if (output_stop - *output_ptr < data_end - data_start) {
|
|
len = output_stop - *output_ptr;
|
|
memcpy(*output_ptr, data_start, len);
|
|
*output_ptr = output_stop;
|
|
ec->elems[ec->num_trans-1].out_data_start += len;
|
|
res = econv_destination_buffer_full;
|
|
goto gotresult;
|
|
}
|
|
len = data_end - data_start;
|
|
memcpy(*output_ptr, data_start, len);
|
|
*output_ptr += len;
|
|
ec->elems[ec->num_trans-1].out_data_start =
|
|
ec->elems[ec->num_trans-1].out_data_end =
|
|
ec->elems[ec->num_trans-1].out_buf_start;
|
|
has_output = 1;
|
|
}
|
|
}
|
|
|
|
if (ec->in_buf_start &&
|
|
ec->in_data_start != ec->in_data_end) {
|
|
res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
|
|
(flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
|
|
if (res != econv_source_buffer_empty)
|
|
goto gotresult;
|
|
}
|
|
|
|
if (has_output &&
|
|
(flags & ECONV_AFTER_OUTPUT) &&
|
|
*input_ptr != input_stop) {
|
|
input_stop = *input_ptr;
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
if (res == econv_source_buffer_empty)
|
|
res = econv_after_output;
|
|
}
|
|
else if ((flags & ECONV_AFTER_OUTPUT) ||
|
|
ec->num_trans == 1) {
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
}
|
|
else {
|
|
flags |= ECONV_AFTER_OUTPUT;
|
|
do {
|
|
res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
|
|
} while (res == econv_after_output);
|
|
}
|
|
|
|
gotresult:
|
|
ec->last_error.result = res;
|
|
if (res == econv_invalid_byte_sequence ||
|
|
res == econv_incomplete_input ||
|
|
res == econv_undefined_conversion) {
|
|
rb_transcoding *error_tc = ec->elems[result_position].tc;
|
|
ec->last_error.error_tc = error_tc;
|
|
ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
|
|
ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
|
|
ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
|
|
ec->last_error.error_bytes_len = error_tc->recognized_len;
|
|
ec->last_error.readagain_len = error_tc->readagain_len;
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static int output_replacement_character(rb_econv_t *ec);
|
|
|
|
static int
|
|
output_hex_charref(rb_econv_t *ec)
|
|
{
|
|
int ret;
|
|
unsigned char utfbuf[1024];
|
|
const unsigned char *utf;
|
|
size_t utf_len;
|
|
int utf_allocated = 0;
|
|
char charef_buf[16];
|
|
const unsigned char *p;
|
|
|
|
if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
|
|
utf = ec->last_error.error_bytes_start;
|
|
utf_len = ec->last_error.error_bytes_len;
|
|
}
|
|
else {
|
|
utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
|
|
ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
|
|
utfbuf, sizeof(utfbuf),
|
|
&utf_len);
|
|
if (!utf)
|
|
return -1;
|
|
if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
|
|
utf_allocated = 1;
|
|
}
|
|
|
|
if (utf_len % 4 != 0)
|
|
goto fail;
|
|
|
|
p = utf;
|
|
while (4 <= utf_len) {
|
|
unsigned int u = 0;
|
|
u += p[0] << 24;
|
|
u += p[1] << 16;
|
|
u += p[2] << 8;
|
|
u += p[3];
|
|
snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
|
|
|
|
ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
|
|
if (ret == -1)
|
|
goto fail;
|
|
|
|
p += 4;
|
|
utf_len -= 4;
|
|
}
|
|
|
|
if (utf_allocated)
|
|
xfree((void *)utf);
|
|
return 0;
|
|
|
|
fail:
|
|
if (utf_allocated)
|
|
xfree((void *)utf);
|
|
return -1;
|
|
}
|
|
|
|
rb_econv_result_t
|
|
rb_econv_convert(rb_econv_t *ec,
|
|
const unsigned char **input_ptr, const unsigned char *input_stop,
|
|
unsigned char **output_ptr, unsigned char *output_stop,
|
|
int flags)
|
|
{
|
|
rb_econv_result_t ret;
|
|
|
|
unsigned char empty_buf;
|
|
unsigned char *empty_ptr = &empty_buf;
|
|
|
|
ec->started = 1;
|
|
|
|
if (!input_ptr) {
|
|
input_ptr = (const unsigned char **)&empty_ptr;
|
|
input_stop = empty_ptr;
|
|
}
|
|
|
|
if (!output_ptr) {
|
|
output_ptr = &empty_ptr;
|
|
output_stop = empty_ptr;
|
|
}
|
|
|
|
resume:
|
|
ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
|
|
|
|
if (ret == econv_invalid_byte_sequence ||
|
|
ret == econv_incomplete_input) {
|
|
/* deal with invalid byte sequence */
|
|
/* todo: add more alternative behaviors */
|
|
switch (ec->flags & ECONV_INVALID_MASK) {
|
|
case ECONV_INVALID_REPLACE:
|
|
if (output_replacement_character(ec) == 0)
|
|
goto resume;
|
|
}
|
|
}
|
|
|
|
if (ret == econv_undefined_conversion) {
|
|
/* valid character in source encoding
|
|
* but no related character(s) in destination encoding */
|
|
/* todo: add more alternative behaviors */
|
|
switch (ec->flags & ECONV_UNDEF_MASK) {
|
|
case ECONV_UNDEF_REPLACE:
|
|
if (output_replacement_character(ec) == 0)
|
|
goto resume;
|
|
break;
|
|
|
|
case ECONV_UNDEF_HEX_CHARREF:
|
|
if (output_hex_charref(ec) == 0)
|
|
goto resume;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
const char *
|
|
rb_econv_encoding_to_insert_output(rb_econv_t *ec)
|
|
{
|
|
rb_transcoding *tc = ec->last_tc;
|
|
const rb_transcoder *tr;
|
|
|
|
if (tc == NULL)
|
|
return "";
|
|
|
|
tr = tc->transcoder;
|
|
|
|
if (tr->asciicompat_type == asciicompat_encoder)
|
|
return tr->src_encoding;
|
|
return tr->dst_encoding;
|
|
}
|
|
|
|
static unsigned char *
|
|
allocate_converted_string(const char *sname, const char *dname,
|
|
const unsigned char *str, size_t len,
|
|
unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
|
|
size_t *dst_len_ptr)
|
|
{
|
|
unsigned char *dst_str;
|
|
size_t dst_len;
|
|
size_t dst_bufsize;
|
|
|
|
rb_econv_t *ec;
|
|
rb_econv_result_t res;
|
|
|
|
const unsigned char *sp;
|
|
unsigned char *dp;
|
|
|
|
if (caller_dst_buf)
|
|
dst_bufsize = caller_dst_bufsize;
|
|
else if (len == 0)
|
|
dst_bufsize = 1;
|
|
else
|
|
dst_bufsize = len;
|
|
|
|
ec = rb_econv_open(sname, dname, 0);
|
|
if (ec == NULL)
|
|
return NULL;
|
|
if (caller_dst_buf)
|
|
dst_str = caller_dst_buf;
|
|
else
|
|
dst_str = xmalloc(dst_bufsize);
|
|
dst_len = 0;
|
|
sp = str;
|
|
dp = dst_str+dst_len;
|
|
res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
|
|
dst_len = dp - dst_str;
|
|
while (res == econv_destination_buffer_full) {
|
|
if (SIZE_MAX/2 < dst_bufsize) {
|
|
goto fail;
|
|
}
|
|
dst_bufsize *= 2;
|
|
if (dst_str == caller_dst_buf) {
|
|
unsigned char *tmp;
|
|
tmp = xmalloc(dst_bufsize);
|
|
memcpy(tmp, dst_str, dst_bufsize/2);
|
|
dst_str = tmp;
|
|
}
|
|
else {
|
|
dst_str = xrealloc(dst_str, dst_bufsize);
|
|
}
|
|
dp = dst_str+dst_len;
|
|
res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
|
|
dst_len = dp - dst_str;
|
|
}
|
|
if (res != econv_finished) {
|
|
goto fail;
|
|
}
|
|
rb_econv_close(ec);
|
|
*dst_len_ptr = dst_len;
|
|
return dst_str;
|
|
|
|
fail:
|
|
if (dst_str != caller_dst_buf)
|
|
xfree(dst_str);
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
|
|
/* result: 0:success -1:failure */
|
|
int
|
|
rb_econv_insert_output(rb_econv_t *ec,
|
|
const unsigned char *str, size_t len, const char *str_encoding)
|
|
{
|
|
const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
|
|
unsigned char insert_buf[4096];
|
|
const unsigned char *insert_str = NULL;
|
|
size_t insert_len;
|
|
|
|
int last_trans_index;
|
|
rb_transcoding *tc;
|
|
|
|
unsigned char **buf_start_p;
|
|
unsigned char **data_start_p;
|
|
unsigned char **data_end_p;
|
|
unsigned char **buf_end_p;
|
|
|
|
size_t need;
|
|
|
|
ec->started = 1;
|
|
|
|
if (len == 0)
|
|
return 0;
|
|
|
|
if (encoding_equal(insert_encoding, str_encoding)) {
|
|
insert_str = str;
|
|
insert_len = len;
|
|
}
|
|
else {
|
|
insert_str = allocate_converted_string(str_encoding, insert_encoding,
|
|
str, len, insert_buf, sizeof(insert_buf), &insert_len);
|
|
if (insert_str == NULL)
|
|
return -1;
|
|
}
|
|
|
|
need = insert_len;
|
|
|
|
last_trans_index = ec->num_trans-1;
|
|
if (ec->num_trans == 0) {
|
|
tc = NULL;
|
|
buf_start_p = &ec->in_buf_start;
|
|
data_start_p = &ec->in_data_start;
|
|
data_end_p = &ec->in_data_end;
|
|
buf_end_p = &ec->in_buf_end;
|
|
}
|
|
else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
|
|
tc = ec->elems[last_trans_index].tc;
|
|
need += tc->readagain_len;
|
|
if (need < insert_len)
|
|
goto fail;
|
|
if (last_trans_index == 0) {
|
|
buf_start_p = &ec->in_buf_start;
|
|
data_start_p = &ec->in_data_start;
|
|
data_end_p = &ec->in_data_end;
|
|
buf_end_p = &ec->in_buf_end;
|
|
}
|
|
else {
|
|
rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
|
|
buf_start_p = &ee->out_buf_start;
|
|
data_start_p = &ee->out_data_start;
|
|
data_end_p = &ee->out_data_end;
|
|
buf_end_p = &ee->out_buf_end;
|
|
}
|
|
}
|
|
else {
|
|
rb_econv_elem_t *ee = &ec->elems[last_trans_index];
|
|
buf_start_p = &ee->out_buf_start;
|
|
data_start_p = &ee->out_data_start;
|
|
data_end_p = &ee->out_data_end;
|
|
buf_end_p = &ee->out_buf_end;
|
|
tc = ec->elems[last_trans_index].tc;
|
|
}
|
|
|
|
if (*buf_start_p == NULL) {
|
|
unsigned char *buf = xmalloc(need);
|
|
*buf_start_p = buf;
|
|
*data_start_p = buf;
|
|
*data_end_p = buf;
|
|
*buf_end_p = buf+need;
|
|
}
|
|
else if ((size_t)(*buf_end_p - *data_end_p) < need) {
|
|
MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
|
|
*data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
|
|
*data_start_p = *buf_start_p;
|
|
if ((size_t)(*buf_end_p - *data_end_p) < need) {
|
|
unsigned char *buf;
|
|
size_t s = (*data_end_p - *buf_start_p) + need;
|
|
if (s < need)
|
|
goto fail;
|
|
buf = xrealloc(*buf_start_p, s);
|
|
*data_start_p = buf;
|
|
*data_end_p = buf + (*data_end_p - *buf_start_p);
|
|
*buf_start_p = buf;
|
|
*buf_end_p = buf + s;
|
|
}
|
|
}
|
|
|
|
memcpy(*data_end_p, insert_str, insert_len);
|
|
*data_end_p += insert_len;
|
|
if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
|
|
memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
|
|
*data_end_p += tc->readagain_len;
|
|
tc->readagain_len = 0;
|
|
}
|
|
|
|
if (insert_str != str && insert_str != insert_buf)
|
|
xfree((void*)insert_str);
|
|
return 0;
|
|
|
|
fail:
|
|
if (insert_str != str && insert_str != insert_buf)
|
|
xfree((void*)insert_str);
|
|
return -1;
|
|
}
|
|
|
|
void
|
|
rb_econv_close(rb_econv_t *ec)
|
|
{
|
|
int i;
|
|
|
|
if (ec->replacement_allocated) {
|
|
xfree((void *)ec->replacement_str);
|
|
}
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
rb_transcoding_close(ec->elems[i].tc);
|
|
if (ec->elems[i].out_buf_start)
|
|
xfree(ec->elems[i].out_buf_start);
|
|
}
|
|
xfree(ec->in_buf_start);
|
|
xfree(ec->elems);
|
|
xfree(ec);
|
|
}
|
|
|
|
size_t
|
|
rb_econv_memsize(rb_econv_t *ec)
|
|
{
|
|
size_t size = sizeof(rb_econv_t);
|
|
int i;
|
|
|
|
if (ec->replacement_allocated) {
|
|
size += ec->replacement_len;
|
|
}
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
size += rb_transcoding_memsize(ec->elems[i].tc);
|
|
|
|
if (ec->elems[i].out_buf_start) {
|
|
size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
|
|
}
|
|
}
|
|
size += ec->in_buf_end - ec->in_buf_start;
|
|
size += sizeof(rb_econv_elem_t) * ec->num_allocated;
|
|
|
|
return size;
|
|
}
|
|
|
|
int
|
|
rb_econv_putbackable(rb_econv_t *ec)
|
|
{
|
|
if (ec->num_trans == 0)
|
|
return 0;
|
|
#if SIZEOF_SIZE_T > SIZEOF_INT
|
|
if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
|
|
#endif
|
|
return (int)ec->elems[0].tc->readagain_len;
|
|
}
|
|
|
|
void
|
|
rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
|
|
{
|
|
rb_transcoding *tc;
|
|
if (ec->num_trans == 0 || n == 0)
|
|
return;
|
|
tc = ec->elems[0].tc;
|
|
memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
|
|
tc->readagain_len -= n;
|
|
}
|
|
|
|
/*
 * Argument block passed through st_foreach() to asciicompat_encoding_i()
 * by rb_econv_asciicompat_encoding().
 */
struct asciicompat_encoding_t {
    /* result: set by the callback, NULL until a match is found */
    const char *ascii_compat_name;
    /* input: the encoding name that was looked up */
    const char *ascii_incompat_name;
};
|
|
|
|
static int
|
|
asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
|
|
{
|
|
struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
|
|
transcoder_entry_t *entry = (transcoder_entry_t *)val;
|
|
const rb_transcoder *tr;
|
|
|
|
if (DECORATOR_P(entry->sname, entry->dname))
|
|
return ST_CONTINUE;
|
|
tr = load_transcoder_entry(entry);
|
|
if (tr && tr->asciicompat_type == asciicompat_decoder) {
|
|
data->ascii_compat_name = tr->dst_encoding;
|
|
return ST_STOP;
|
|
}
|
|
return ST_CONTINUE;
|
|
}
|
|
|
|
const char *
|
|
rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
|
|
{
|
|
st_data_t v;
|
|
st_table *table2;
|
|
struct asciicompat_encoding_t data;
|
|
|
|
if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
|
|
return NULL;
|
|
table2 = (st_table *)v;
|
|
|
|
/*
|
|
* Assumption:
|
|
* There is at most one transcoder for
|
|
* converting from ASCII incompatible encoding.
|
|
*
|
|
* For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
|
|
*/
|
|
if (table2->num_entries != 1)
|
|
return NULL;
|
|
|
|
data.ascii_incompat_name = ascii_incompat_name;
|
|
data.ascii_compat_name = NULL;
|
|
st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
|
|
return data.ascii_compat_name;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
|
|
{
|
|
unsigned const char *sp, *se;
|
|
unsigned char *ds, *dp, *de;
|
|
rb_econv_result_t res;
|
|
int max_output;
|
|
|
|
if (NIL_P(dst)) {
|
|
dst = rb_str_buf_new(len);
|
|
if (ec->destination_encoding)
|
|
rb_enc_associate(dst, ec->destination_encoding);
|
|
}
|
|
|
|
if (ec->last_tc)
|
|
max_output = ec->last_tc->transcoder->max_output;
|
|
else
|
|
max_output = 1;
|
|
|
|
do {
|
|
long dlen = RSTRING_LEN(dst);
|
|
if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
|
|
unsigned long new_capa = (unsigned long)dlen + len + max_output;
|
|
if (LONG_MAX < new_capa)
|
|
rb_raise(rb_eArgError, "too long string");
|
|
rb_str_resize(dst, new_capa);
|
|
rb_str_set_len(dst, dlen);
|
|
}
|
|
sp = (const unsigned char *)ss;
|
|
se = sp + len;
|
|
ds = (unsigned char *)RSTRING_PTR(dst);
|
|
de = ds + rb_str_capacity(dst);
|
|
dp = ds += dlen;
|
|
res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
|
|
len -= (const char *)sp - ss;
|
|
ss = (const char *)sp;
|
|
rb_str_set_len(dst, dlen + (dp - ds));
|
|
rb_econv_check_error(ec);
|
|
} while (res == econv_destination_buffer_full);
|
|
|
|
return dst;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
|
|
{
|
|
src = rb_str_new_frozen(src);
|
|
dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
|
|
RB_GC_GUARD(src);
|
|
return dst;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
|
|
{
|
|
return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
|
|
}
|
|
|
|
static int
|
|
rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
|
|
{
|
|
transcoder_entry_t *entry;
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->started != 0)
|
|
return -1;
|
|
|
|
entry = get_transcoder_entry(sname, dname);
|
|
if (!entry)
|
|
return -1;
|
|
|
|
tr = load_transcoder_entry(entry);
|
|
if (!tr) return -1;
|
|
|
|
return rb_econv_add_transcoder_at(ec, tr, n);
|
|
}
|
|
|
|
static int
|
|
rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
|
|
{
|
|
return rb_econv_add_converter(ec, "", decorator_name, n);
|
|
}
|
|
|
|
int
|
|
rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
|
|
{
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->num_trans == 0)
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
|
|
tr = ec->elems[0].tc->transcoder;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_decoder)
|
|
return rb_econv_decorate_at(ec, decorator_name, 1);
|
|
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
}
|
|
|
|
int
|
|
rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
|
|
{
|
|
const rb_transcoder *tr;
|
|
|
|
if (ec->num_trans == 0)
|
|
return rb_econv_decorate_at(ec, decorator_name, 0);
|
|
|
|
tr = ec->elems[ec->num_trans-1].tc->transcoder;
|
|
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_encoder)
|
|
return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
|
|
|
|
return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
|
|
}
|
|
|
|
void
|
|
rb_econv_binmode(rb_econv_t *ec)
|
|
{
|
|
const char *dname = 0;
|
|
|
|
switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
|
|
case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
|
|
dname = "universal_newline";
|
|
break;
|
|
case ECONV_CRLF_NEWLINE_DECORATOR:
|
|
dname = "crlf_newline";
|
|
break;
|
|
case ECONV_CR_NEWLINE_DECORATOR:
|
|
dname = "cr_newline";
|
|
break;
|
|
}
|
|
|
|
if (dname) {
|
|
const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
|
|
int num_trans = ec->num_trans;
|
|
int i, j = 0;
|
|
|
|
for (i=0; i < num_trans; i++) {
|
|
if (transcoder == ec->elems[i].tc->transcoder) {
|
|
rb_transcoding_close(ec->elems[i].tc);
|
|
xfree(ec->elems[i].out_buf_start);
|
|
ec->num_trans--;
|
|
}
|
|
else
|
|
ec->elems[j++] = ec->elems[i];
|
|
}
|
|
}
|
|
|
|
ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
|
|
}
|
|
|
|
static VALUE
|
|
econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
|
|
{
|
|
int has_description = 0;
|
|
|
|
if (NIL_P(mesg))
|
|
mesg = rb_str_new(NULL, 0);
|
|
|
|
if (*sname != '\0' || *dname != '\0') {
|
|
if (*sname == '\0')
|
|
rb_str_cat2(mesg, dname);
|
|
else if (*dname == '\0')
|
|
rb_str_cat2(mesg, sname);
|
|
else
|
|
rb_str_catf(mesg, "%s to %s", sname, dname);
|
|
has_description = 1;
|
|
}
|
|
|
|
if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
|
|
ECONV_XML_TEXT_DECORATOR|
|
|
ECONV_XML_ATTR_CONTENT_DECORATOR|
|
|
ECONV_XML_ATTR_QUOTE_DECORATOR)) {
|
|
const char *pre = "";
|
|
if (has_description)
|
|
rb_str_cat2(mesg, " with ");
|
|
if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "universal_newline");
|
|
}
|
|
if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "crlf_newline");
|
|
}
|
|
if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "cr_newline");
|
|
}
|
|
if (ecflags & ECONV_XML_TEXT_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_text");
|
|
}
|
|
if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_attr_content");
|
|
}
|
|
if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
|
|
rb_str_cat2(mesg, pre); pre = ",";
|
|
rb_str_cat2(mesg, "xml_attr_quote");
|
|
}
|
|
has_description = 1;
|
|
}
|
|
if (!has_description) {
|
|
rb_str_cat2(mesg, "no-conversion");
|
|
}
|
|
|
|
return mesg;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
|
|
{
|
|
VALUE mesg, exc;
|
|
mesg = rb_str_new_cstr("code converter not found (");
|
|
econv_description(sname, dname, ecflags, mesg);
|
|
rb_str_cat2(mesg, ")");
|
|
exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
|
|
return exc;
|
|
}
|
|
|
|
static VALUE
|
|
make_econv_exception(rb_econv_t *ec)
|
|
{
|
|
VALUE mesg, exc;
|
|
if (ec->last_error.result == econv_invalid_byte_sequence ||
|
|
ec->last_error.result == econv_incomplete_input) {
|
|
const char *err = (const char *)ec->last_error.error_bytes_start;
|
|
size_t error_len = ec->last_error.error_bytes_len;
|
|
VALUE bytes = rb_str_new(err, error_len);
|
|
VALUE dumped = rb_str_dump(bytes);
|
|
size_t readagain_len = ec->last_error.readagain_len;
|
|
VALUE bytes2 = Qnil;
|
|
VALUE dumped2;
|
|
if (ec->last_error.result == econv_incomplete_input) {
|
|
mesg = rb_sprintf("incomplete %s on %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
else if (readagain_len) {
|
|
bytes2 = rb_str_new(err+error_len, readagain_len);
|
|
dumped2 = rb_str_dump(bytes2);
|
|
mesg = rb_sprintf("%s followed by %s on %s",
|
|
StringValueCStr(dumped),
|
|
StringValueCStr(dumped2),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
else {
|
|
mesg = rb_sprintf("%s on %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding);
|
|
}
|
|
|
|
exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
|
|
rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
|
|
rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
|
|
rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
|
|
goto set_encs;
|
|
}
|
|
if (ec->last_error.result == econv_undefined_conversion) {
|
|
VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
|
|
ec->last_error.error_bytes_len);
|
|
VALUE dumped = Qnil;
|
|
int idx;
|
|
if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
|
|
rb_encoding *utf8 = rb_utf8_encoding();
|
|
const char *start, *end;
|
|
int n;
|
|
start = (const char *)ec->last_error.error_bytes_start;
|
|
end = start + ec->last_error.error_bytes_len;
|
|
n = rb_enc_precise_mbclen(start, end, utf8);
|
|
if (MBCLEN_CHARFOUND_P(n) &&
|
|
(size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
|
|
unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
|
|
dumped = rb_sprintf("U+%04X", cc);
|
|
}
|
|
}
|
|
if (dumped == Qnil)
|
|
dumped = rb_str_dump(bytes);
|
|
if (strcmp(ec->last_error.source_encoding,
|
|
ec->source_encoding_name) == 0 &&
|
|
strcmp(ec->last_error.destination_encoding,
|
|
ec->destination_encoding_name) == 0) {
|
|
mesg = rb_sprintf("%s from %s to %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.source_encoding,
|
|
ec->last_error.destination_encoding);
|
|
}
|
|
else {
|
|
int i;
|
|
mesg = rb_sprintf("%s to %s in conversion from %s",
|
|
StringValueCStr(dumped),
|
|
ec->last_error.destination_encoding,
|
|
ec->source_encoding_name);
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
const rb_transcoder *tr = ec->elems[i].tc->transcoder;
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
|
|
rb_str_catf(mesg, " to %s",
|
|
ec->elems[i].tc->transcoder->dst_encoding);
|
|
}
|
|
}
|
|
exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
|
|
idx = rb_enc_find_index(ec->last_error.source_encoding);
|
|
if (0 <= idx)
|
|
rb_enc_associate_index(bytes, idx);
|
|
rb_ivar_set(exc, rb_intern("error_char"), bytes);
|
|
goto set_encs;
|
|
}
|
|
return Qnil;
|
|
|
|
set_encs:
|
|
rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
|
|
rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
|
|
int idx = rb_enc_find_index(ec->last_error.source_encoding);
|
|
if (0 <= idx)
|
|
rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
|
|
idx = rb_enc_find_index(ec->last_error.destination_encoding);
|
|
if (0 <= idx)
|
|
rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
|
|
return exc;
|
|
}
|
|
|
|
static void
|
|
more_output_buffer(
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, size_t, size_t),
|
|
int max_output,
|
|
unsigned char **out_start_ptr,
|
|
unsigned char **out_pos,
|
|
unsigned char **out_stop_ptr)
|
|
{
|
|
size_t len = (*out_pos - *out_start_ptr);
|
|
size_t new_len = (len + max_output) * 2;
|
|
*out_start_ptr = resize_destination(destination, len, new_len);
|
|
*out_pos = *out_start_ptr + len;
|
|
*out_stop_ptr = *out_start_ptr + new_len;
|
|
}
|
|
|
|
static int
|
|
make_replacement(rb_econv_t *ec)
|
|
{
|
|
rb_transcoding *tc;
|
|
const rb_transcoder *tr;
|
|
const unsigned char *replacement;
|
|
const char *repl_enc;
|
|
const char *ins_enc;
|
|
size_t len;
|
|
|
|
if (ec->replacement_str)
|
|
return 0;
|
|
|
|
ins_enc = rb_econv_encoding_to_insert_output(ec);
|
|
|
|
tc = ec->last_tc;
|
|
if (*ins_enc) {
|
|
tr = tc->transcoder;
|
|
rb_enc_find(tr->dst_encoding);
|
|
replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
|
|
}
|
|
else {
|
|
replacement = (unsigned char *)"?";
|
|
len = 1;
|
|
repl_enc = "";
|
|
}
|
|
|
|
ec->replacement_str = replacement;
|
|
ec->replacement_len = len;
|
|
ec->replacement_enc = repl_enc;
|
|
ec->replacement_allocated = 0;
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
rb_econv_set_replacement(rb_econv_t *ec,
|
|
const unsigned char *str, size_t len, const char *encname)
|
|
{
|
|
unsigned char *str2;
|
|
size_t len2;
|
|
const char *encname2;
|
|
|
|
encname2 = rb_econv_encoding_to_insert_output(ec);
|
|
|
|
if (!*encname2 || encoding_equal(encname, encname2)) {
|
|
str2 = xmalloc(len);
|
|
MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
|
|
len2 = len;
|
|
encname2 = encname;
|
|
}
|
|
else {
|
|
str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
|
|
if (!str2)
|
|
return -1;
|
|
}
|
|
|
|
if (ec->replacement_allocated) {
|
|
xfree((void *)ec->replacement_str);
|
|
}
|
|
ec->replacement_allocated = 1;
|
|
ec->replacement_str = str2;
|
|
ec->replacement_len = len2;
|
|
ec->replacement_enc = encname2;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
output_replacement_character(rb_econv_t *ec)
|
|
{
|
|
int ret;
|
|
|
|
if (make_replacement(ec) == -1)
|
|
return -1;
|
|
|
|
ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
|
|
if (ret == -1)
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
#if 1
|
|
#define hash_fallback rb_hash_aref
|
|
|
|
static VALUE
|
|
proc_fallback(VALUE fallback, VALUE c)
|
|
{
|
|
return rb_proc_call(fallback, rb_ary_new4(1, &c));
|
|
}
|
|
|
|
static VALUE
|
|
method_fallback(VALUE fallback, VALUE c)
|
|
{
|
|
return rb_method_call(1, &c, fallback);
|
|
}
|
|
|
|
static VALUE
|
|
aref_fallback(VALUE fallback, VALUE c)
|
|
{
|
|
return rb_funcallv_public(fallback, idAREF, 1, &c);
|
|
}
|
|
|
|
static void
|
|
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, size_t, size_t),
|
|
const char *src_encoding,
|
|
const char *dst_encoding,
|
|
int ecflags,
|
|
VALUE ecopts)
|
|
{
|
|
rb_econv_t *ec;
|
|
rb_transcoding *last_tc;
|
|
rb_econv_result_t ret;
|
|
unsigned char *out_start = *out_pos;
|
|
int max_output;
|
|
VALUE exc;
|
|
VALUE fallback = Qnil;
|
|
VALUE (*fallback_func)(VALUE, VALUE) = 0;
|
|
|
|
ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
|
|
if (!ec)
|
|
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
|
|
|
|
if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
|
|
fallback = rb_hash_aref(ecopts, sym_fallback);
|
|
if (RB_TYPE_P(fallback, T_HASH)) {
|
|
fallback_func = hash_fallback;
|
|
}
|
|
else if (rb_obj_is_proc(fallback)) {
|
|
fallback_func = proc_fallback;
|
|
}
|
|
else if (rb_obj_is_method(fallback)) {
|
|
fallback_func = method_fallback;
|
|
}
|
|
else {
|
|
fallback_func = aref_fallback;
|
|
}
|
|
}
|
|
last_tc = ec->last_tc;
|
|
max_output = last_tc ? last_tc->transcoder->max_output : 1;
|
|
|
|
resume:
|
|
ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
|
|
|
|
if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
|
|
VALUE rep = rb_enc_str_new(
|
|
(const char *)ec->last_error.error_bytes_start,
|
|
ec->last_error.error_bytes_len,
|
|
rb_enc_find(ec->last_error.source_encoding));
|
|
rep = (*fallback_func)(fallback, rep);
|
|
if (rep != Qundef && !NIL_P(rep)) {
|
|
StringValue(rep);
|
|
ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
|
|
RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
|
|
if ((int)ret == -1) {
|
|
rb_raise(rb_eArgError, "too big fallback string");
|
|
}
|
|
goto resume;
|
|
}
|
|
}
|
|
|
|
if (ret == econv_invalid_byte_sequence ||
|
|
ret == econv_incomplete_input ||
|
|
ret == econv_undefined_conversion) {
|
|
exc = make_econv_exception(ec);
|
|
rb_econv_close(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret == econv_destination_buffer_full) {
|
|
more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
|
|
goto resume;
|
|
}
|
|
|
|
rb_econv_close(ec);
|
|
return;
|
|
}
|
|
#else
|
|
/* sample transcode_loop implementation in byte-by-byte stream style */
|
|
static void
|
|
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
|
|
const unsigned char *in_stop, unsigned char *out_stop,
|
|
VALUE destination,
|
|
unsigned char *(*resize_destination)(VALUE, size_t, size_t),
|
|
const char *src_encoding,
|
|
const char *dst_encoding,
|
|
int ecflags,
|
|
VALUE ecopts)
|
|
{
|
|
rb_econv_t *ec;
|
|
rb_transcoding *last_tc;
|
|
rb_econv_result_t ret;
|
|
unsigned char *out_start = *out_pos;
|
|
const unsigned char *ptr;
|
|
int max_output;
|
|
VALUE exc;
|
|
|
|
ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
|
|
if (!ec)
|
|
rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
|
|
|
|
last_tc = ec->last_tc;
|
|
max_output = last_tc ? last_tc->transcoder->max_output : 1;
|
|
|
|
ret = econv_source_buffer_empty;
|
|
ptr = *in_pos;
|
|
while (ret != econv_finished) {
|
|
unsigned char input_byte;
|
|
const unsigned char *p = &input_byte;
|
|
|
|
if (ret == econv_source_buffer_empty) {
|
|
if (ptr < in_stop) {
|
|
input_byte = *ptr;
|
|
ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
|
|
}
|
|
else {
|
|
ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
|
|
}
|
|
}
|
|
else {
|
|
ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
|
|
}
|
|
if (&input_byte != p)
|
|
ptr += p - &input_byte;
|
|
switch (ret) {
|
|
case econv_invalid_byte_sequence:
|
|
case econv_incomplete_input:
|
|
case econv_undefined_conversion:
|
|
exc = make_econv_exception(ec);
|
|
rb_econv_close(ec);
|
|
rb_exc_raise(exc);
|
|
break;
|
|
|
|
case econv_destination_buffer_full:
|
|
more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
|
|
break;
|
|
|
|
case econv_source_buffer_empty:
|
|
break;
|
|
|
|
case econv_finished:
|
|
break;
|
|
}
|
|
}
|
|
rb_econv_close(ec);
|
|
*in_pos = in_stop;
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
|
|
/*
|
|
* String-specific code
|
|
*/
|
|
|
|
static unsigned char *
|
|
str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
|
|
{
|
|
rb_str_resize(destination, new_len);
|
|
return (unsigned char *)RSTRING_PTR(destination);
|
|
}
|
|
|
|
static int
|
|
econv_opts(VALUE opt, int ecflags)
|
|
{
|
|
VALUE v;
|
|
int newlineflag = 0;
|
|
|
|
v = rb_hash_aref(opt, sym_invalid);
|
|
if (NIL_P(v)) {
|
|
}
|
|
else if (v==sym_replace) {
|
|
ecflags |= ECONV_INVALID_REPLACE;
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unknown value for invalid character option");
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_undef);
|
|
if (NIL_P(v)) {
|
|
}
|
|
else if (v==sym_replace) {
|
|
ecflags |= ECONV_UNDEF_REPLACE;
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unknown value for undefined character option");
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_replace);
|
|
if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
|
|
ecflags |= ECONV_UNDEF_REPLACE;
|
|
}
|
|
|
|
v = rb_hash_aref(opt, sym_xml);
|
|
if (!NIL_P(v)) {
|
|
if (v==sym_text) {
|
|
ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
|
|
}
|
|
else if (v==sym_attr) {
|
|
ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
|
|
}
|
|
else if (RB_TYPE_P(v, T_SYMBOL)) {
|
|
rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unexpected value for xml option");
|
|
}
|
|
}
|
|
|
|
#ifdef ENABLE_ECONV_NEWLINE_OPTION
|
|
v = rb_hash_aref(opt, sym_newline);
|
|
if (!NIL_P(v)) {
|
|
newlineflag = 2;
|
|
ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
|
|
if (v == sym_universal) {
|
|
ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
|
|
}
|
|
else if (v == sym_crlf) {
|
|
ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
|
|
}
|
|
else if (v == sym_cr) {
|
|
ecflags |= ECONV_CR_NEWLINE_DECORATOR;
|
|
}
|
|
else if (v == sym_lf) {
|
|
/* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
|
|
}
|
|
else if (SYMBOL_P(v)) {
|
|
rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
|
|
rb_sym2str(v));
|
|
}
|
|
else {
|
|
rb_raise(rb_eArgError, "unexpected value for newline option");
|
|
}
|
|
}
|
|
#endif
|
|
{
|
|
int setflags = 0;
|
|
|
|
v = rb_hash_aref(opt, sym_universal_newline);
|
|
if (RTEST(v))
|
|
setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
|
|
newlineflag |= !NIL_P(v);
|
|
|
|
v = rb_hash_aref(opt, sym_crlf_newline);
|
|
if (RTEST(v))
|
|
setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
|
|
newlineflag |= !NIL_P(v);
|
|
|
|
v = rb_hash_aref(opt, sym_cr_newline);
|
|
if (RTEST(v))
|
|
setflags |= ECONV_CR_NEWLINE_DECORATOR;
|
|
newlineflag |= !NIL_P(v);
|
|
|
|
switch (newlineflag) {
|
|
case 1:
|
|
ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
|
|
ecflags |= setflags;
|
|
break;
|
|
|
|
case 3:
|
|
rb_warning(":newline option preceds other newline options");
|
|
break;
|
|
}
|
|
}
|
|
|
|
return ecflags;
|
|
}
|
|
|
|
int
|
|
rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
|
|
{
|
|
VALUE newhash = Qnil;
|
|
VALUE v;
|
|
|
|
if (NIL_P(opthash)) {
|
|
*opts = Qnil;
|
|
return ecflags;
|
|
}
|
|
ecflags = econv_opts(opthash, ecflags);
|
|
|
|
v = rb_hash_aref(opthash, sym_replace);
|
|
if (!NIL_P(v)) {
|
|
StringValue(v);
|
|
if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
|
|
VALUE dumped = rb_str_dump(v);
|
|
rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
|
|
StringValueCStr(dumped),
|
|
rb_enc_name(rb_enc_get(v)));
|
|
}
|
|
v = rb_str_new_frozen(v);
|
|
newhash = rb_hash_new();
|
|
rb_hash_aset(newhash, sym_replace, v);
|
|
}
|
|
|
|
v = rb_hash_aref(opthash, sym_fallback);
|
|
if (!NIL_P(v)) {
|
|
VALUE h = rb_check_hash_type(v);
|
|
if (NIL_P(h)
|
|
? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
|
|
: (v = h, 1)) {
|
|
if (NIL_P(newhash))
|
|
newhash = rb_hash_new();
|
|
rb_hash_aset(newhash, sym_fallback, v);
|
|
}
|
|
}
|
|
|
|
if (!NIL_P(newhash))
|
|
rb_hash_freeze(newhash);
|
|
*opts = newhash;
|
|
|
|
return ecflags;
|
|
}
|
|
|
|
int
|
|
rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
|
|
{
|
|
return rb_econv_prepare_options(opthash, opts, 0);
|
|
}
|
|
|
|
rb_econv_t *
|
|
rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
|
|
{
|
|
rb_econv_t *ec;
|
|
VALUE replacement;
|
|
|
|
if (NIL_P(opthash)) {
|
|
replacement = Qnil;
|
|
}
|
|
else {
|
|
if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
|
|
rb_bug("rb_econv_open_opts called with invalid opthash");
|
|
replacement = rb_hash_aref(opthash, sym_replace);
|
|
}
|
|
|
|
ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
|
|
if (!ec)
|
|
return ec;
|
|
|
|
if (!NIL_P(replacement)) {
|
|
int ret;
|
|
rb_encoding *enc = rb_enc_get(replacement);
|
|
|
|
ret = rb_econv_set_replacement(ec,
|
|
(const unsigned char *)RSTRING_PTR(replacement),
|
|
RSTRING_LEN(replacement),
|
|
rb_enc_name(enc));
|
|
if (ret == -1) {
|
|
rb_econv_close(ec);
|
|
return NULL;
|
|
}
|
|
}
|
|
return ec;
|
|
}
|
|
|
|
static int
|
|
enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
|
|
{
|
|
rb_encoding *enc;
|
|
const char *n;
|
|
int encidx;
|
|
VALUE encval;
|
|
|
|
if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
|
|
!(enc = rb_enc_from_index(encidx))) {
|
|
enc = NULL;
|
|
encidx = 0;
|
|
n = StringValueCStr(*arg);
|
|
}
|
|
else {
|
|
n = rb_enc_name(enc);
|
|
}
|
|
|
|
*name_p = n;
|
|
*enc_p = enc;
|
|
|
|
return encidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
|
|
const char **sname_p, rb_encoding **senc_p,
|
|
const char **dname_p, rb_encoding **denc_p)
|
|
{
|
|
rb_encoding *senc, *denc;
|
|
const char *sname, *dname;
|
|
int sencidx, dencidx;
|
|
|
|
dencidx = enc_arg(arg1, &dname, &denc);
|
|
|
|
if (NIL_P(*arg2)) {
|
|
sencidx = rb_enc_get_index(str);
|
|
senc = rb_enc_from_index(sencidx);
|
|
sname = rb_enc_name(senc);
|
|
}
|
|
else {
|
|
sencidx = enc_arg(arg2, &sname, &senc);
|
|
}
|
|
|
|
*sname_p = sname;
|
|
*senc_p = senc;
|
|
*dname_p = dname;
|
|
*denc_p = denc;
|
|
return dencidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
|
|
{
|
|
VALUE dest;
|
|
VALUE str = *self;
|
|
VALUE arg1, arg2;
|
|
long blen, slen;
|
|
unsigned char *buf, *bp, *sp;
|
|
const unsigned char *fromp;
|
|
rb_encoding *senc, *denc;
|
|
const char *sname, *dname;
|
|
int dencidx;
|
|
int explicitly_invalid_replace = TRUE;
|
|
|
|
rb_check_arity(argc, 0, 2);
|
|
|
|
if (argc == 0) {
|
|
arg1 = rb_enc_default_internal();
|
|
if (NIL_P(arg1)) {
|
|
if (!ecflags) return -1;
|
|
arg1 = rb_obj_encoding(str);
|
|
}
|
|
if (!(ecflags & ECONV_INVALID_MASK)) {
|
|
explicitly_invalid_replace = FALSE;
|
|
}
|
|
ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
|
|
}
|
|
else {
|
|
arg1 = argv[0];
|
|
}
|
|
arg2 = argc<=1 ? Qnil : argv[1];
|
|
dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
|
|
|
|
if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
|
|
ECONV_XML_TEXT_DECORATOR|
|
|
ECONV_XML_ATTR_CONTENT_DECORATOR|
|
|
ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
|
|
if (senc && senc == denc) {
|
|
if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
|
|
VALUE rep = Qnil;
|
|
if (!NIL_P(ecopts)) {
|
|
rep = rb_hash_aref(ecopts, sym_replace);
|
|
}
|
|
dest = rb_enc_str_scrub(senc, str, rep);
|
|
if (NIL_P(dest)) dest = str;
|
|
*self = dest;
|
|
return dencidx;
|
|
}
|
|
return NIL_P(arg2) ? -1 : dencidx;
|
|
}
|
|
if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
|
|
if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
|
|
return dencidx;
|
|
}
|
|
}
|
|
if (encoding_equal(sname, dname)) {
|
|
return NIL_P(arg2) ? -1 : dencidx;
|
|
}
|
|
}
|
|
else {
|
|
if (encoding_equal(sname, dname)) {
|
|
sname = "";
|
|
dname = "";
|
|
}
|
|
}
|
|
|
|
fromp = sp = (unsigned char *)RSTRING_PTR(str);
|
|
slen = RSTRING_LEN(str);
|
|
blen = slen + 30; /* len + margin */
|
|
dest = rb_str_tmp_new(blen);
|
|
bp = (unsigned char *)RSTRING_PTR(dest);
|
|
|
|
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
|
|
if (fromp != sp+slen) {
|
|
rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
|
|
}
|
|
buf = (unsigned char *)RSTRING_PTR(dest);
|
|
*bp = '\0';
|
|
rb_str_set_len(dest, bp - buf);
|
|
|
|
/* set encoding */
|
|
if (!denc) {
|
|
dencidx = rb_define_dummy_encoding(dname);
|
|
RB_GC_GUARD(arg1);
|
|
RB_GC_GUARD(arg2);
|
|
}
|
|
*self = dest;
|
|
|
|
return dencidx;
|
|
}
|
|
|
|
static int
|
|
str_transcode(int argc, VALUE *argv, VALUE *self)
|
|
{
|
|
VALUE opt;
|
|
int ecflags = 0;
|
|
VALUE ecopts = Qnil;
|
|
|
|
argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
|
|
if (!NIL_P(opt)) {
|
|
ecflags = rb_econv_prepare_opts(opt, &ecopts);
|
|
}
|
|
return str_transcode0(argc, argv, self, ecflags, ecopts);
|
|
}
|
|
|
|
static inline VALUE
|
|
str_encode_associate(VALUE str, int encidx)
|
|
{
|
|
int cr = 0;
|
|
|
|
rb_enc_associate_index(str, encidx);
|
|
|
|
/* transcoded string never be broken. */
|
|
if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
|
|
rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
|
|
}
|
|
else {
|
|
cr = ENC_CODERANGE_VALID;
|
|
}
|
|
ENC_CODERANGE_SET(str, cr);
|
|
return str;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* str.encode!(encoding [, options] ) -> str
|
|
* str.encode!(dst_encoding, src_encoding [, options] ) -> str
|
|
*
|
|
* The first form transcodes the contents of <i>str</i> from
|
|
* str.encoding to +encoding+.
|
|
* The second form transcodes the contents of <i>str</i> from
|
|
* src_encoding to dst_encoding.
|
|
* The options Hash gives details for conversion. See String#encode
|
|
* for details.
|
|
* Returns the string even if no changes were made.
|
|
*/
|
|
|
|
static VALUE
|
|
str_encode_bang(int argc, VALUE *argv, VALUE str)
|
|
{
|
|
VALUE newstr;
|
|
int encidx;
|
|
|
|
rb_check_frozen(str);
|
|
|
|
newstr = str;
|
|
encidx = str_transcode(argc, argv, &newstr);
|
|
|
|
if (encidx < 0) return str;
|
|
if (newstr == str) {
|
|
rb_enc_associate_index(str, encidx);
|
|
return str;
|
|
}
|
|
rb_str_shared_replace(str, newstr);
|
|
return str_encode_associate(str, encidx);
|
|
}
|
|
|
|
static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
|
|
|
|
/*
|
|
* call-seq:
|
|
* str.encode(encoding [, options] ) -> str
|
|
* str.encode(dst_encoding, src_encoding [, options] ) -> str
|
|
* str.encode([options]) -> str
|
|
*
|
|
* The first form returns a copy of +str+ transcoded
|
|
* to encoding +encoding+.
|
|
* The second form returns a copy of +str+ transcoded
|
|
* from src_encoding to dst_encoding.
|
|
* The last form returns a copy of +str+ transcoded to
|
|
* <tt>Encoding.default_internal</tt>.
|
|
*
|
|
* By default, the first and second form raise
|
|
* Encoding::UndefinedConversionError for characters that are
|
|
* undefined in the destination encoding, and
|
|
* Encoding::InvalidByteSequenceError for invalid byte sequences
|
|
* in the source encoding. The last form by default does not raise
|
|
* exceptions but uses replacement strings.
|
|
*
|
|
* The +options+ Hash gives details for conversion and can have the following
|
|
* keys:
|
|
*
|
|
* :invalid ::
|
|
* If the value is +:replace+, #encode replaces invalid byte sequences in
|
|
* +str+ with the replacement character. The default is to raise the
|
|
* Encoding::InvalidByteSequenceError exception
|
|
* :undef ::
|
|
* If the value is +:replace+, #encode replaces characters which are
|
|
* undefined in the destination encoding with the replacement character.
|
|
* The default is to raise the Encoding::UndefinedConversionError.
|
|
* :replace ::
|
|
* Sets the replacement string to the given value. The default replacement
|
|
* string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
|
|
* :fallback ::
|
|
* Sets the replacement string by the given object for undefined
|
|
* character. The object should be a Hash, a Proc, a Method, or an
|
|
* object which has [] method.
|
|
* Its key is an undefined character encoded in the source encoding
|
|
* of current transcoder. Its value can be in any encoding as long as it
|
|
* can be converted into the destination encoding of the transcoder.
|
|
* :xml ::
|
|
* The value must be +:text+ or +:attr+.
|
|
* If the value is +:text+ #encode replaces undefined characters with their
|
|
* (upper-case hexadecimal) numeric character references. '&', '<', and '>'
|
|
* are converted to "&amp;", "&lt;", and "&gt;", respectively.
|
|
* If the value is +:attr+, #encode also quotes the replacement result
|
|
* (using '"'), and replaces '"' with "&quot;".
|
|
* :cr_newline ::
|
|
* Replaces LF ("\n") with CR ("\r") if value is true.
|
|
* :crlf_newline ::
|
|
* Replaces LF ("\n") with CRLF ("\r\n") if value is true.
|
|
* :universal_newline ::
|
|
* Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
|
|
*/
|
|
|
|
static VALUE
|
|
str_encode(int argc, VALUE *argv, VALUE str)
|
|
{
|
|
VALUE newstr = str;
|
|
int encidx = str_transcode(argc, argv, &newstr);
|
|
return encoded_dup(newstr, str, encidx);
|
|
}
|
|
|
|
VALUE
|
|
rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
|
|
{
|
|
int argc = 1;
|
|
VALUE *argv = &to;
|
|
VALUE newstr = str;
|
|
int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
|
|
return encoded_dup(newstr, str, encidx);
|
|
}
|
|
|
|
static VALUE
|
|
encoded_dup(VALUE newstr, VALUE str, int encidx)
|
|
{
|
|
if (encidx < 0) return rb_str_dup(str);
|
|
if (newstr == str) {
|
|
newstr = rb_str_dup(str);
|
|
rb_enc_associate_index(newstr, encidx);
|
|
return newstr;
|
|
}
|
|
else {
|
|
RBASIC_SET_CLASS(newstr, rb_obj_class(str));
|
|
}
|
|
return str_encode_associate(newstr, encidx);
|
|
}
|
|
|
|
/*
|
|
* Document-class: Encoding::Converter
|
|
*
|
|
* Encoding conversion class.
|
|
*/
|
|
static void
|
|
econv_free(void *ptr)
|
|
{
|
|
rb_econv_t *ec = ptr;
|
|
rb_econv_close(ec);
|
|
}
|
|
|
|
static size_t
|
|
econv_memsize(const void *ptr)
|
|
{
|
|
return sizeof(rb_econv_t);
|
|
}
|
|
|
|
static const rb_data_type_t econv_data_type = {
|
|
"econv",
|
|
{0, econv_free, econv_memsize,},
|
|
0, 0, RUBY_TYPED_FREE_IMMEDIATELY
|
|
};
|
|
|
|
static VALUE
|
|
econv_s_allocate(VALUE klass)
|
|
{
|
|
return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
|
|
}
|
|
|
|
/* Define a dummy encoding called `name` and return its rb_encoding. */
static rb_encoding *
make_dummy_encoding(const char *name)
{
    return rb_enc_from_index(rb_define_dummy_encoding(name));
}
|
|
|
|
/* Look up the encoding `name`; define a dummy encoding when it is not found. */
static rb_encoding *
make_encoding(const char *name)
{
    rb_encoding *found = rb_enc_find(name);

    return found ? found : make_dummy_encoding(name);
}
|
|
|
|
static VALUE
|
|
make_encobj(const char *name)
|
|
{
|
|
return rb_enc_from_encoding(make_encoding(name));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
|
|
* Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
|
|
*
|
|
* Returns the corresponding ASCII compatible encoding.
|
|
*
|
|
* Returns nil if the argument is an ASCII compatible encoding.
|
|
*
|
|
* "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
|
|
* can represent exactly the same characters as the given ASCII incompatible encoding.
|
|
* So, no conversion undefined error occurs when converting between the two encodings.
|
|
*
|
|
* Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
|
|
* Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
|
|
* Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
|
|
{
|
|
const char *arg_name, *result_name;
|
|
rb_encoding *arg_enc, *result_enc;
|
|
|
|
enc_arg(&arg, &arg_name, &arg_enc);
|
|
|
|
result_name = rb_econv_asciicompat_encoding(arg_name);
|
|
|
|
if (result_name == NULL)
|
|
return Qnil;
|
|
|
|
result_enc = make_encoding(result_name);
|
|
|
|
return rb_enc_from_encoding(result_enc);
|
|
}
|
|
|
|
static void
|
|
econv_args(int argc, VALUE *argv,
|
|
VALUE *snamev_p, VALUE *dnamev_p,
|
|
const char **sname_p, const char **dname_p,
|
|
rb_encoding **senc_p, rb_encoding **denc_p,
|
|
int *ecflags_p,
|
|
VALUE *ecopts_p)
|
|
{
|
|
VALUE opt, flags_v, ecopts;
|
|
int sidx, didx;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
int ecflags;
|
|
|
|
argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
|
|
|
|
if (!NIL_P(flags_v)) {
|
|
if (!NIL_P(opt)) {
|
|
rb_error_arity(argc + 1, 2, 3);
|
|
}
|
|
ecflags = NUM2INT(rb_to_int(flags_v));
|
|
ecopts = Qnil;
|
|
}
|
|
else if (!NIL_P(opt)) {
|
|
ecflags = rb_econv_prepare_opts(opt, &ecopts);
|
|
}
|
|
else {
|
|
ecflags = 0;
|
|
ecopts = Qnil;
|
|
}
|
|
|
|
senc = NULL;
|
|
sidx = rb_to_encoding_index(*snamev_p);
|
|
if (0 <= sidx) {
|
|
senc = rb_enc_from_index(sidx);
|
|
}
|
|
else {
|
|
StringValue(*snamev_p);
|
|
}
|
|
|
|
denc = NULL;
|
|
didx = rb_to_encoding_index(*dnamev_p);
|
|
if (0 <= didx) {
|
|
denc = rb_enc_from_index(didx);
|
|
}
|
|
else {
|
|
StringValue(*dnamev_p);
|
|
}
|
|
|
|
sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
|
|
dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
|
|
|
|
*sname_p = sname;
|
|
*dname_p = dname;
|
|
*senc_p = senc;
|
|
*denc_p = denc;
|
|
*ecflags_p = ecflags;
|
|
*ecopts_p = ecopts;
|
|
}
|
|
|
|
static int
|
|
decorate_convpath(VALUE convpath, int ecflags)
|
|
{
|
|
int num_decorators;
|
|
const char *decorators[MAX_ECFLAGS_DECORATORS];
|
|
int i;
|
|
int n, len;
|
|
|
|
num_decorators = decorator_names(ecflags, decorators);
|
|
if (num_decorators == -1)
|
|
return -1;
|
|
|
|
len = n = RARRAY_LENINT(convpath);
|
|
if (n != 0) {
|
|
VALUE pair = RARRAY_AREF(convpath, n-1);
|
|
if (RB_TYPE_P(pair, T_ARRAY)) {
|
|
const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
|
|
const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
|
|
transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
|
|
const rb_transcoder *tr = load_transcoder_entry(entry);
|
|
if (!tr)
|
|
return -1;
|
|
if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
|
|
tr->asciicompat_type == asciicompat_encoder) {
|
|
n--;
|
|
rb_ary_store(convpath, len + num_decorators - 1, pair);
|
|
}
|
|
}
|
|
else {
|
|
rb_ary_store(convpath, len + num_decorators - 1, pair);
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < num_decorators; i++)
|
|
rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
VALUE *ary_p = arg;
|
|
VALUE v;
|
|
|
|
if (*ary_p == Qnil) {
|
|
*ary_p = rb_ary_new();
|
|
}
|
|
|
|
if (DECORATOR_P(sname, dname)) {
|
|
v = rb_str_new_cstr(dname);
|
|
}
|
|
else {
|
|
v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
|
|
}
|
|
rb_ary_store(*ary_p, depth, v);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
|
|
* Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
|
|
*
|
|
* Returns a conversion path.
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
|
|
* or
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
|
|
* # "universal_newline"]
|
|
*
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
|
|
* or
|
|
* p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # "universal_newline",
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
|
|
*/
|
|
static VALUE
|
|
econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
|
|
{
|
|
VALUE snamev, dnamev;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
int ecflags;
|
|
VALUE ecopts;
|
|
VALUE convpath;
|
|
|
|
econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
|
|
|
|
convpath = Qnil;
|
|
transcode_search_path(sname, dname, search_convpath_i, &convpath);
|
|
|
|
if (NIL_P(convpath)) {
|
|
VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (decorate_convpath(convpath, ecflags) == -1) {
|
|
VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
return convpath;
|
|
}
|
|
|
|
/*
|
|
* Check the existence of a conversion path.
|
|
* Returns whether a conversion path from from_encoding to to_encoding exists.
|
|
* result: nonzero:a path exists 0:no path found
|
|
*/
|
|
int
|
|
rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
|
|
{
|
|
VALUE convpath = Qnil;
|
|
transcode_search_path(from_encoding, to_encoding, search_convpath_i,
|
|
&convpath);
|
|
return RTEST(convpath);
|
|
}
|
|
|
|
struct rb_econv_init_by_convpath_t {
|
|
rb_econv_t *ec;
|
|
int index;
|
|
int ret;
|
|
};
|
|
|
|
static void
|
|
rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
|
|
{
|
|
struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
|
|
int ret;
|
|
|
|
if (a->ret == -1)
|
|
return;
|
|
|
|
ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
|
|
|
|
a->ret = ret;
|
|
return;
|
|
}
|
|
|
|
static rb_econv_t *
|
|
rb_econv_init_by_convpath(VALUE self, VALUE convpath,
|
|
const char **sname_p, const char **dname_p,
|
|
rb_encoding **senc_p, rb_encoding**denc_p)
|
|
{
|
|
rb_econv_t *ec;
|
|
long i;
|
|
int ret, first=1;
|
|
VALUE elt;
|
|
rb_encoding *senc = 0, *denc = 0;
|
|
const char *sname, *dname;
|
|
|
|
ec = rb_econv_alloc(RARRAY_LENINT(convpath));
|
|
DATA_PTR(self) = ec;
|
|
|
|
for (i = 0; i < RARRAY_LEN(convpath); i++) {
|
|
VALUE snamev, dnamev;
|
|
VALUE pair;
|
|
elt = rb_ary_entry(convpath, i);
|
|
if (!NIL_P(pair = rb_check_array_type(elt))) {
|
|
if (RARRAY_LEN(pair) != 2)
|
|
rb_raise(rb_eArgError, "not a 2-element array in convpath");
|
|
snamev = rb_ary_entry(pair, 0);
|
|
enc_arg(&snamev, &sname, &senc);
|
|
dnamev = rb_ary_entry(pair, 1);
|
|
enc_arg(&dnamev, &dname, &denc);
|
|
}
|
|
else {
|
|
sname = "";
|
|
dname = StringValueCStr(elt);
|
|
}
|
|
if (DECORATOR_P(sname, dname)) {
|
|
ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
|
|
if (ret == -1) {
|
|
VALUE msg = rb_sprintf("decoration failed: %s", dname);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
|
|
}
|
|
}
|
|
else {
|
|
int j = ec->num_trans;
|
|
struct rb_econv_init_by_convpath_t arg;
|
|
arg.ec = ec;
|
|
arg.index = ec->num_trans;
|
|
arg.ret = 0;
|
|
ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
|
|
if (ret == -1 || arg.ret == -1) {
|
|
VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
|
|
}
|
|
if (first) {
|
|
first = 0;
|
|
*senc_p = senc;
|
|
*sname_p = ec->elems[j].tc->transcoder->src_encoding;
|
|
}
|
|
*denc_p = denc;
|
|
*dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
|
|
}
|
|
}
|
|
|
|
if (first) {
|
|
*senc_p = NULL;
|
|
*denc_p = NULL;
|
|
*sname_p = "";
|
|
*dname_p = "";
|
|
}
|
|
|
|
ec->source_encoding_name = *sname_p;
|
|
ec->destination_encoding_name = *dname_p;
|
|
|
|
return ec;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* Encoding::Converter.new(source_encoding, destination_encoding)
|
|
* Encoding::Converter.new(source_encoding, destination_encoding, opt)
|
|
* Encoding::Converter.new(convpath)
|
|
*
|
|
* possible options elements:
|
|
* hash form:
|
|
* :invalid => nil # raise error on invalid byte sequence (default)
|
|
* :invalid => :replace # replace invalid byte sequence
|
|
* :undef => nil # raise error on undefined conversion (default)
|
|
* :undef => :replace # replace undefined conversion
|
|
* :replace => string # replacement string ("?" or "\uFFFD" if not specified)
|
|
* :newline => :universal # decorator for converting CRLF and CR to LF
|
|
* :newline => :crlf # decorator for converting LF to CRLF
|
|
* :newline => :cr # decorator for converting LF to CR
|
|
* :universal_newline => true # decorator for converting CRLF and CR to LF
|
|
* :crlf_newline => true # decorator for converting LF to CRLF
|
|
* :cr_newline => true # decorator for converting LF to CR
|
|
* :xml => :text # escape as XML CharData.
|
|
* :xml => :attr # escape as XML AttValue
|
|
* integer form:
|
|
* Encoding::Converter::INVALID_REPLACE
|
|
* Encoding::Converter::UNDEF_REPLACE
|
|
* Encoding::Converter::UNDEF_HEX_CHARREF
|
|
* Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
|
|
* Encoding::Converter::CRLF_NEWLINE_DECORATOR
|
|
* Encoding::Converter::CR_NEWLINE_DECORATOR
|
|
* Encoding::Converter::XML_TEXT_DECORATOR
|
|
* Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
|
|
* Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
|
|
*
|
|
* Encoding::Converter.new creates an instance of Encoding::Converter.
|
|
*
|
|
* Source_encoding and destination_encoding should be a string or
|
|
* Encoding object.
|
|
*
|
|
* opt should be nil, a hash or an integer.
|
|
*
|
|
* convpath should be an array.
|
|
* convpath may contain
|
|
* - two-element arrays which contain encodings or encoding names, or
|
|
* - strings representing decorator names.
|
|
*
|
|
* Encoding::Converter.new optionally takes an option.
|
|
* The option should be a hash or an integer.
|
|
* The option hash can contain :invalid => nil, etc.
|
|
* The option integer should be logical-or of constants such as
|
|
* Encoding::Converter::INVALID_REPLACE, etc.
|
|
*
|
|
* [:invalid => nil]
|
|
* Raise error on invalid byte sequence. This is a default behavior.
|
|
* [:invalid => :replace]
|
|
* Replace invalid byte sequence by replacement string.
|
|
* [:undef => nil]
|
|
* Raise an error if a character in source_encoding is not defined in destination_encoding.
|
|
* This is a default behavior.
|
|
* [:undef => :replace]
|
|
* Replace undefined character in destination_encoding with replacement string.
|
|
* [:replace => string]
|
|
* Specify the replacement string.
|
|
* If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
|
|
* [:universal_newline => true]
|
|
* Convert CRLF and CR to LF.
|
|
* [:crlf_newline => true]
|
|
* Convert LF to CRLF.
|
|
* [:cr_newline => true]
|
|
* Convert LF to CR.
|
|
* [:xml => :text]
|
|
* Escape as XML CharData.
|
|
* This form can be used as an HTML 4.0 #PCDATA.
|
|
* - '&' -> '&amp;'
|
|
* - '<' -> '&lt;'
|
|
* - '>' -> '&gt;'
|
|
* - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
|
|
* [:xml => :attr]
|
|
* Escape as XML AttValue.
|
|
* The converted result is quoted as "...".
|
|
* This form can be used as an HTML 4.0 attribute value.
|
|
* - '&' -> '&amp;'
|
|
* - '<' -> '&lt;'
|
|
* - '>' -> '&gt;'
|
|
* - '"' -> '&quot;'
|
|
* - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
|
|
*
|
|
* Examples:
|
|
* # UTF-16BE to UTF-8
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
|
|
*
|
|
* # Usually, decorators such as newline conversion are inserted last.
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
|
|
* p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
|
|
* # "universal_newline"]
|
|
*
|
|
* # But, if the last encoding is ASCII incompatible,
|
|
* # decorators are inserted before the last conversion.
|
|
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
|
|
* p ec.convpath #=> ["crlf_newline",
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
|
|
*
|
|
* # Conversion path can be specified directly.
|
|
* ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
|
|
* p ec.convpath #=> ["universal_newline",
|
|
* # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
|
|
*/
|
|
static VALUE
|
|
econv_init(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
VALUE ecopts;
|
|
VALUE snamev, dnamev;
|
|
const char *sname, *dname;
|
|
rb_encoding *senc, *denc;
|
|
rb_econv_t *ec;
|
|
int ecflags;
|
|
VALUE convpath;
|
|
|
|
if (rb_check_typeddata(self, &econv_data_type)) {
|
|
rb_raise(rb_eTypeError, "already initialized");
|
|
}
|
|
|
|
if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
|
|
ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
|
|
ecflags = 0;
|
|
ecopts = Qnil;
|
|
}
|
|
else {
|
|
econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
|
|
ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
|
|
}
|
|
|
|
if (!ec) {
|
|
VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (!DECORATOR_P(sname, dname)) {
|
|
if (!senc)
|
|
senc = make_dummy_encoding(sname);
|
|
if (!denc)
|
|
denc = make_dummy_encoding(dname);
|
|
RB_GC_GUARD(snamev);
|
|
RB_GC_GUARD(dnamev);
|
|
}
|
|
|
|
ec->source_encoding = senc;
|
|
ec->destination_encoding = denc;
|
|
|
|
DATA_PTR(self) = ec;
|
|
|
|
return self;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.inspect -> string
|
|
*
|
|
* Returns a printable version of <i>ec</i>
|
|
*
|
|
* ec = Encoding::Converter.new("iso-8859-1", "utf-8")
|
|
* puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_inspect(VALUE self)
|
|
{
|
|
const char *cname = rb_obj_classname(self);
|
|
rb_econv_t *ec;
|
|
|
|
TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
|
|
if (!ec)
|
|
return rb_sprintf("#<%s: uninitialized>", cname);
|
|
else {
|
|
const char *sname = ec->source_encoding_name;
|
|
const char *dname = ec->destination_encoding_name;
|
|
VALUE str;
|
|
str = rb_sprintf("#<%s: ", cname);
|
|
econv_description(sname, dname, ec->flags, str);
|
|
rb_str_cat2(str, ">");
|
|
return str;
|
|
}
|
|
}
|
|
|
|
static rb_econv_t *
|
|
check_econv(VALUE self)
|
|
{
|
|
rb_econv_t *ec;
|
|
|
|
TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
|
|
if (!ec) {
|
|
rb_raise(rb_eTypeError, "uninitialized encoding converter");
|
|
}
|
|
return ec;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.source_encoding -> encoding
|
|
*
|
|
* Returns the source encoding as an Encoding object.
|
|
*/
|
|
static VALUE
|
|
econv_source_encoding(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
if (!ec->source_encoding)
|
|
return Qnil;
|
|
return rb_enc_from_encoding(ec->source_encoding);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.destination_encoding -> encoding
|
|
*
|
|
* Returns the destination encoding as an Encoding object.
|
|
*/
|
|
static VALUE
|
|
econv_destination_encoding(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
if (!ec->destination_encoding)
|
|
return Qnil;
|
|
return rb_enc_from_encoding(ec->destination_encoding);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.convpath -> ary
|
|
*
|
|
* Returns the conversion path of ec.
|
|
*
|
|
* The result is an array of conversions.
|
|
*
|
|
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
|
|
* p ec.convpath
|
|
* #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
|
|
* # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
|
|
* # "crlf_newline"]
|
|
*
|
|
* Each element of the array is a pair of encodings or a string.
|
|
* A pair means an encoding conversion.
|
|
* A string means a decorator.
|
|
*
|
|
* In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
|
|
* a converter from ISO-8859-1 to UTF-8.
|
|
* "crlf_newline" means newline converter from LF to CRLF.
|
|
*/
|
|
static VALUE
|
|
econv_convpath(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE result;
|
|
int i;
|
|
|
|
result = rb_ary_new();
|
|
for (i = 0; i < ec->num_trans; i++) {
|
|
const rb_transcoder *tr = ec->elems[i].tc->transcoder;
|
|
VALUE v;
|
|
if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
|
|
v = rb_str_new_cstr(tr->dst_encoding);
|
|
else
|
|
v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
|
|
rb_ary_push(result, v);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec == other -> true or false
|
|
*/
|
|
static VALUE
|
|
econv_equal(VALUE self, VALUE other)
|
|
{
|
|
rb_econv_t *ec1 = check_econv(self);
|
|
rb_econv_t *ec2;
|
|
int i;
|
|
|
|
if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
|
|
return Qnil;
|
|
}
|
|
ec2 = DATA_PTR(other);
|
|
if (!ec2) return Qfalse;
|
|
if (ec1->source_encoding_name != ec2->source_encoding_name &&
|
|
strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
|
|
return Qfalse;
|
|
if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
|
|
strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
|
|
return Qfalse;
|
|
if (ec1->flags != ec2->flags) return Qfalse;
|
|
if (ec1->replacement_enc != ec2->replacement_enc &&
|
|
strcmp(ec1->replacement_enc, ec2->replacement_enc))
|
|
return Qfalse;
|
|
if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
|
|
if (ec1->replacement_str != ec2->replacement_str &&
|
|
memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
|
|
return Qfalse;
|
|
|
|
if (ec1->num_trans != ec2->num_trans) return Qfalse;
|
|
for (i = 0; i < ec1->num_trans; i++) {
|
|
if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
|
|
return Qfalse;
|
|
}
|
|
return Qtrue;
|
|
}
|
|
|
|
static VALUE
|
|
econv_result_to_symbol(rb_econv_result_t res)
|
|
{
|
|
switch (res) {
|
|
case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
|
|
case econv_incomplete_input: return sym_incomplete_input;
|
|
case econv_undefined_conversion: return sym_undefined_conversion;
|
|
case econv_destination_buffer_full: return sym_destination_buffer_full;
|
|
case econv_source_buffer_empty: return sym_source_buffer_empty;
|
|
case econv_finished: return sym_finished;
|
|
case econv_after_output: return sym_after_output;
|
|
default: return INT2NUM(res); /* should not be reached */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.primitive_convert(source_buffer, destination_buffer) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
|
|
* ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
|
|
*
|
|
* possible opt elements:
|
|
* hash form:
|
|
* :partial_input => true # source buffer may be part of larger source
|
|
* :after_output => true # stop conversion after output before input
|
|
* integer form:
|
|
* Encoding::Converter::PARTIAL_INPUT
|
|
* Encoding::Converter::AFTER_OUTPUT
|
|
*
|
|
* possible results:
|
|
* :invalid_byte_sequence
|
|
* :incomplete_input
|
|
* :undefined_conversion
|
|
* :after_output
|
|
* :destination_buffer_full
|
|
* :source_buffer_empty
|
|
* :finished
|
|
*
|
|
* primitive_convert converts source_buffer into destination_buffer.
|
|
*
|
|
* source_buffer should be a string or nil.
|
|
* nil means an empty string.
|
|
*
|
|
* destination_buffer should be a string.
|
|
*
|
|
* destination_byteoffset should be an integer or nil.
|
|
* nil means the end of destination_buffer.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* destination_bytesize should be an integer or nil.
|
|
* nil means unlimited.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* opt should be nil, a hash or an integer.
|
|
* nil means no flags.
|
|
* If it is omitted, nil is assumed.
|
|
*
|
|
* primitive_convert converts the content of source_buffer from beginning
|
|
* and store the result into destination_buffer.
|
|
*
|
|
* destination_byteoffset and destination_bytesize specify the region which
|
|
* the converted result is stored.
|
|
* destination_byteoffset specifies the start position in destination_buffer in bytes.
|
|
* If destination_byteoffset is nil,
|
|
* destination_buffer.bytesize is used for appending the result.
|
|
* destination_bytesize specifies maximum number of bytes.
|
|
* If destination_bytesize is nil,
|
|
* destination size is unlimited.
|
|
* After conversion, destination_buffer is resized to
|
|
* destination_byteoffset + actually produced number of bytes.
|
|
* Also destination_buffer's encoding is set to destination_encoding.
|
|
*
|
|
* primitive_convert drops the converted part of source_buffer.
|
|
* the dropped part is converted in destination_buffer or
|
|
* buffered in Encoding::Converter object.
|
|
*
|
|
* primitive_convert stops conversion when one of the following conditions is met.
|
|
* - invalid byte sequence found in source buffer (:invalid_byte_sequence)
|
|
* +primitive_errinfo+ and +last_error+ methods return the details of the error.
|
|
* - unexpected end of source buffer (:incomplete_input)
|
|
* this occurs only when :partial_input is not specified.
|
|
* +primitive_errinfo+ and +last_error+ methods return the details of the error.
|
|
* - character not representable in output encoding (:undefined_conversion)
|
|
* +primitive_errinfo+ and +last_error+ methods return the details of the error.
|
|
* - after some output is generated, before input is done (:after_output)
|
|
* this occurs only when :after_output is specified.
|
|
* - destination buffer is full (:destination_buffer_full)
|
|
* this occurs only when destination_bytesize is non-nil.
|
|
* - source buffer is empty (:source_buffer_empty)
|
|
* this occurs only when :partial_input is specified.
|
|
* - conversion is finished (:finished)
|
|
*
|
|
* example:
|
|
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
|
|
* ret = ec.primitive_convert(src="pi", dst="", nil, 100)
|
|
* p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
|
|
*
|
|
* ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
|
|
* ret = ec.primitive_convert(src="pi", dst="", nil, 1)
|
|
* p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
|
|
* ret = ec.primitive_convert(src, dst="", nil, 1)
|
|
* p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
|
|
* ret = ec.primitive_convert(src, dst="", nil, 1)
|
|
* p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
|
|
* ret = ec.primitive_convert(src, dst="", nil, 1)
|
|
* p [ret, src, dst] #=> [:finished, "", "i"]
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
|
|
rb_econv_t *ec = check_econv(self);
|
|
rb_econv_result_t res;
|
|
const unsigned char *ip, *is;
|
|
unsigned char *op, *os;
|
|
long output_byteoffset, output_bytesize;
|
|
unsigned long output_byteend;
|
|
int flags;
|
|
|
|
argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
|
|
|
|
if (NIL_P(output_byteoffset_v))
|
|
output_byteoffset = 0; /* dummy */
|
|
else
|
|
output_byteoffset = NUM2LONG(output_byteoffset_v);
|
|
|
|
if (NIL_P(output_bytesize_v))
|
|
output_bytesize = 0; /* dummy */
|
|
else
|
|
output_bytesize = NUM2LONG(output_bytesize_v);
|
|
|
|
if (!NIL_P(flags_v)) {
|
|
if (!NIL_P(opt)) {
|
|
rb_error_arity(argc + 1, 2, 5);
|
|
}
|
|
flags = NUM2INT(rb_to_int(flags_v));
|
|
}
|
|
else if (!NIL_P(opt)) {
|
|
VALUE v;
|
|
flags = 0;
|
|
v = rb_hash_aref(opt, sym_partial_input);
|
|
if (RTEST(v))
|
|
flags |= ECONV_PARTIAL_INPUT;
|
|
v = rb_hash_aref(opt, sym_after_output);
|
|
if (RTEST(v))
|
|
flags |= ECONV_AFTER_OUTPUT;
|
|
}
|
|
else {
|
|
flags = 0;
|
|
}
|
|
|
|
StringValue(output);
|
|
if (!NIL_P(input))
|
|
StringValue(input);
|
|
rb_str_modify(output);
|
|
|
|
if (NIL_P(output_bytesize_v)) {
|
|
output_bytesize = RSTRING_EMBED_LEN_MAX;
|
|
if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
|
|
output_bytesize = RSTRING_LEN(input);
|
|
}
|
|
|
|
retry:
|
|
|
|
if (NIL_P(output_byteoffset_v))
|
|
output_byteoffset = RSTRING_LEN(output);
|
|
|
|
if (output_byteoffset < 0)
|
|
rb_raise(rb_eArgError, "negative output_byteoffset");
|
|
|
|
if (RSTRING_LEN(output) < output_byteoffset)
|
|
rb_raise(rb_eArgError, "output_byteoffset too big");
|
|
|
|
if (output_bytesize < 0)
|
|
rb_raise(rb_eArgError, "negative output_bytesize");
|
|
|
|
output_byteend = (unsigned long)output_byteoffset +
|
|
(unsigned long)output_bytesize;
|
|
|
|
if (output_byteend < (unsigned long)output_byteoffset ||
|
|
LONG_MAX < output_byteend)
|
|
rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
|
|
|
|
if (rb_str_capacity(output) < output_byteend)
|
|
rb_str_resize(output, output_byteend);
|
|
|
|
if (NIL_P(input)) {
|
|
ip = is = NULL;
|
|
}
|
|
else {
|
|
ip = (const unsigned char *)RSTRING_PTR(input);
|
|
is = ip + RSTRING_LEN(input);
|
|
}
|
|
|
|
op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
|
|
os = op + output_bytesize;
|
|
|
|
res = rb_econv_convert(ec, &ip, is, &op, os, flags);
|
|
rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
|
|
if (!NIL_P(input)) {
|
|
rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
|
|
}
|
|
|
|
if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
|
|
if (LONG_MAX / 2 < output_bytesize)
|
|
rb_raise(rb_eArgError, "too long conversion result");
|
|
output_bytesize *= 2;
|
|
output_byteoffset_v = Qnil;
|
|
goto retry;
|
|
}
|
|
|
|
if (ec->destination_encoding) {
|
|
rb_enc_associate(output, ec->destination_encoding);
|
|
}
|
|
|
|
return econv_result_to_symbol(res);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.convert(source_string) -> destination_string
|
|
*
|
|
* Convert source_string and return destination_string.
|
|
*
|
|
* source_string is assumed as a part of source.
|
|
* i.e. :partial_input=>true is specified internally.
|
|
* finish method should be used last.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "euc-jp")
|
|
* puts ec.convert("\u3042").dump #=> "\xA4\xA2"
|
|
* puts ec.finish.dump #=> ""
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "utf-8")
|
|
* puts ec.convert("\xA4").dump #=> ""
|
|
* puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
|
|
* puts ec.finish.dump #=> ""
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
|
|
* puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
|
|
* puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
|
|
* puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
|
|
*
|
|
* If a conversion error occurs,
|
|
* Encoding::UndefinedConversionError or
|
|
* Encoding::InvalidByteSequenceError is raised.
|
|
* Encoding::Converter#convert doesn't supply methods to recover or restart
|
|
* from these exceptions.
|
|
* When you want to handle these conversion errors,
|
|
* use Encoding::Converter#primitive_convert.
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_convert(VALUE self, VALUE source_string)
|
|
{
|
|
VALUE ret, dst;
|
|
VALUE av[5];
|
|
int ac;
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
StringValue(source_string);
|
|
|
|
dst = rb_str_new(NULL, 0);
|
|
|
|
av[0] = rb_str_dup(source_string);
|
|
av[1] = dst;
|
|
av[2] = Qnil;
|
|
av[3] = Qnil;
|
|
av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
|
|
ac = 5;
|
|
|
|
ret = econv_primitive_convert(ac, av, self);
|
|
|
|
if (ret == sym_invalid_byte_sequence ||
|
|
ret == sym_undefined_conversion ||
|
|
ret == sym_incomplete_input) {
|
|
VALUE exc = make_econv_exception(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret == sym_finished) {
|
|
rb_raise(rb_eArgError, "converter already finished");
|
|
}
|
|
|
|
if (ret != sym_source_buffer_empty) {
|
|
rb_bug("unexpected result of econv_primitive_convert");
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.finish -> string
|
|
*
|
|
* Finishes the converter.
|
|
* It returns the last part of the converted string.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* p ec.convert("\u3042") #=> "\e$B$\""
|
|
* p ec.finish #=> "\e(B"
|
|
*/
|
|
static VALUE
|
|
econv_finish(VALUE self)
|
|
{
|
|
VALUE ret, dst;
|
|
VALUE av[5];
|
|
int ac;
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
dst = rb_str_new(NULL, 0);
|
|
|
|
av[0] = Qnil;
|
|
av[1] = dst;
|
|
av[2] = Qnil;
|
|
av[3] = Qnil;
|
|
av[4] = INT2FIX(0);
|
|
ac = 5;
|
|
|
|
ret = econv_primitive_convert(ac, av, self);
|
|
|
|
if (ret == sym_invalid_byte_sequence ||
|
|
ret == sym_undefined_conversion ||
|
|
ret == sym_incomplete_input) {
|
|
VALUE exc = make_econv_exception(ec);
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
if (ret != sym_finished) {
|
|
rb_bug("unexpected result of econv_primitive_convert");
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.primitive_errinfo -> array
|
|
*
|
|
* primitive_errinfo returns important information regarding the last error
|
|
* as a 5-element array:
|
|
*
|
|
* [result, enc1, enc2, error_bytes, readagain_bytes]
|
|
*
|
|
* result is the last result of primitive_convert.
|
|
*
|
|
* Other elements are only meaningful when result is
|
|
* :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
|
|
*
|
|
* enc1 and enc2 indicate a conversion step as a pair of strings.
|
|
* For example, a converter from EUC-JP to ISO-8859-1 converts
|
|
* a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
|
|
* So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
|
|
*
|
|
* error_bytes and readagain_bytes indicate the byte sequences which caused the error.
|
|
* error_bytes is discarded portion.
|
|
* readagain_bytes is buffered portion which is read again on next conversion.
|
|
*
|
|
* Example:
|
|
*
|
|
* # \xff is invalid as EUC-JP.
|
|
* ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
|
|
* ec.primitive_convert(src="\xff", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
|
|
*
|
|
* # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
|
|
* # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
|
|
* # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
|
|
*
|
|
* # partial character is invalid
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
|
|
*
|
|
* # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
|
|
* # partial characters.
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:source_buffer_empty, nil, nil, nil, nil]
|
|
*
|
|
* # \xd8\x00\x00@ is invalid as UTF-16BE because
|
|
* # no low surrogate after high surrogate (\xd8\x00).
|
|
* # It is detected by 3rd byte (\00) which is part of next character.
|
|
* # So the high surrogate (\xd8\x00) is discarded and
|
|
* # the 3rd byte is read again later.
|
|
* # Since the byte is buffered in ec, it is dropped from src.
|
|
* ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
|
|
* ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
|
|
* p src
|
|
* #=> "@"
|
|
*
|
|
* # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
|
|
* # The problem is detected by 4th byte.
|
|
* ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
|
|
* ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
|
|
* p ec.primitive_errinfo
|
|
* #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
|
|
* p src
|
|
* #=> ""
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_primitive_errinfo(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
VALUE ary;
|
|
|
|
ary = rb_ary_new2(5);
|
|
|
|
rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
|
|
rb_ary_store(ary, 4, Qnil);
|
|
|
|
if (ec->last_error.source_encoding)
|
|
rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
|
|
|
|
if (ec->last_error.destination_encoding)
|
|
rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
|
|
|
|
if (ec->last_error.error_bytes_start) {
|
|
rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
|
|
rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
|
|
}
|
|
|
|
return ary;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.insert_output(string) -> nil
|
|
*
|
|
* Inserts string into the encoding converter.
|
|
* The string will be converted to the destination encoding and
|
|
* output on later conversions.
|
|
*
|
|
* If the destination encoding is stateful,
|
|
* string is converted according to the state and the state is updated.
|
|
*
|
|
* This method should be used only when a conversion error occurs.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-8859-1")
|
|
* src = "HIRAGANA LETTER A is \u{3042}."
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :undefined_conversion
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
|
|
* ec.insert_output("<err>")
|
|
* p ec.primitive_convert(src, dst) #=> :finished
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
|
|
* src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :undefined_conversion
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
|
|
* ec.insert_output "?" # state change required to output "?".
|
|
* p ec.primitive_convert(src, dst) #=> :finished
|
|
* puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_insert_output(VALUE self, VALUE string)
|
|
{
|
|
const char *insert_enc;
|
|
|
|
int ret;
|
|
|
|
rb_econv_t *ec = check_econv(self);
|
|
|
|
StringValue(string);
|
|
insert_enc = rb_econv_encoding_to_insert_output(ec);
|
|
string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
|
|
|
|
ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
|
|
if (ret == -1) {
|
|
rb_raise(rb_eArgError, "too big string");
|
|
}
|
|
|
|
return Qnil;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.putback -> string
|
|
* ec.putback(max_numbytes) -> string
|
|
*
|
|
* Put back the bytes which will be converted.
|
|
*
|
|
* The bytes are caused by invalid_byte_sequence error.
|
|
* When invalid_byte_sequence error, some bytes are discarded and
|
|
* some bytes are buffered to be converted later.
|
|
* The latter bytes can be put back.
|
|
* It can be observed by
|
|
* Encoding::InvalidByteSequenceError#readagain_bytes and
|
|
* Encoding::Converter#primitive_errinfo.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
|
|
* src = "\x00\xd8\x61\x00"
|
|
* dst = ""
|
|
* p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
|
|
* p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
|
|
* p ec.putback #=> "a\x00"
|
|
* p ec.putback #=> "" # no more bytes to put back
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_putback(int argc, VALUE *argv, VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
int n;
|
|
int putbackable;
|
|
VALUE str, max;
|
|
|
|
if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
|
|
n = rb_econv_putbackable(ec);
|
|
}
|
|
else {
|
|
n = NUM2INT(max);
|
|
putbackable = rb_econv_putbackable(ec);
|
|
if (putbackable < n)
|
|
n = putbackable;
|
|
}
|
|
|
|
str = rb_str_new(NULL, n);
|
|
rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
|
|
|
|
if (ec->source_encoding) {
|
|
rb_enc_associate(str, ec->source_encoding);
|
|
}
|
|
|
|
return str;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.last_error -> exception or nil
|
|
*
|
|
* Returns an exception object for the last conversion.
|
|
* Returns nil if the last conversion did not produce an error.
|
|
*
|
|
* "error" means that
|
|
* Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
|
|
* Encoding::Converter#convert and
|
|
* :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
|
|
* Encoding::Converter#primitive_convert.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "iso-8859-1")
|
|
* p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
|
|
* p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
|
|
* p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
|
|
* p ec.last_error #=> nil
|
|
*
|
|
*/
|
|
static VALUE
|
|
econv_last_error(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE exc;
|
|
|
|
exc = make_econv_exception(ec);
|
|
if (NIL_P(exc))
|
|
return Qnil;
|
|
return exc;
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.replacement -> string
|
|
*
|
|
* Returns the replacement string.
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "us-ascii")
|
|
* p ec.replacement #=> "?"
|
|
*
|
|
* ec = Encoding::Converter.new("euc-jp", "utf-8")
|
|
* p ec.replacement #=> "\uFFFD"
|
|
*/
|
|
static VALUE
|
|
econv_get_replacement(VALUE self)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
int ret;
|
|
rb_encoding *enc;
|
|
|
|
ret = make_replacement(ec);
|
|
if (ret == -1) {
|
|
rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
|
|
}
|
|
|
|
enc = rb_enc_find(ec->replacement_enc);
|
|
return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ec.replacement = string
|
|
*
|
|
* Sets the replacement string.
|
|
*
|
|
* ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
|
|
* ec.replacement = "<undef>"
|
|
* p ec.convert("a \u3042 b") #=> "a <undef> b"
|
|
*/
|
|
static VALUE
|
|
econv_set_replacement(VALUE self, VALUE arg)
|
|
{
|
|
rb_econv_t *ec = check_econv(self);
|
|
VALUE string = arg;
|
|
int ret;
|
|
rb_encoding *enc;
|
|
|
|
StringValue(string);
|
|
enc = rb_enc_get(string);
|
|
|
|
ret = rb_econv_set_replacement(ec,
|
|
(const unsigned char *)RSTRING_PTR(string),
|
|
RSTRING_LEN(string),
|
|
rb_enc_name(enc));
|
|
|
|
if (ret == -1) {
|
|
/* xxx: rb_eInvalidByteSequenceError? */
|
|
rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
|
|
}
|
|
|
|
return arg;
|
|
}
|
|
|
|
VALUE
|
|
rb_econv_make_exception(rb_econv_t *ec)
|
|
{
|
|
return make_econv_exception(ec);
|
|
}
|
|
|
|
void
|
|
rb_econv_check_error(rb_econv_t *ec)
|
|
{
|
|
VALUE exc;
|
|
|
|
exc = make_econv_exception(ec);
|
|
if (NIL_P(exc))
|
|
return;
|
|
rb_exc_raise(exc);
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.source_encoding_name -> string
|
|
*
|
|
* Returns the source encoding name as a string.
|
|
*/
|
|
static VALUE
|
|
ecerr_source_encoding_name(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("source_encoding_name"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.source_encoding -> encoding
|
|
*
|
|
* Returns the source encoding as an encoding object.
|
|
*
|
|
* Note that the result may not be equal to the source encoding of
|
|
* the encoding converter if the conversion has multiple steps.
|
|
*
|
|
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
|
|
* begin
|
|
* ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
|
|
* rescue Encoding::UndefinedConversionError
|
|
* p $!.source_encoding #=> #<Encoding:UTF-8>
|
|
* p $!.destination_encoding #=> #<Encoding:EUC-JP>
|
|
* p $!.source_encoding_name #=> "UTF-8"
|
|
* p $!.destination_encoding_name #=> "EUC-JP"
|
|
* end
|
|
*
|
|
*/
|
|
static VALUE
|
|
ecerr_source_encoding(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("source_encoding"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.destination_encoding_name -> string
|
|
*
|
|
* Returns the destination encoding name as a string.
|
|
*/
|
|
static VALUE
|
|
ecerr_destination_encoding_name(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("destination_encoding_name"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.destination_encoding -> encoding
|
|
*
|
|
* Returns the destination encoding as an encoding object.
|
|
*/
|
|
static VALUE
|
|
ecerr_destination_encoding(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("destination_encoding"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.error_char -> string
|
|
*
|
|
* Returns the one-character string which caused Encoding::UndefinedConversionError.
|
|
*
|
|
* ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
|
|
* begin
|
|
* ec.convert("\xa0")
|
|
* rescue Encoding::UndefinedConversionError
|
|
* puts $!.error_char.dump #=> "\xC2\xA0"
|
|
* p $!.error_char.encoding #=> #<Encoding:UTF-8>
|
|
* end
|
|
*
|
|
*/
|
|
static VALUE
|
|
ecerr_error_char(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("error_char"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.error_bytes -> string
|
|
*
|
|
* Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
|
|
*
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
* begin
|
|
* ec.convert("abc\xA1\xFFdef")
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
|
|
* puts $!.error_bytes.dump #=> "\xA1"
|
|
* puts $!.readagain_bytes.dump #=> "\xFF"
|
|
* end
|
|
*/
|
|
static VALUE
|
|
ecerr_error_bytes(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("error_bytes"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.readagain_bytes -> string
|
|
*
|
|
* Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
|
|
*/
|
|
static VALUE
|
|
ecerr_readagain_bytes(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("readagain_bytes"));
|
|
}
|
|
|
|
/*
|
|
* call-seq:
|
|
* ecerr.incomplete_input? -> true or false
|
|
*
|
|
* Returns true if the invalid byte sequence error is caused by
|
|
* premature end of string.
|
|
*
|
|
* ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
|
|
*
|
|
* begin
|
|
* ec.convert("abc\xA1z")
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
|
|
* p $!.incomplete_input? #=> false
|
|
* end
|
|
*
|
|
* begin
|
|
* ec.convert("abc\xA1")
|
|
* ec.finish
|
|
* rescue Encoding::InvalidByteSequenceError
|
|
* p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
|
|
* p $!.incomplete_input? #=> true
|
|
* end
|
|
*/
|
|
static VALUE
|
|
ecerr_incomplete_input(VALUE self)
|
|
{
|
|
return rb_attr_get(self, rb_intern("incomplete_input"));
|
|
}
|
|
|
|
/*
|
|
* Document-class: Encoding::UndefinedConversionError
|
|
*
|
|
* Raised by Encoding and String methods when a transcoding operation
|
|
* fails.
|
|
*/
|
|
|
|
/*
|
|
* Document-class: Encoding::InvalidByteSequenceError
|
|
*
|
|
* Raised by Encoding and String methods when the string being
|
|
* transcoded contains a byte invalid for either the source or
|
|
* target encoding.
|
|
*/
|
|
|
|
/*
|
|
* Document-class: Encoding::ConverterNotFoundError
|
|
*
|
|
* Raised by transcoding methods when a named encoding does not
|
|
* correspond with a known converter.
|
|
*/
|
|
|
|
#undef rb_intern
|
|
void
|
|
Init_transcode(void)
|
|
{
|
|
transcoder_table = st_init_strcasetable();
|
|
|
|
sym_invalid = ID2SYM(rb_intern("invalid"));
|
|
sym_undef = ID2SYM(rb_intern("undef"));
|
|
sym_replace = ID2SYM(rb_intern("replace"));
|
|
sym_fallback = ID2SYM(rb_intern("fallback"));
|
|
sym_xml = ID2SYM(rb_intern("xml"));
|
|
sym_text = ID2SYM(rb_intern("text"));
|
|
sym_attr = ID2SYM(rb_intern("attr"));
|
|
|
|
sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
|
|
sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
|
|
sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
|
|
sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
|
|
sym_finished = ID2SYM(rb_intern("finished"));
|
|
sym_after_output = ID2SYM(rb_intern("after_output"));
|
|
sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
|
|
sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
|
|
sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
|
|
sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
|
|
sym_partial_input = ID2SYM(rb_intern("partial_input"));
|
|
|
|
#ifdef ENABLE_ECONV_NEWLINE_OPTION
|
|
sym_newline = ID2SYM(rb_intern("newline"));
|
|
sym_universal = ID2SYM(rb_intern("universal"));
|
|
sym_crlf = ID2SYM(rb_intern("crlf"));
|
|
sym_cr = ID2SYM(rb_intern("cr"));
|
|
sym_lf = ID2SYM(rb_intern("lf"));
|
|
#endif
|
|
|
|
InitVM(transcode);
|
|
}
|
|
|
|
/*
 * Registers the Ruby-level transcoding API:
 *
 * - the UndefinedConversionError, InvalidByteSequenceError and
 *   ConverterNotFoundError classes, each defined under rb_cEncoding with
 *   rb_eEncodingError as superclass;
 * - String#encode and String#encode!;
 * - the Converter class under rb_cEncoding, with its allocator, singleton
 *   methods, instance methods and flag constants;
 * - the accessor methods of the two conversion error classes.
 *
 * Init_newline() is called as the last step.
 *
 * Note for maintainers: the literal rb_define_* calls and the
 * "Document-const" comments below are kept in this exact shape on purpose;
 * documentation tooling that scans the C source relies on it.
 */
void
|
|
InitVM_transcode(void)
|
|
{
|
|
rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
|
|
rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
|
|
rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
|
|
|
|
/* String#encode and String#encode!, both registered with arity -1. */
rb_define_method(rb_cString, "encode", str_encode, -1);
|
|
rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
|
|
|
|
rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
|
|
/* Allocator, singleton methods and instance methods of the Converter class. */
rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
|
|
rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
|
|
rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
|
|
rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
|
|
rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
|
|
rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
|
|
rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
|
|
rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
|
|
rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
|
|
rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
|
|
rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
|
|
rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
|
|
rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
|
|
rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
|
|
rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
|
|
rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
|
|
rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
|
|
rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
|
|
|
|
/* Document-const: INVALID_MASK
|
|
*
|
|
* Mask for invalid byte sequences
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
|
|
|
|
/* Document-const: INVALID_REPLACE
|
|
*
|
|
* Replace invalid byte sequences
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
|
|
|
|
/* Document-const: UNDEF_MASK
|
|
*
|
|
* Mask for a valid character in the source encoding but no related
|
|
* character(s) in destination encoding.
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
|
|
|
|
/* Document-const: UNDEF_REPLACE
|
|
*
|
|
* Replace byte sequences that are undefined in the destination encoding.
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
|
|
|
|
/* Document-const: UNDEF_HEX_CHARREF
|
|
*
|
|
* Replace byte sequences that are undefined in the destination encoding
|
|
* with an XML hexadecimal character reference. This is valid for XML
|
|
* conversion.
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
|
|
|
|
/* Document-const: PARTIAL_INPUT
|
|
*
|
|
* Indicates the source may be part of a larger string. See
|
|
* primitive_convert for an example.
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
|
|
|
|
/* Document-const: AFTER_OUTPUT
|
|
*
|
|
* Stop converting after some output is complete but before all of the
|
|
* input was consumed. See primitive_convert for an example.
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
|
|
|
|
/* Document-const: UNIVERSAL_NEWLINE_DECORATOR
|
|
*
|
|
* Decorator for converting CRLF and CR to LF
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
|
|
|
|
/* Document-const: CRLF_NEWLINE_DECORATOR
|
|
*
|
|
* Decorator for converting LF to CRLF
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
|
|
|
|
/* Document-const: CR_NEWLINE_DECORATOR
|
|
*
|
|
* Decorator for converting LF to CR
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
|
|
|
|
/* Document-const: XML_TEXT_DECORATOR
|
|
*
|
|
* Escape as XML CharData
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
|
|
|
|
/* Document-const: XML_ATTR_CONTENT_DECORATOR
|
|
*
|
|
* Escape as XML AttValue
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
|
|
|
|
/* Document-const: XML_ATTR_QUOTE_DECORATOR
|
|
*
|
|
* Escape as XML AttValue
|
|
*/
|
|
rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
|
|
|
|
/* Accessor methods of the UndefinedConversionError class. */
rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
|
|
rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
|
|
rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
|
|
rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
|
|
rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
|
|
|
|
/*
 * Accessor methods of the InvalidByteSequenceError class. The four
 * encoding accessors are bound to the same C functions as the ones
 * registered on UndefinedConversionError above.
 */
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
|
|
rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
|
|
|
|
/* NOTE(review): Init_newline is not defined in this section; presumably it
 * sets up the newline converters -- confirm against its definition. */
Init_newline();
|
|
}
|