From 020e681eece7359f1cfcda9871b600084507dbb1 Mon Sep 17 00:00:00 2001 From: akr Date: Sun, 7 Sep 2008 03:13:29 +0000 Subject: [PATCH] * include/ruby/encoding.h (ECONV_XML_ATTR_CONTENT_ENCODER): defined. (ECONV_STATEFUL_ENCODER_MASK): defined. (ECONV_XML_ATTR_QUOTE_ENCODER): defined. (ECONV_XML_ATTR_ENCODER): removed. * enc/trans/escape.trans (rb_escape_xml_attr_content): defined. (rb_escape_xml_attr_quote): defined. (rb_escape_xml_attr): removed. * io.c (NEED_WRITECONV): writeconv is required if supplemental converter is used. (make_writeconv): apply stateful encoder in writeconv. * transcode.c: follow the constant change. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19209 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 17 +++++++++++ enc/trans/escape.trans | 59 ++++++++++++++++++++++--------------- include/ruby/encoding.h | 5 +++- io.c | 62 ++++++++++++++++++++++----------------- test/ruby/test_econv.rb | 36 ++++++++++++++++++----- test/ruby/test_io_m17n.rb | 13 +++++++- transcode.c | 28 ++++++++++++------ 7 files changed, 150 insertions(+), 70 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7ec025f570..24396be89f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +Sun Sep 7 12:09:29 2008 Tanaka Akira + + * include/ruby/encoding.h (ECONV_XML_ATTR_CONTENT_ENCODER): defined. + (ECONV_STATEFUL_ENCODER_MASK): defined. + (ECONV_XML_ATTR_QUOTE_ENCODER): defined. + (ECONV_XML_ATTR_ENCODER): removed. + + * enc/trans/escape.trans (rb_escape_xml_attr_content): defined. + (rb_escape_xml_attr_quote): defined. + (rb_escape_xml_attr): removed. + + * io.c (NEED_WRITECONV): writeconv is required if supplemental + converter is used. + (make_writeconv): apply stateful encoder in writeconv. + + * transcode.c: follow the constant change. + Sun Sep 7 07:24:09 2008 Yukihiro Matsumoto * misc/*.el: merged the following patches from Nathan Weizenbaum diff --git a/enc/trans/escape.trans b/enc/trans/escape.trans index 0641c6e251..a64114f533 100644 --- a/enc/trans/escape.trans +++ b/enc/trans/escape.trans @@ -52,9 +52,18 @@ fun_so_escape_xml_chref(void *statep, const unsigned char *s, size_t l, unsigned map_xml_text["3E"] = :func_so transcode_generate_node(ActionMap.parse(map_xml_text), "escape_xml_text") - map_xml_attr = {} - map_xml_attr["{00-FF}"] = :func_so - transcode_generate_node(ActionMap.parse(map_xml_attr), "escape_xml_attr") + map_xml_attr_content = {} + map_xml_attr_content["{00-21,23-25,27-3B,3D,3F-FF}"] = :nomap + map_xml_attr_content["22"] = :func_so + map_xml_attr_content["26"] = :func_so + map_xml_attr_content["3C"] = :func_so + map_xml_attr_content["3E"] = :func_so + transcode_generate_node(ActionMap.parse(map_xml_attr_content), "escape_xml_attr_content") + + map_xml_attr_quote = {} + map_xml_attr_quote["{00-FF}"] = :func_so + transcode_generate_node(ActionMap.parse(map_xml_attr_quote), "escape_xml_attr_quote") + %> <%= transcode_generated_code %> @@ -83,11 +92,23 @@ rb_escape_xml_text = { NULL, NULL, NULL, &fun_so_escape_xml_chref }; +static const rb_transcoder +rb_escape_xml_attr_content = { + "", "xml-attr-content-escaped", escape_xml_attr_content, + TRANSCODE_TABLE_INFO, + 1, /* input_unit_length */ + 1, /* max_input */ + 6, /* max_output */ + stateless_converter, /* stateful_type */ + 0, NULL, NULL, + NULL, NULL, NULL, &fun_so_escape_xml_chref +}; + #define END 0 #define NORMAL 1 static int -escape_xml_attr_init(void *statep) +escape_xml_attr_quote_init(void *statep) { unsigned char *sp = statep; *sp = END; @@ -95,7 +116,7 @@ escape_xml_attr_init(void *statep) } static int -fun_so_escape_xml_attr(void *statep, const unsigned char *s, size_t l, unsigned char *o) +fun_so_escape_xml_attr_quote(void *statep, const unsigned char *s, size_t l, unsigned char *o) { unsigned char *sp = statep; int n = 0; @@ -103,23 +124,12 @@ fun_so_escape_xml_attr(void *statep, const unsigned char *s, size_t l, unsigned *sp = NORMAL; o[n++] = '"'; } - switch (s[0]) { - case '&': - case '<': - case '>': - case '"': - n += fun_so_escape_xml_chref(statep, s, l, o+n); - break; - - default: - o[n++] = s[0]; - break; - } + o[n++] = s[0]; return n; } static int -escape_xml_attr_finish(void *statep, unsigned char *o) +escape_xml_attr_quote_finish(void *statep, unsigned char *o) { unsigned char *sp = statep; int n = 0; @@ -135,16 +145,16 @@ escape_xml_attr_finish(void *statep, unsigned char *o) } static const rb_transcoder -rb_escape_xml_attr = { - "", "xml-attr-escaped", escape_xml_attr, +rb_escape_xml_attr_quote = { + "", "xml-attr-quoted", escape_xml_attr_quote, TRANSCODE_TABLE_INFO, 1, /* input_unit_length */ 1, /* max_input */ 7, /* max_output */ stateful_encoder, /* stateful_type */ - 1, escape_xml_attr_init, escape_xml_attr_init, - NULL, NULL, NULL, fun_so_escape_xml_attr, - escape_xml_attr_finish + 1, escape_xml_attr_quote_init, escape_xml_attr_quote_init, + NULL, NULL, NULL, fun_so_escape_xml_attr_quote, + escape_xml_attr_quote_finish }; void @@ -152,6 +162,7 @@ Init_escape(void) { rb_register_transcoder(&rb_escape_amp_as_chref); rb_register_transcoder(&rb_escape_xml_text); - rb_register_transcoder(&rb_escape_xml_attr); + rb_register_transcoder(&rb_escape_xml_attr_content); + rb_register_transcoder(&rb_escape_xml_attr_quote); } diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index d78ef11dc1..eac7326a6d 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -269,7 +269,10 @@ void rb_econv_binmode(rb_econv_t *ec); #define ECONV_CRLF_NEWLINE_ENCODER 0x00001000 #define ECONV_CR_NEWLINE_ENCODER 0x00002000 #define ECONV_XML_TEXT_ENCODER 0x00004000 -#define ECONV_XML_ATTR_ENCODER 0x00008000 +#define ECONV_XML_ATTR_CONTENT_ENCODER 0x00008000 + +#define ECONV_STATEFUL_ENCODER_MASK 0x00f00000 +#define ECONV_XML_ATTR_QUOTE_ENCODER 0x00100000 /* end of flags for rb_econv_open */ diff --git a/io.c b/io.c index ac7c1f16b5..5c43a062de 100644 --- a/io.c +++ b/io.c @@ -682,7 +682,7 @@ rb_io_wait_writable(int f) # define NEED_NEWLINE_ENCODER(fptr) 0 #endif #define NEED_READCONV(fptr) (fptr->encs.enc2 != NULL || NEED_NEWLINE_DECODER(fptr)) -#define NEED_WRITECONV(fptr) (fptr->encs.enc != NULL || NEED_NEWLINE_ENCODER(fptr)) +#define NEED_WRITECONV(fptr) (fptr->encs.enc != NULL || NEED_NEWLINE_ENCODER(fptr) || (fptr->encs.ecflags & (ECONV_DECODER_MASK|ECONV_ENCODER_MASK|ECONV_STATEFUL_ENCODER_MASK))) static void make_writeconv(rb_io_t *fptr) @@ -695,42 +695,50 @@ make_writeconv(rb_io_t *fptr) fptr->writeconv_initialized = 1; - /* ECONV_INVALID_XXX and ECONV_UNDEF_XXX should be set both. - * But ECONV_CRLF_NEWLINE_ENCODER should be set only for the first. */ - fptr->writeconv_pre_ecflags = fptr->encs.ecflags; - fptr->writeconv_pre_ecopts = fptr->encs.ecopts; ecflags = fptr->encs.ecflags; ecopts = fptr->encs.ecopts; - #ifdef TEXTMODE_NEWLINE_ENCODER + if (NEED_NEWLINE_ENCODER(fptr)) + ecflags |= TEXTMODE_NEWLINE_ENCODER; +#endif + if (!fptr->encs.enc) { - if (NEED_NEWLINE_ENCODER(fptr)) - ecflags |= TEXTMODE_NEWLINE_ENCODER; + /* no encoding conversion */ + fptr->writeconv_pre_ecflags = 0; + fptr->writeconv_pre_ecopts = Qnil; fptr->writeconv = rb_econv_open_opts("", "", ecflags, ecopts); if (!fptr->writeconv) rb_exc_raise(rb_econv_open_exc("", "", ecflags)); fptr->writeconv_stateless = Qnil; - return; - } - - if (NEED_NEWLINE_ENCODER(fptr)) - fptr->writeconv_pre_ecflags |= TEXTMODE_NEWLINE_ENCODER; -#endif - ecflags &= ECONV_ERROR_HANDLER_MASK; - - enc = fptr->encs.enc2 ? fptr->encs.enc2 : fptr->encs.enc; - senc = rb_econv_stateless_encoding(enc->name); - if (senc) { - denc = enc->name; - fptr->writeconv_stateless = rb_str_new2(senc); - fptr->writeconv = rb_econv_open_opts(senc, denc, ecflags, ecopts); - if (!fptr->writeconv) - rb_exc_raise(rb_econv_open_exc(senc, denc, ecflags)); } else { - denc = NULL; - fptr->writeconv_stateless = Qnil; - fptr->writeconv = NULL; + enc = fptr->encs.enc2 ? fptr->encs.enc2 : fptr->encs.enc; + senc = rb_econv_stateless_encoding(enc->name); + if (!senc && !(fptr->encs.ecflags & ECONV_STATEFUL_ENCODER_MASK)) { + /* single conversion */ + fptr->writeconv_pre_ecflags = ecflags; + fptr->writeconv_pre_ecopts = ecopts; + fptr->writeconv = NULL; + fptr->writeconv_stateless = Qnil; + } + else { + /* double conversion */ + fptr->writeconv_pre_ecflags = ecflags & ~ECONV_STATEFUL_ENCODER_MASK; + fptr->writeconv_pre_ecopts = ecopts; + if (senc) { + denc = enc->name; + fptr->writeconv_stateless = rb_str_new2(senc); + } + else { + senc = denc = ""; + fptr->writeconv_stateless = rb_str_new2(enc->name); + } + ecflags = fptr->encs.ecflags & (ECONV_ERROR_HANDLER_MASK|ECONV_STATEFUL_ENCODER_MASK); + ecopts = fptr->encs.ecopts; + fptr->writeconv = rb_econv_open_opts(senc, denc, ecflags, ecopts); + if (!fptr->writeconv) + rb_exc_raise(rb_econv_open_exc(senc, denc, ecflags)); + } } } } diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index 5c9fc143b1..ef87fff4cc 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -738,20 +738,37 @@ class TestEncodingConverter < Test::Unit::TestCase assert_equal('', ec.finish) end - def test_xml_escape_attr - ec = Encoding::Converter.new("", "xml-attr-escaped") + def test_xml_escape_attr_content + ec = Encoding::Converter.new("", "xml-attr-content-escaped") + assert_equal('', ec.finish) + + ec = Encoding::Converter.new("", "xml-attr-content-escaped") + assert_equal('', ec.convert("")) + assert_equal('', ec.finish) + + ec = Encoding::Converter.new("", "xml-attr-content-escaped") + assert_equal('"', ec.convert('"')) + assert_equal('', ec.finish) + + ec = Encoding::Converter.new("", "xml-attr-content-escaped") + assert_equal('&<>"', ec.convert("&<>\"")) + assert_equal('', ec.finish) + end + + def test_xml_escape_attr_quote + ec = Encoding::Converter.new("", "xml-attr-quoted") assert_equal('""', ec.finish) - ec = Encoding::Converter.new("", "xml-attr-escaped") + ec = Encoding::Converter.new("", "xml-attr-quoted") assert_equal('', ec.convert("")) assert_equal('""', ec.finish) - ec = Encoding::Converter.new("", "xml-attr-escaped") - assert_equal('""', ec.convert('"')) + ec = Encoding::Converter.new("", "xml-attr-quoted") + assert_equal('""', ec.convert('"')) assert_equal('"', ec.finish) - ec = Encoding::Converter.new("", "xml-attr-escaped") - assert_equal('"&<>"', ec.convert("&<>\"")) + ec = Encoding::Converter.new("", "xml-attr-quoted") + assert_equal('"&<>"', ec.convert("&<>\"")) assert_equal('"', ec.finish) end @@ -760,7 +777,10 @@ class TestEncodingConverter < Test::Unit::TestCase assert_equal('<♥>&"♡"', ec.convert("<\u2665>&\"\u2661\"")) assert_equal('', ec.finish) - ec = Encoding::Converter.new("utf-8", "euc-jp", Encoding::Converter::XML_ATTR_ENCODER|Encoding::Converter::UNDEF_HEX_CHARREF) + ec = Encoding::Converter.new("utf-8", "euc-jp", + Encoding::Converter::XML_ATTR_CONTENT_ENCODER| + Encoding::Converter::XML_ATTR_QUOTE_ENCODER| + Encoding::Converter::UNDEF_HEX_CHARREF) assert_equal('"<♥>&"♡"', ec.convert("<\u2665>&\"\u2661\"")) assert_equal('"', ec.finish) diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index 9d999be59e..57943df25f 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -1461,6 +1461,18 @@ EOT def test_w_xml_attr with_tmpdir { + open("raw.txt", "wb", xml: :attr) {|f| f.print '&<>"\''; f.puts "\u4E02\u3042" } + content = File.read("raw.txt", :mode=>"rb:ascii-8bit") + assert_equal("\"&<>"'\u4E02\u3042\n\"".force_encoding("ascii-8bit"), content) + + open("ascii.txt", "wb:us-ascii", xml: :attr) {|f| f.print '&<>"\''; f.puts "\u4E02\u3042" } + content = File.read("ascii.txt", :mode=>"rb:ascii-8bit") + assert_equal("\"&<>"'丂あ\n\"".force_encoding("ascii-8bit"), content) + + open("iso-2022-jp.txt", "wb:iso-2022-jp", xml: :attr) {|f| f.print '&<>"\''; f.puts "\u4E02\u3042" } + content = File.read("iso-2022-jp.txt", :mode=>"rb:ascii-8bit") + assert_equal("\"&<>"'丂\e$B$\"\e(B\n\"".force_encoding("ascii-8bit"), content) + open("eucjp.txt", "w:euc-jp:utf-8", xml: :attr) {|f| f.print "\u4E02" # U+4E02 is 0x3021 in JIS X 0212 } @@ -1480,6 +1492,5 @@ EOT assert_equal("\"丂\"".force_encoding("ascii-8bit"), content) } end - end diff --git a/transcode.c b/transcode.c index acfe688ca4..1fdd27d7de 100644 --- a/transcode.c +++ b/transcode.c @@ -896,7 +896,7 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) return NULL; if ((ecflags & ECONV_XML_TEXT_ENCODER) && - (ecflags & ECONV_XML_ATTR_ENCODER)) + (ecflags & ECONV_XML_ATTR_CONTENT_ENCODER)) return NULL; num_encoders = 0; @@ -909,8 +909,11 @@ rb_econv_open(const char *sname, const char *dname, int ecflags) if (ecflags & ECONV_XML_TEXT_ENCODER) if (!(encoders[num_encoders++] = get_transcoder_entry("", "xml-text-escaped"))) return NULL; - if (ecflags & ECONV_XML_ATTR_ENCODER) - if (!(encoders[num_encoders++] = get_transcoder_entry("", "xml-attr-escaped"))) + if (ecflags & ECONV_XML_ATTR_CONTENT_ENCODER) + if (!(encoders[num_encoders++] = get_transcoder_entry("", "xml-attr-content-escaped"))) + return NULL; + if (ecflags & ECONV_XML_ATTR_QUOTE_ENCODER) + if (!(encoders[num_encoders++] = get_transcoder_entry("", "xml-attr-quoted"))) return NULL; num_decoders = 0; @@ -1792,7 +1795,8 @@ econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) ECONV_CRLF_NEWLINE_ENCODER| ECONV_CR_NEWLINE_ENCODER| ECONV_XML_TEXT_ENCODER| - ECONV_XML_ATTR_ENCODER)) { + ECONV_XML_ATTR_CONTENT_ENCODER| + ECONV_XML_ATTR_QUOTE_ENCODER)) { const char *pre = ""; if (has_description) rb_str_cat2(mesg, " with "); @@ -1812,9 +1816,13 @@ econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg) rb_str_cat2(mesg, pre); pre = ","; rb_str_cat2(mesg, "XML-text"); } - if (ecflags & ECONV_XML_ATTR_ENCODER) { + if (ecflags & ECONV_XML_ATTR_CONTENT_ENCODER) { rb_str_cat2(mesg, pre); pre = ","; - rb_str_cat2(mesg, "XML-attr"); + rb_str_cat2(mesg, "XML-attr-content"); + } + if (ecflags & ECONV_XML_ATTR_QUOTE_ENCODER) { + rb_str_cat2(mesg, pre); pre = ","; + rb_str_cat2(mesg, "XML-attr-quote"); } has_description = 1; } @@ -2173,7 +2181,7 @@ econv_opts(VALUE opt) ecflags |= ECONV_XML_TEXT_ENCODER|ECONV_UNDEF_HEX_CHARREF; } else if (v==sym_attr) { - ecflags |= ECONV_XML_ATTR_ENCODER|ECONV_UNDEF_HEX_CHARREF; + ecflags |= ECONV_XML_ATTR_CONTENT_ENCODER|ECONV_XML_ATTR_QUOTE_ENCODER|ECONV_UNDEF_HEX_CHARREF; } else { rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v))); @@ -2329,7 +2337,8 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) ECONV_CRLF_NEWLINE_ENCODER| ECONV_CR_NEWLINE_ENCODER| ECONV_XML_TEXT_ENCODER| - ECONV_XML_ATTR_ENCODER)) == 0) { + ECONV_XML_ATTR_CONTENT_ENCODER| + ECONV_XML_ATTR_QUOTE_ENCODER)) == 0) { if (senc && senc == denc) { return -1; } @@ -3573,7 +3582,8 @@ Init_transcode(void) rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_ENCODER", INT2FIX(ECONV_CRLF_NEWLINE_ENCODER)); rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_ENCODER", INT2FIX(ECONV_CR_NEWLINE_ENCODER)); rb_define_const(rb_cEncodingConverter, "XML_TEXT_ENCODER", INT2FIX(ECONV_XML_TEXT_ENCODER)); - rb_define_const(rb_cEncodingConverter, "XML_ATTR_ENCODER", INT2FIX(ECONV_XML_ATTR_ENCODER)); + rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_ENCODER", INT2FIX(ECONV_XML_ATTR_CONTENT_ENCODER)); + rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_ENCODER", INT2FIX(ECONV_XML_ATTR_QUOTE_ENCODER)); rb_define_method(rb_eConversionUndefined, "source_encoding_name", ecerr_source_encoding_name, 0); rb_define_method(rb_eConversionUndefined, "destination_encoding_name", ecerr_destination_encoding_name, 0);