From bbec11d329f5a72fe6151ec9fb0e25ff255f2eed Mon Sep 17 00:00:00 2001
From: tadd
Date: Thu, 14 Dec 2017 08:47:13 +0000
Subject: [PATCH] Implement String#undump to unescape String#dump-ed string

[Feature #12275] [close GH-1765]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61228 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 NEWS                     |   1 +
 string.c                 | 325 ++++++++++++++++++++++++++++++++++++---
 test/ruby/test_string.rb |  37 +++++
 3 files changed, 346 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index b458b549a3..fb47ed1516 100644
--- a/NEWS
+++ b/NEWS
@@ -173,6 +173,7 @@ with all sufficient information, see the ChangeLog file or Redmine
   * String#delete_suffix, String#delete_suffix! [Feature #13665]
   * String#each_grapheme_cluster and String#grapheme_clusters to enumerate
     grapheme clusters [Feature #13780]
+  * String#undump to unescape String#dump'ed string [Feature #12275]
 
 * Struct
 
diff --git a/string.c b/string.c
index 56b6f641c6..6a8e24a1bf 100644
--- a/string.c
+++ b/string.c
@@ -19,6 +19,7 @@
 #include "ruby_assert.h"
 #include "id.h"
 #include "debug_counter.h"
+#include "ruby/util.h"
 
 #define BEG(no) (regs->beg[(no)])
 #define END(no) (regs->end[(no)])
@@ -3422,13 +3423,40 @@ str_casecmp_p(VALUE str1, VALUE str2)
     return rb_str_eql(folded_str1, folded_str2);
 }
 
+/*
+ * Searches the region starting at str_ptr (str_len - offset bytes are
+ * examined) for the sub_len bytes at sub_ptr, retrying whenever a match
+ * does not begin on a character head of enc.  Returns the match position
+ * plus offset, or a negative value when there is no match.
+ */
+static long
+strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
+	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
+{
+    const char *search_start = str_ptr;
+    long pos, search_len = str_len - offset;
+
+    for (;;) {
+	const char *t;
+	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
+	if (pos < 0) return pos;
+	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
+	if (t == search_start + pos) break;
+	search_len -= t - search_start;
+	if (search_len <= 0) return -1;
+	offset += t - search_start;
+	search_start = t;
+    }
+    return pos + offset;
+}
+
 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
 
 static long
 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
 {
-    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
-    long pos, str_len, sub_len, search_len;
+    const char *str_ptr, *str_ptr_end, *sub_ptr;
+    long str_len, sub_len;
     int single_byte = single_byte_optimizable(str);
     rb_encoding *enc;
 
@@ -3458,21 +3486,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
 
     if (sub_len == 0) return offset;
     /* need proceed one character at a time */
-
-    search_start = str_ptr;
-    search_len = RSTRING_LEN(str) - offset;
-    for (;;) {
-	const char *t;
-	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
-	if (pos < 0) return pos;
-	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
-	if (t == search_start + pos) break;
-	search_len -= t - search_start;
-	if (search_len <= 0) return -1;
-	offset += t - search_start;
-	search_start = t;
-    }
-    return pos + offset;
+    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
 }
 
 
@@ -6073,6 +6087,282 @@ rb_str_dump(VALUE str)
     return result;
 }
 
+/* Source forms accepted by String#undump. */
+enum undump_source_format {
+    UNDUMP_SOURCE_SIMPLE, /* "..." */
+    UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
+    UNDUMP_SOURCE_INVALID
+};
+
+/*
+ * Classifies the len bytes in [s, s_end) as one of the undump source forms.
+ * For the force_encoding form, the encoding name and its byte length are
+ * stored through forced_enc_str and forced_enc_str_len.
+ */
+static enum undump_source_format
+check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
+			   VALUE *forced_enc_str, long *forced_enc_str_len)
+{
+    unsigned int cbeg, cend;
+    const char *prev;
+    static const long force_encoding_minimum_len = rb_strlen_lit("\"\".force_encoding(\"\")");
+    static const char force_encoding_middle_part[] = "\".force_encoding(\"";
+    static const long force_encoding_middle_part_len = rb_strlen_lit("\".force_encoding(\"");
+    static const char force_encoding_end_part[] = "\")";
+    static const long force_encoding_end_part_len = rb_strlen_lit("\")");
+    long pos_before_middle_part, pos_before_end_part, pos_after_middle_part;
+
+    if (len < 2) return UNDUMP_SOURCE_INVALID;
+
+    cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
+    if (cbeg != '"') return UNDUMP_SOURCE_INVALID;
+
+    prev = rb_enc_prev_char(s, s_end, s_end, enc);
+    cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
+    if (cend == '"') return UNDUMP_SOURCE_SIMPLE;
+
+    if (cend != ')' || len < force_encoding_minimum_len) {
+	return UNDUMP_SOURCE_INVALID;
+    }
+
+    /* find '".force_encoding("' */
+    pos_before_middle_part = strseq_core(s, s_end, len,
+					 force_encoding_middle_part, force_encoding_middle_part_len,
+					 0, enc);
+    if (pos_before_middle_part <= 0) {
+	return UNDUMP_SOURCE_INVALID;
+    }
+
+    pos_after_middle_part = pos_before_middle_part + force_encoding_middle_part_len;
+    /* find '")' */
+    pos_before_end_part = strseq_core(s + pos_after_middle_part, s_end, len - pos_after_middle_part,
+				      force_encoding_end_part, force_encoding_end_part_len,
+				      0, enc);
+    if (pos_before_end_part < 0 || pos_after_middle_part + pos_before_end_part + 2 != len) {
+	return UNDUMP_SOURCE_INVALID;
+    }
+
+    *forced_enc_str_len = pos_before_end_part;
+    *forced_enc_str = rb_str_new(s + pos_after_middle_part, *forced_enc_str_len);
+    return UNDUMP_SOURCE_FORCE_ENCODING;
+}
+
+/* Maps the letter of a single-character escape (n r t f v b a e) to its character. */
+static int
+unescape_ascii(unsigned int c)
+{
+    switch (c) {
+      case 'n':
+	return '\n';
+      case 'r':
+	return '\r';
+      case 't':
+	return '\t';
+      case 'f':
+	return '\f';
+      case 'v':
+	return '\13';
+      case 'b':
+	return '\010';
+      case 'a':
+	return '\007';
+      case 'e':
+	return 033;
+      default:
+	UNREACHABLE;
+    }
+}
+
+/*
+ * Decodes the escape whose first character after the backslash is at s,
+ * appends the result to undumped, and returns how many bytes the caller
+ * should advance from the backslash.  A \u escape switches *penc and the
+ * encoding of undumped to UTF-8.  Malformed escapes raise RuntimeError.
+ */
+static int
+undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
+{
+    unsigned int c, c2;
+    int n, codelen;
+    size_t hexlen;
+    char buf[6];
+    static rb_encoding *enc_utf8 = NULL;
+
+    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
+    switch (c) {
+      case '\\':
+      case '"':
+      case '#':
+	rb_str_cat(undumped, s, n); /* cat itself */
+	n++;
+	break;
+      case 'n':
+      case 'r':
+      case 't':
+      case 'f':
+      case 'v':
+      case 'b':
+      case 'a':
+      case 'e':
+	*buf = (char)unescape_ascii(c);
+	rb_str_cat(undumped, buf, n);
+	n++;
+	break;
+      case 'u':
+	if (s+1 >= s_end) {
+	    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	}
+	if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
+	if (*penc != enc_utf8) {
+	    *penc = enc_utf8;
+	    rb_enc_associate(undumped, enc_utf8);
+	    ENC_CODERANGE_CLEAR(undumped);
+	}
+	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
+	if (c2 == '{') { /* handle \u{...} form */
+	    const char *hexstr = s + 2;
+	    int hex;
+	    static const char* const close_brace = "}";
+	    long pos;
+
+	    if (hexstr >= s_end) {
+		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+	    }
+	    /* find close brace */
+	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
+	    if (pos < 0) {
+		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+	    }
+	    hex = scan_hex(hexstr, pos, &hexlen);
+	    /* every character up to the close brace must be a hex digit */
+	    if (hexlen == 0 || hexlen > 6 || (long)hexlen != pos) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	    }
+	    if (hex > 0x10ffff) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
+	    }
+	    if ((hex & 0xfffff800) == 0xd800) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
+	    }
+	    codelen = rb_enc_codelen(hex, *penc);
+	    rb_enc_mbcput(hex, buf, *penc);
+	    rb_str_cat(undumped, buf, codelen);
+	    n += rb_strlen_lit("u{}") + hexlen;
+	}
+	else { /* handle \uXXXX form */
+	    int hex = scan_hex(s+1, 4, &hexlen);
+	    if (hexlen != 4) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	    }
+	    /* reject surrogates here too, as the \u{...} form does */
+	    if ((hex & 0xfffff800) == 0xd800) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
+	    }
+	    codelen = rb_enc_codelen(hex, *penc);
+	    rb_enc_mbcput(hex, buf, *penc);
+	    rb_str_cat(undumped, buf, codelen);
+	    n += rb_strlen_lit("uXXXX");
+	}
+	break;
+      case 'x':
+	if (s+1 >= s_end) {
+	    rb_raise(rb_eRuntimeError, "invalid hex escape");
+	}
+	c2 = scan_hex(s+1, 2, &hexlen);
+	if (hexlen != 2) {
+	    rb_raise(rb_eRuntimeError, "invalid hex escape");
+	}
+	*buf = (char)c2;
+	rb_str_cat(undumped, buf, 1L);
+	n += rb_strlen_lit("xXX");
+	break;
+      default:
+	rb_str_cat(undumped, "\\", 1L); /* keep backslash */
+    }
+
+    return n;
+}
+
+static VALUE rb_str_is_ascii_only_p(VALUE str);
+
+/*
+ *  call-seq:
+ *     str.undump   -> new_str
+ *
+ *  Returns an unescaped version of +str+.
+ *  This is the inverse of String#dump; see also String#dump.
+ *
+ *    "\"hello \\n ''\"".undump #=> "hello \n ''"
+ */
+
+static VALUE
+str_undump(VALUE str)
+{
+    const char *s = RSTRING_PTR(str);
+    const char *s_end = RSTRING_END(str);
+    long len = RSTRING_LEN(str);
+    rb_encoding *enc = rb_enc_get(str), *forced_enc = NULL;
+    int n;
+    unsigned int c;
+    enum undump_source_format source_format;
+    VALUE undumped = rb_enc_str_new(s, 0L, enc);
+    VALUE forced_enc_str;
+    long forced_enc_str_len;
+    int w;
+
+    rb_must_asciicompat(str);
+    if (rb_str_is_ascii_only_p(str) == Qfalse) {
+	rb_raise(rb_eRuntimeError, "non-ASCII character detected");
+    }
+    if (!str_null_check(str, &w)) {
+	rb_raise(rb_eRuntimeError, "string contains null byte");
+    }
+
+    source_format = check_undump_source_format(s, s_end, len, enc,
+					       &forced_enc_str, &forced_enc_str_len);
+    if (source_format == UNDUMP_SOURCE_INVALID) {
+	rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
+    }
+    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
+	forced_enc = rb_find_encoding(forced_enc_str);
+	if (forced_enc == NULL) {
+	    rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_str);
+	}
+    }
+
+    /* strip '"' at the start */
+    s++;
+    if (source_format == UNDUMP_SOURCE_SIMPLE) {
+	/* strip '"' at the end */
+	s_end--;
+    } else { /* source_format == UNDUMP_SOURCE_FORCE_ENCODING */
+	/* strip '".force_encoding("...")' */
+	s_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
+    }
+
+    for (; s < s_end; s += n) {
+	c = rb_enc_codepoint_len(s, s_end, &n, enc);
+	if (c == '\\') {
+	    if (s+1 >= s_end) {
+		rb_raise(rb_eRuntimeError, "invalid escape");
+	    }
+	    n = undump_after_backslash(undumped, s+1, s_end, &enc);
+	}
+	else if (c == '"') {
+	    rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
+	}
+	else {
+	    rb_str_cat(undumped, s, n);
+	}
+    }
+
+    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
+	rb_enc_associate(undumped, forced_enc);
+	ENC_CODERANGE_CLEAR(undumped);
+    }
+    OBJ_INFECT(undumped, str);
+    return undumped;
+}
 
 static void
 rb_str_check_dummy_enc(rb_encoding *enc)
@@ -10586,6 +10876,7 @@ Init_String(void)
     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
+    rb_define_method(rb_cString, "undump", str_undump, 0);
 
     sym_ascii = ID2SYM(rb_intern("ascii"));
     sym_turkic = ID2SYM(rb_intern("turkic"));
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index 81e03aab30..52a6ad4056 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -753,6 +753,43 @@ CODE
     assert_equal(S('"\\u{10ABCD}"'), b.dump)
   end
 
+  def test_undump
+    a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
+    assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
+    assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
+    assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
+    assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
+    assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
+    assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
+    assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)
+
+    assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
+    assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)
+
+    assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)
+
+    assert_equal("abc".encode(Encoding::UTF_16LE),
+                 '"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)
+
+    assert_equal('\#', '"\\\\#"'.undump)
+    assert_equal('\#{', '"\\\\\#{"'.undump)
+
+    assert_raise(RuntimeError) { S('\u3042').undump }
+    assert_raise(RuntimeError) { S('"".force_encoding()').undump }
+    assert_raise(RuntimeError) { S('"".force_encoding("UNKNOWN")').undump }
+    assert_raise(RuntimeError) { S(%("\u00E4")).undump }
+    assert_raise(RuntimeError) { S('""""').undump }
+
+    assert_raise(RuntimeError) { S('"\u"').undump }
+    assert_raise(RuntimeError) { S('"\u{"').undump }
+    assert_raise(RuntimeError) { S('"\u{3042"').undump }
+    assert_raise(RuntimeError) { S('"\uD800"').undump }
+    assert_raise(RuntimeError) { S('"\u{30 42}"').undump }
+    assert_raise(RuntimeError) { S('"\x"').undump }
+    assert_raise(RuntimeError) { S('"\\"').undump }
+    assert_raise(RuntimeError) { S(%("\0")).undump }
+  end
+
   def test_dup
     for taint in [ false, true ]
       for frozen in [ false, true ]