From bbec11d329f5a72fe6151ec9fb0e25ff255f2eed Mon Sep 17 00:00:00 2001
From: tadd <tadd@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>
Date: Thu, 14 Dec 2017 08:47:13 +0000
Subject: [PATCH] Implement String#undump to unescape String#dump-ed string
 [Feature #12275] [close GH-1765]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61228 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 NEWS                     |   1 +
 string.c                 | 301 ++++++++++++++++++++++++++++++++++++---
 test/ruby/test_string.rb |  35 +++++
 3 files changed, 320 insertions(+), 17 deletions(-)

diff --git a/NEWS b/NEWS
index b458b549a3..fb47ed1516 100644
--- a/NEWS
+++ b/NEWS
@@ -173,6 +173,7 @@ with all sufficient information, see the ChangeLog file or Redmine
     * String#delete_suffix, String#delete_suffix! [Feature #13665]
     * String#each_grapheme_cluster and String#grapheme_clusters to
       enumerate grapheme clusters [Feature #13780]
+    * String#undump to unescape String#dump'ed string [Feature #12275]
 
 * Struct
 
diff --git a/string.c b/string.c
index 56b6f641c6..6a8e24a1bf 100644
--- a/string.c
+++ b/string.c
@@ -19,6 +19,7 @@
 #include "ruby_assert.h"
 #include "id.h"
 #include "debug_counter.h"
+#include "ruby/util.h"
 
 #define BEG(no) (regs->beg[(no)])
 #define END(no) (regs->end[(no)])
@@ -3422,13 +3423,34 @@ str_casecmp_p(VALUE str1, VALUE str2)
     return rb_str_eql(folded_str1, folded_str2);
 }
 
+/* Looks for the sub_len bytes at sub_ptr within the str_len - offset bytes
+ * that start at str_ptr, skipping matches that rb_enc_right_char_head does
+ * not place on a character head.  Returns match position + offset, or < 0. */
+static long
+strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
+	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
+{
+    const char *from = str_ptr, *head;
+    long hit, remain = str_len - offset;
+
+    while ((hit = rb_memsearch(sub_ptr, sub_len, from, remain, enc)) >= 0) {
+	head = rb_enc_right_char_head(from, from + hit, str_ptr_end, enc);
+	if (head == from + hit) return hit + offset;
+	remain -= head - from;
+	if (remain <= 0) return -1;
+	offset += head - from;
+	from = head;
+    }
+    return hit;
+}
+
 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
 
 static long
 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
 {
-    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
-    long pos, str_len, sub_len, search_len;
+    const char *str_ptr, *str_ptr_end, *sub_ptr;
+    long str_len, sub_len;
     int single_byte = single_byte_optimizable(str);
     rb_encoding *enc;
 
@@ -3458,21 +3480,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
     if (sub_len == 0) return offset;
 
     /* need proceed one character at a time */
-
-    search_start = str_ptr;
-    search_len = RSTRING_LEN(str) - offset;
-    for (;;) {
-	const char *t;
-	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
-	if (pos < 0) return pos;
-	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
-	if (t == search_start + pos) break;
-	search_len -= t - search_start;
-	if (search_len <= 0) return -1;
-	offset += t - search_start;
-	search_start = t;
-    }
-    return pos + offset;
+    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
 }
 
 
@@ -6073,6 +6081,264 @@ rb_str_dump(VALUE str)
     return result;
 }
 
+enum undump_source_format { /* result of check_undump_source_format() */
+    UNDUMP_SOURCE_SIMPLE, /* "..." */
+    UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
+    UNDUMP_SOURCE_INVALID /* neither of the two forms above */
+};
+
+/*
+ * Classifies the +len+ bytes of dumped text in [s, s_end).
+ *
+ * Returns UNDUMP_SOURCE_SIMPLE for "..." and UNDUMP_SOURCE_FORCE_ENCODING
+ * for "...".force_encoding("NAME"); in the latter case NAME and its byte
+ * length are stored through +forced_enc_str+ and +forced_enc_str_len+.
+ * Any other text is UNDUMP_SOURCE_INVALID and leaves both untouched.
+ */
+static enum undump_source_format
+check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
+			   VALUE *forced_enc_str, long *forced_enc_str_len)
+{
+    static const char middle[] = "\".force_encoding(\"";
+    static const char tail[] = "\")";
+    const long middle_len = (long)sizeof(middle) - 1;
+    const long tail_len = (long)sizeof(tail) - 1;
+    const long shortest_forced = rb_strlen_lit("\"\".force_encoding(\"\")");
+    const char *last_char;
+    unsigned int first, last;
+    long middle_at, name_at, name_len;
+
+    if (len < 2) return UNDUMP_SOURCE_INVALID;
+
+    first = rb_enc_mbc_to_codepoint(s, s_end, enc);
+    if (first != '"') return UNDUMP_SOURCE_INVALID;
+
+    last_char = rb_enc_prev_char(s, s_end, s_end, enc);
+    last = rb_enc_mbc_to_codepoint(last_char, s_end, enc);
+    if (last == '"') return UNDUMP_SOURCE_SIMPLE;
+    if (last != ')') return UNDUMP_SOURCE_INVALID;
+    if (len < shortest_forced) return UNDUMP_SOURCE_INVALID;
+
+    /* locate the first '".force_encoding("' */
+    middle_at = strseq_core(s, s_end, len, middle, middle_len, 0, enc);
+    if (middle_at <= 0) return UNDUMP_SOURCE_INVALID;
+
+    /* the first '")' after the name must be the very end of the text */
+    name_at = middle_at + middle_len;
+    name_len = strseq_core(s + name_at, s_end, len - name_at, tail, tail_len, 0, enc);
+    if (name_len < 0 || name_at + name_len + tail_len != len) {
+	return UNDUMP_SOURCE_INVALID;
+    }
+
+    *forced_enc_str_len = name_len;
+    *forced_enc_str = rb_str_new(s + name_at, name_len);
+    return UNDUMP_SOURCE_FORCE_ENCODING;
+}
+
+/*
+ * Translates the letter of a one-letter backslash escape into the byte it
+ * stands for.
+ * Only n, r, t, f, v, b, a and e are handled; any other value lands on
+ * UNREACHABLE, so callers have to filter first.
+ */
+static int
+unescape_ascii(unsigned int c)
+{
+    switch (c) {
+	/* escapes that C spells the same way */
+      case 'n': return '\n';
+      case 'r': return '\r';
+      case 't': return '\t';
+      case 'f': return '\f';
+	/* given numerically: VT, BS, BEL, ESC */
+      case 'v': return 0x0b;
+      case 'b': return 0x08;
+      case 'a': return 0x07;
+      case 'e': return 0x1b;
+      default:
+	UNREACHABLE;
+    }
+}
+
+/*
+ * Decodes the escape that follows a backslash and appends the result to
+ * +undumped+.  +s+ points just past the backslash, +s_end+ at the end of
+ * the dumped body.  A "\u" escape re-tags +undumped+ as UTF-8 and stores
+ * that encoding in <code>*penc</code>.
+ *
+ * Returns the byte count the caller adds to the backslash position.  For
+ * an unknown escape the backslash is kept and the next character is left
+ * to the caller.  Raises RuntimeError on a malformed "\u" or "\x" escape.
+ */
+static int
+undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
+{
+    unsigned int c, c2, hex;
+    int n, codelen;
+    size_t hexlen;
+    char buf[6];
+    static rb_encoding *enc_utf8 = NULL;
+
+    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
+    switch (c) {
+      case '\\': case '"': case '#':
+	rb_str_cat(undumped, s, n); /* cat itself */
+	n++;
+	break;
+      case 'n': case 'r': case 't': case 'f':
+      case 'v': case 'b': case 'a': case 'e':
+	*buf = (char)unescape_ascii(c);
+	rb_str_cat(undumped, buf, n);
+	n++;
+	break;
+      case 'u':
+	if (s+1 >= s_end) {
+	    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	}
+	if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
+	if (*penc != enc_utf8) {
+	    *penc = enc_utf8;
+	    rb_enc_associate(undumped, enc_utf8);
+	    ENC_CODERANGE_CLEAR(undumped);
+	}
+	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
+	if (c2 == '{') { /* handle \u{...} form */
+	    const char *hexstr = s + 2;
+	    static const char* const close_brace = "}";
+	    long pos;
+
+	    if (hexstr >= s_end) {
+		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+	    }
+	    /* find close brace */
+	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
+	    if (pos < 0) {
+		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
+	    }
+	    hex = scan_hex(hexstr, pos, &hexlen);
+	    /* everything between the braces has to be hex digits */
+	    if (hexlen == 0 || hexlen > 6 || hexlen != (size_t)pos) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	    }
+	    if (hex > 0x10ffff) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
+	    }
+	    n += rb_strlen_lit("u{}") + hexlen;
+	}
+	else { /* handle \uXXXX form */
+	    hex = scan_hex(s+1, 4, &hexlen);
+	    if (hexlen != 4) {
+		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
+	    }
+	    n += rb_strlen_lit("uXXXX");
+	}
+	/* surrogates are rejected whichever form spelled them */
+	if ((hex & 0xfffff800) == 0xd800) {
+	    rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
+	}
+	codelen = rb_enc_codelen(hex, *penc);
+	rb_enc_mbcput(hex, buf, *penc);
+	rb_str_cat(undumped, buf, codelen);
+	break;
+      case 'x':
+	if (s+1 >= s_end) {
+	    rb_raise(rb_eRuntimeError, "invalid hex escape");
+	}
+	c2 = scan_hex(s+1, 2, &hexlen);
+	if (hexlen != 2) {
+	    rb_raise(rb_eRuntimeError, "invalid hex escape");
+	}
+	*buf = (char)c2;
+	rb_str_cat(undumped, buf, 1L);
+	n += rb_strlen_lit("xXX");
+	break;
+      default:
+	rb_str_cat(undumped, "\\", 1L); /* keep backslash */
+    }
+
+    return n;
+}
+
+static VALUE rb_str_is_ascii_only_p(VALUE str);
+
+/*
+ *  call-seq:
+ *     str.undump   -> new_str
+ *
+ *  Returns a new string with the escapes written by String#dump decoded
+ *  again, i.e. the inverse of String#dump.  Raises RuntimeError if +str+
+ *  has a non-ASCII character, a null byte or a malformed escape, or is
+ *  not <code>"..."</code> or <code>"...".force_encoding("...")</code>.
+ *
+ *    "\"hello \\n ''\"".undump #=> "hello \n ''"
+ */
+
+static VALUE
+str_undump(VALUE str)
+{
+    const char *cur = RSTRING_PTR(str);
+    const char *body_end = RSTRING_END(str);
+    long len = RSTRING_LEN(str);
+    rb_encoding *enc = rb_enc_get(str), *forced_enc = NULL;
+    VALUE undumped = rb_enc_str_new(cur, 0L, enc);
+    VALUE forced_enc_str;
+    long forced_enc_str_len;
+    enum undump_source_format source_format;
+    int w;
+
+    rb_must_asciicompat(str);
+    if (rb_str_is_ascii_only_p(str) == Qfalse) {
+	rb_raise(rb_eRuntimeError, "non-ASCII character detected");
+    }
+    if (!str_null_check(str, &w)) {
+	rb_raise(rb_eRuntimeError, "string contains null byte");
+    }
+
+    source_format = check_undump_source_format(cur, body_end, len, enc,
+					       &forced_enc_str, &forced_enc_str_len);
+    if (source_format == UNDUMP_SOURCE_INVALID) {
+	rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
+    }
+    else if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
+	forced_enc = rb_find_encoding(forced_enc_str);
+	if (forced_enc == NULL) {
+	    rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_str);
+	}
+	/* the body ends before '".force_encoding("...")' */
+	body_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
+    }
+    else { /* source_format == UNDUMP_SOURCE_SIMPLE */
+	/* the body ends before the closing '"' */
+	body_end--;
+    }
+    cur++; /* the body starts after the opening '"' */
+
+    while (cur < body_end) {
+	int n;
+	unsigned int c = rb_enc_codepoint_len(cur, body_end, &n, enc);
+	if (c == '"') {
+	    rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
+	}
+	if (c != '\\') {
+	    rb_str_cat(undumped, cur, n);
+	}
+	else if (cur+1 >= body_end) {
+	    rb_raise(rb_eRuntimeError, "invalid escape");
+	}
+	else {
+	    /* may switch enc to UTF-8 when it meets a \u escape */
+	    n = undump_after_backslash(undumped, cur+1, body_end, &enc);
+	}
+	cur += n;
+    }
+
+    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
+	rb_enc_associate(undumped, forced_enc);
+	ENC_CODERANGE_CLEAR(undumped);
+    }
+    OBJ_INFECT(undumped, str);
+    return undumped;
+}
 
 static void
 rb_str_check_dummy_enc(rb_encoding *enc)
@@ -10586,6 +10852,7 @@ Init_String(void)
     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
+    rb_define_method(rb_cString, "undump", str_undump, 0);
 
     sym_ascii      = ID2SYM(rb_intern("ascii"));
     sym_turkic     = ID2SYM(rb_intern("turkic"));
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index 81e03aab30..52a6ad4056 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -753,6 +753,41 @@ CODE
     assert_equal(S('"\\u{10ABCD}"'), b.dump)
   end
 
+  def test_undump
+    a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
+    assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
+    assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
+    assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
+    assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
+    assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
+    assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
+    assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)
+    # single-quoted sources: the backslashes reach undump as written
+    assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
+    assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)
+    # a Unicode escape yields UTF-8 even for an EUC-JP receiver
+    assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)
+    # the "...".force_encoding("...") form
+    assert_equal("abc".encode(Encoding::UTF_16LE),
+                 '"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)
+    # an escaped backslash in front of '#' and in front of an escaped '#'
+    assert_equal('\#', '"\\\\#"'.undump)
+    assert_equal('\#{', '"\\\\\#{"'.undump)
+    # bad wrapping, unknown encoding, raw non-ASCII, unescaped quote
+    assert_raise(RuntimeError) { S('\u3042').undump }
+    assert_raise(RuntimeError) { S('"".force_encoding()').undump }
+    assert_raise(RuntimeError) { S('"".force_encoding("UNKNOWN")').undump }
+    assert_raise(RuntimeError) { S(%("\u00E4")).undump }
+    assert_raise(RuntimeError) { S('""""').undump }
+    # truncated escapes, a lone backslash and an embedded null byte
+    assert_raise(RuntimeError) { S('"\u"').undump }
+    assert_raise(RuntimeError) { S('"\u{"').undump }
+    assert_raise(RuntimeError) { S('"\u{3042"').undump }
+    assert_raise(RuntimeError) { S('"\x"').undump }
+    assert_raise(RuntimeError) { S('"\\"').undump }
+    assert_raise(RuntimeError) { S(%("\0")).undump }
+  end
+
   def test_dup
     for taint in [ false, true ]
       for frozen in [ false, true ]