1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Implement String#undump to unescape String#dump-ed string

[Feature #12275] [close GH-1765]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61228 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
tadd 2017-12-14 08:47:13 +00:00
parent 4abc1a24af
commit bbec11d329
3 changed files with 320 additions and 17 deletions

1
NEWS
View file

@ -173,6 +173,7 @@ with all sufficient information, see the ChangeLog file or Redmine
* String#delete_suffix, String#delete_suffix! [Feature #13665] * String#delete_suffix, String#delete_suffix! [Feature #13665]
* String#each_grapheme_cluster and String#grapheme_clusters to * String#each_grapheme_cluster and String#grapheme_clusters to
enumerate grapheme clusters [Feature #13780] enumerate grapheme clusters [Feature #13780]
* String#undump to unescape String#dump'ed string [Feature #12275]
* Struct * Struct

301
string.c
View file

@ -19,6 +19,7 @@
#include "ruby_assert.h" #include "ruby_assert.h"
#include "id.h" #include "id.h"
#include "debug_counter.h" #include "debug_counter.h"
#include "ruby/util.h"
#define BEG(no) (regs->beg[(no)]) #define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)]) #define END(no) (regs->end[(no)])
@ -3422,13 +3423,34 @@ str_casecmp_p(VALUE str1, VALUE str2)
return rb_str_eql(folded_str1, folded_str2); return rb_str_eql(folded_str1, folded_str2);
} }
/*
 * Searches the bytes starting at str_ptr for sub_ptr (sub_len bytes) and
 * accepts only a match that begins on a character boundary in enc.
 *
 * The scan starts at str_ptr itself.  offset is not applied to str_ptr; it
 * is subtracted from str_len to obtain the number of bytes left to scan
 * and added to the position that is returned.  Lengths here are byte
 * counts, since they are adjusted by pointer differences.
 *
 * Returns the match position plus offset, or a negative value when there
 * is no match on a character boundary.
 */
static long
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
            const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
{
    const char *cursor = str_ptr;
    long remaining = str_len - offset;
    long hit;

    while ((hit = rb_memsearch(sub_ptr, sub_len, cursor, remaining, enc)) >= 0) {
        const char *head = rb_enc_right_char_head(cursor, cursor + hit, str_ptr_end, enc);
        long skipped;

        if (head == cursor + hit) {
            /* the match starts exactly at a character head */
            return hit + offset;
        }

        /* the match began inside a character: resume at the next head */
        skipped = head - cursor;
        remaining -= skipped;
        if (remaining <= 0) return -1;
        offset += skipped;
        cursor = head;
    }
    return hit;
}
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
static long static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte) rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{ {
const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start; const char *str_ptr, *str_ptr_end, *sub_ptr;
long pos, str_len, sub_len, search_len; long str_len, sub_len;
int single_byte = single_byte_optimizable(str); int single_byte = single_byte_optimizable(str);
rb_encoding *enc; rb_encoding *enc;
@ -3458,21 +3480,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
if (sub_len == 0) return offset; if (sub_len == 0) return offset;
/* need proceed one character at a time */ /* need proceed one character at a time */
return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
search_start = str_ptr;
search_len = RSTRING_LEN(str) - offset;
for (;;) {
const char *t;
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
if (pos < 0) return pos;
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
if (t == search_start + pos) break;
search_len -= t - search_start;
if (search_len <= 0) return -1;
offset += t - search_start;
search_start = t;
}
return pos + offset;
} }
@ -6073,6 +6081,264 @@ rb_str_dump(VALUE str)
return result; return result;
} }
/* Shapes of source text recognized by check_undump_source_format(). */
enum undump_source_format {
    /* "..." */
    UNDUMP_SOURCE_SIMPLE,
    /* "...".force_encoding("...") */
    UNDUMP_SOURCE_FORCE_ENCODING,
    /* anything else */
    UNDUMP_SOURCE_INVALID
};
/*
 * Classifies the text in [s, s_end), which is len bytes long, as one of
 * the undump source formats.
 *
 * Text that begins and ends with '"' is UNDUMP_SOURCE_SIMPLE.  Text that
 * begins with '"', ends with ')' and has the shape
 * "...".force_encoding("NAME") is UNDUMP_SOURCE_FORCE_ENCODING; only in
 * that case *forced_enc_str receives a new String holding NAME and
 * *forced_enc_str_len its byte length.  Everything else is
 * UNDUMP_SOURCE_INVALID and leaves both out parameters untouched.
 */
static enum undump_source_format
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
                           VALUE *forced_enc_str, long *forced_enc_str_len)
{
    static const char middle[] = "\".force_encoding(\"";
    static const char tail[] = "\")";
    static const long middle_len = rb_strlen_lit("\".force_encoding(\"");
    static const long tail_len = rb_strlen_lit("\")");
    static const long shortest_forced_len = rb_strlen_lit("\"\".force_encoding(\"\")");
    unsigned int first, last;
    const char *last_char;
    long middle_at, name_at, tail_at;

    if (len < 2) return UNDUMP_SOURCE_INVALID;

    first = rb_enc_mbc_to_codepoint(s, s_end, enc);
    if (first != '"') return UNDUMP_SOURCE_INVALID;

    last_char = rb_enc_prev_char(s, s_end, s_end, enc);
    last = rb_enc_mbc_to_codepoint(last_char, s_end, enc);
    if (last == '"') return UNDUMP_SOURCE_SIMPLE;
    if (last != ')') return UNDUMP_SOURCE_INVALID;
    if (len < shortest_forced_len) return UNDUMP_SOURCE_INVALID;

    /* find '".force_encoding("'; position 0 would mean no opening quote */
    middle_at = strseq_core(s, s_end, len, middle, middle_len, 0, enc);
    if (middle_at <= 0) return UNDUMP_SOURCE_INVALID;
    name_at = middle_at + middle_len;

    /* find '")', which has to be the very last thing in the text */
    tail_at = strseq_core(s + name_at, s_end, len - name_at, tail, tail_len, 0, enc);
    if (tail_at < 0) return UNDUMP_SOURCE_INVALID;
    if (name_at + tail_at + tail_len != len) return UNDUMP_SOURCE_INVALID;

    *forced_enc_str_len = tail_at;
    *forced_enc_str = rb_str_new(s + name_at, tail_at);
    return UNDUMP_SOURCE_FORCE_ENCODING;
}
/*
 * Maps the letter of a one-letter backslash escape (n, r, t, f, v, b, a, e)
 * to the control character it stands for.  Any other letter is a caller
 * error and reaches UNREACHABLE.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'a': return '\a';   /* BEL */
      case 'b': return '\b';   /* BS  */
      case 't': return '\t';   /* HT  */
      case 'n': return '\n';   /* LF  */
      case 'v': return '\v';   /* VT  */
      case 'f': return '\f';   /* FF  */
      case 'r': return '\r';   /* CR  */
      case 'e': return 0x1b;   /* ESC */
      default:
        UNREACHABLE;
    }
}
/*
 * Decodes the escape sequence that follows one backslash while undumping.
 *
 * s points at the character right after the backslash and s_end at the
 * end of the escaped payload.  The decoded bytes are appended to undumped.
 * A "\u" escape switches both *penc and the encoding of undumped to UTF-8.
 *
 * Returns how far the caller has to advance from the backslash.  For a
 * recognized escape that is the length of the whole sequence including
 * the backslash.  For an unrecognized one the backslash is appended
 * verbatim and the length of the following character is returned, which
 * for the ASCII-only input str_undump accepts leaves the caller on that
 * character.
 *
 * Raises RuntimeError for a malformed "\u" or "\x" escape, for a code
 * point above 0x10ffff and for a surrogate code point.
 */
static int
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
{
    unsigned int c, c2;
    int n, codelen;
    size_t hexlen;
    char buf[6];
    static rb_encoding *enc_utf8 = NULL;

    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
    switch (c) {
      case '\\':
      case '"':
      case '#':
        rb_str_cat(undumped, s, n); /* cat itself */
        n++; /* account for the backslash */
        break;
      case 'n':
      case 'r':
      case 't':
      case 'f':
      case 'v':
      case 'b':
      case 'a':
      case 'e':
        *buf = (char)unescape_ascii(c);
        rb_str_cat(undumped, buf, n);
        n++; /* account for the backslash */
        break;
      case 'u':
        if (s+1 >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
        }
        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
        if (*penc != enc_utf8) {
            *penc = enc_utf8;
            rb_enc_associate(undumped, enc_utf8);
            ENC_CODERANGE_CLEAR(undumped);
        }
        c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
        if (c2 == '{') { /* handle \u{...} form */
            const char *hexstr = s + 2;
            int hex;
            static const char* const close_brace = "}";
            long pos;

            if (hexstr >= s_end) {
                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
            }
            /* find close brace */
            pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
            if (pos < 0) {
                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
            }
            hex = scan_hex(hexstr, pos, &hexlen);
            /*
             * The hex digits have to run all the way up to the brace.  The
             * length returned below is derived from hexlen, so anything
             * else between the digits and '}' would be consumed wrongly.
             */
            if (hexlen == 0 || hexlen > 6 || (long)hexlen != pos) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            if (hex > 0x10ffff) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
            }
            if ((hex & 0xfffff800) == 0xd800) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
            }
            codelen = rb_enc_codelen(hex, *penc);
            rb_enc_mbcput(hex, buf, *penc);
            rb_str_cat(undumped, buf, codelen);
            n += rb_strlen_lit("u{}") + hexlen;
        }
        else { /* handle \uXXXX form */
            /* never scan past s_end, even when fewer than 4 bytes remain */
            long avail = s_end - (s + 1);
            int hex = scan_hex(s+1, avail < 4 ? avail : 4, &hexlen);

            if (hexlen != 4) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            /* surrogates are not characters; encoding one gives broken UTF-8 */
            if ((hex & 0xfffff800) == 0xd800) {
                rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
            }
            codelen = rb_enc_codelen(hex, *penc);
            rb_enc_mbcput(hex, buf, *penc);
            rb_str_cat(undumped, buf, codelen);
            n += rb_strlen_lit("uXXXX");
        }
        break;
      case 'x':
        if (s+1 >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        {
            /* never scan past s_end, even when only 1 byte remains */
            long avail = s_end - (s + 1);

            c2 = scan_hex(s+1, avail < 2 ? avail : 2, &hexlen);
        }
        if (hexlen != 2) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        *buf = (char)c2;
        rb_str_cat(undumped, buf, 1L);
        n += rb_strlen_lit("xXX");
        break;
      default:
        rb_str_cat(undumped, "\\", 1L); /* keep backslash */
    }

    return n;
}
static VALUE rb_str_is_ascii_only_p(VALUE str);
/*
 *  call-seq:
 *     str.undump   -> new_str
 *
 *  Returns a new string in which the escapes of +str+ are decoded.
 *  This is the inverse of String#dump.
 *
 *     "\"hello \\n ''\"".undump #=> "hello \n ''"
 */
static VALUE
str_undump(VALUE str)
{
    const char *p = RSTRING_PTR(str);
    const char *pend = RSTRING_END(str);
    long total_len = RSTRING_LEN(str);
    rb_encoding *enc = rb_enc_get(str);
    rb_encoding *forced_enc;
    enum undump_source_format format;
    VALUE result = rb_enc_str_new(p, 0L, enc);
    VALUE enc_name;
    long enc_name_len;
    int w;

    /* only ASCII-compatible, pure-ASCII, NUL-free input can be a dump */
    rb_must_asciicompat(str);
    if (rb_str_is_ascii_only_p(str) == Qfalse) {
        rb_raise(rb_eRuntimeError, "non-ASCII character detected");
    }
    if (!str_null_check(str, &w)) {
        rb_raise(rb_eRuntimeError, "string contains null byte");
    }

    format = check_undump_source_format(p, pend, total_len, enc,
                                        &enc_name, &enc_name_len);
    if (format == UNDUMP_SOURCE_INVALID) {
        rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
    }

    /* strip '"' at the start */
    p++;
    if (format == UNDUMP_SOURCE_FORCE_ENCODING) {
        forced_enc = rb_find_encoding(enc_name);
        if (forced_enc == NULL) {
            rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, enc_name);
        }
        /* strip '".force_encoding("...")' */
        pend -= rb_strlen_lit("\".force_encoding(\"\")") + enc_name_len;
    }
    else { /* format == UNDUMP_SOURCE_SIMPLE */
        /* strip '"' at the end */
        pend--;
    }

    while (p < pend) {
        int step;
        unsigned int c = rb_enc_codepoint_len(p, pend, &step, enc);

        if (c == '"') {
            rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
        }
        if (c == '\\') {
            if (p + 1 >= pend) {
                rb_raise(rb_eRuntimeError, "invalid escape");
            }
            /* may switch enc to UTF-8 when it meets a \u escape */
            step = undump_after_backslash(result, p + 1, pend, &enc);
        }
        else {
            rb_str_cat(result, p, step);
        }
        p += step;
    }

    if (format == UNDUMP_SOURCE_FORCE_ENCODING) {
        /* the explicitly requested encoding wins over anything set above */
        rb_enc_associate(result, forced_enc);
        ENC_CODERANGE_CLEAR(result);
    }
    OBJ_INFECT(result, str);

    return result;
}
static void static void
rb_str_check_dummy_enc(rb_encoding *enc) rb_str_check_dummy_enc(rb_encoding *enc)
@ -10586,6 +10852,7 @@ Init_String(void)
rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
rb_define_method(rb_cString, "dump", rb_str_dump, 0); rb_define_method(rb_cString, "dump", rb_str_dump, 0);
rb_define_method(rb_cString, "undump", str_undump, 0);
sym_ascii = ID2SYM(rb_intern("ascii")); sym_ascii = ID2SYM(rb_intern("ascii"));
sym_turkic = ID2SYM(rb_intern("turkic")); sym_turkic = ID2SYM(rb_intern("turkic"));

View file

@ -753,6 +753,41 @@ CODE
assert_equal(S('"\\u{10ABCD}"'), b.dump) assert_equal(S('"\\u{10ABCD}"'), b.dump)
end end
# Exercises String#undump: round trips of the escape forms first, then the
# inputs that must be rejected with RuntimeError.
#
# Most inputs are single-quoted literals on purpose, so sequences such as
# \u00E4 or \xC3 reach undump as backslash escapes and are not decoded by
# the Ruby parser.
def test_undump
# control characters 1, 2, 3, TAB (9), CR (13) and LF (10) appended to "Test"
a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
# \xXX, \uXXXX and \u{...} escapes
assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)
# the same three characters written as \u escapes and as \x byte escapes
assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)
# a \u escape makes the result UTF-8 even when the source is EUC-JP
assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)
# the "...".force_encoding("...") form
assert_equal("abc".encode(Encoding::UTF_16LE),
'"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)
# escaped backslash followed by '#', and by an escaped '#'
assert_equal('\#', '"\\\\#"'.undump)
assert_equal('\#{', '"\\\\\#{"'.undump)
# not wrapped in double quotes
assert_raise(RuntimeError) { S('\u3042').undump }
# malformed force_encoding form and unknown encoding name
assert_raise(RuntimeError) { S('"".force_encoding()').undump }
assert_raise(RuntimeError) { S('"".force_encoding("UNKNOWN")').undump }
# %() processes escapes like a double-quoted literal, so this holds a real non-ASCII character
assert_raise(RuntimeError) { S(%("\u00E4")).undump }
# unescaped double quote inside the payload
assert_raise(RuntimeError) { S('""""').undump }
# truncated or unterminated \u escapes
assert_raise(RuntimeError) { S('"\u"').undump }
assert_raise(RuntimeError) { S('"\u{"').undump }
assert_raise(RuntimeError) { S('"\u{3042"').undump }
# truncated \x escape
assert_raise(RuntimeError) { S('"\x"').undump }
# lone backslash right before the closing quote
assert_raise(RuntimeError) { S('"\\"').undump }
# %() turns \0 into a real NUL byte
assert_raise(RuntimeError) { S(%("\0")).undump }
end
def test_dup def test_dup
for taint in [ false, true ] for taint in [ false, true ]
for frozen in [ false, true ] for frozen in [ false, true ]