mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Implement String#undump to unescape String#dump-ed string
[Feature #12275] [close GH-1765] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@61228 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
4abc1a24af
commit
bbec11d329
3 changed files with 320 additions and 17 deletions
1
NEWS
1
NEWS
|
@ -173,6 +173,7 @@ with all sufficient information, see the ChangeLog file or Redmine
|
|||
* String#delete_suffix, String#delete_suffix! [Feature #13665]
|
||||
* String#each_grapheme_cluster and String#grapheme_clusters to
|
||||
enumerate grapheme clusters [Feature #13780]
|
||||
* String#undump to unescape String#dump'ed string [Feature #12275]
|
||||
|
||||
* Struct
|
||||
|
||||
|
|
301
string.c
301
string.c
|
@ -19,6 +19,7 @@
|
|||
#include "ruby_assert.h"
|
||||
#include "id.h"
|
||||
#include "debug_counter.h"
|
||||
#include "ruby/util.h"
|
||||
|
||||
#define BEG(no) (regs->beg[(no)])
|
||||
#define END(no) (regs->end[(no)])
|
||||
|
@ -3422,13 +3423,34 @@ str_casecmp_p(VALUE str1, VALUE str2)
|
|||
return rb_str_eql(folded_str1, folded_str2);
|
||||
}
|
||||
|
||||
static long
|
||||
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
|
||||
const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
|
||||
{
|
||||
const char *search_start = str_ptr;
|
||||
long pos, search_len = str_len - offset;
|
||||
|
||||
for (;;) {
|
||||
const char *t;
|
||||
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
|
||||
if (pos < 0) return pos;
|
||||
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
|
||||
if (t == search_start + pos) break;
|
||||
search_len -= t - search_start;
|
||||
if (search_len <= 0) return -1;
|
||||
offset += t - search_start;
|
||||
search_start = t;
|
||||
}
|
||||
return pos + offset;
|
||||
}
|
||||
|
||||
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
|
||||
|
||||
static long
|
||||
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
|
||||
{
|
||||
const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
|
||||
long pos, str_len, sub_len, search_len;
|
||||
const char *str_ptr, *str_ptr_end, *sub_ptr;
|
||||
long str_len, sub_len;
|
||||
int single_byte = single_byte_optimizable(str);
|
||||
rb_encoding *enc;
|
||||
|
||||
|
@ -3458,21 +3480,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
|
|||
if (sub_len == 0) return offset;
|
||||
|
||||
/* need proceed one character at a time */
|
||||
|
||||
search_start = str_ptr;
|
||||
search_len = RSTRING_LEN(str) - offset;
|
||||
for (;;) {
|
||||
const char *t;
|
||||
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
|
||||
if (pos < 0) return pos;
|
||||
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
|
||||
if (t == search_start + pos) break;
|
||||
search_len -= t - search_start;
|
||||
if (search_len <= 0) return -1;
|
||||
offset += t - search_start;
|
||||
search_start = t;
|
||||
}
|
||||
return pos + offset;
|
||||
return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
|
||||
}
|
||||
|
||||
|
||||
|
@ -6073,6 +6081,264 @@ rb_str_dump(VALUE str)
|
|||
return result;
|
||||
}
|
||||
|
||||
enum undump_source_format {
|
||||
UNDUMP_SOURCE_SIMPLE, /* "..." */
|
||||
UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
|
||||
UNDUMP_SOURCE_INVALID
|
||||
};
|
||||
|
||||
static enum undump_source_format
|
||||
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
|
||||
VALUE *forced_enc_str, long *forced_enc_str_len)
|
||||
{
|
||||
unsigned int cbeg, cend;
|
||||
const char *prev;
|
||||
static const long force_encoding_minimum_len = rb_strlen_lit("\"\".force_encoding(\"\")");
|
||||
static const char force_encoding_middle_part[] = "\".force_encoding(\"";
|
||||
static const long force_encoding_middle_part_len = rb_strlen_lit("\".force_encoding(\"");
|
||||
static const char force_encoding_end_part[] = "\")";
|
||||
static const long force_encoding_end_part_len = rb_strlen_lit("\")");
|
||||
long pos_before_middle_part, pos_before_end_part, pos_after_middle_part;
|
||||
|
||||
if (len < 2) return UNDUMP_SOURCE_INVALID;
|
||||
|
||||
cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
|
||||
if (cbeg != '"') return UNDUMP_SOURCE_INVALID;
|
||||
|
||||
prev = rb_enc_prev_char(s, s_end, s_end, enc);
|
||||
cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
|
||||
if (cend == '"') return UNDUMP_SOURCE_SIMPLE;
|
||||
|
||||
if (cend != ')' || len < force_encoding_minimum_len) {
|
||||
return UNDUMP_SOURCE_INVALID;
|
||||
}
|
||||
|
||||
/* find '".force_encoding("' */
|
||||
pos_before_middle_part = strseq_core(s, s_end, len,
|
||||
force_encoding_middle_part, force_encoding_middle_part_len,
|
||||
0, enc);
|
||||
if (pos_before_middle_part <= 0) {
|
||||
return UNDUMP_SOURCE_INVALID;
|
||||
}
|
||||
|
||||
pos_after_middle_part = pos_before_middle_part + force_encoding_middle_part_len;
|
||||
/* find '")' */
|
||||
pos_before_end_part = strseq_core(s + pos_after_middle_part, s_end, len - pos_after_middle_part,
|
||||
force_encoding_end_part, force_encoding_end_part_len,
|
||||
0, enc);
|
||||
if (pos_before_end_part < 0 || pos_after_middle_part + pos_before_end_part + 2 != len) {
|
||||
return UNDUMP_SOURCE_INVALID;
|
||||
}
|
||||
|
||||
*forced_enc_str_len = pos_before_end_part;
|
||||
*forced_enc_str = rb_str_new(s + pos_after_middle_part, *forced_enc_str_len);
|
||||
return UNDUMP_SOURCE_FORCE_ENCODING;
|
||||
}
|
||||
|
||||
static int
|
||||
unescape_ascii(unsigned int c)
|
||||
{
|
||||
switch (c) {
|
||||
case 'n':
|
||||
return '\n';
|
||||
case 'r':
|
||||
return '\r';
|
||||
case 't':
|
||||
return '\t';
|
||||
case 'f':
|
||||
return '\f';
|
||||
case 'v':
|
||||
return '\13';
|
||||
case 'b':
|
||||
return '\010';
|
||||
case 'a':
|
||||
return '\007';
|
||||
case 'e':
|
||||
return 033;
|
||||
default:
|
||||
UNREACHABLE;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
|
||||
{
|
||||
unsigned int c, c2;
|
||||
int n, codelen;
|
||||
size_t hexlen;
|
||||
char buf[6];
|
||||
static rb_encoding *enc_utf8 = NULL;
|
||||
|
||||
c = rb_enc_codepoint_len(s, s_end, &n, *penc);
|
||||
switch (c) {
|
||||
case '\\':
|
||||
case '"':
|
||||
case '#':
|
||||
rb_str_cat(undumped, s, n); /* cat itself */
|
||||
n++;
|
||||
break;
|
||||
case 'n':
|
||||
case 'r':
|
||||
case 't':
|
||||
case 'f':
|
||||
case 'v':
|
||||
case 'b':
|
||||
case 'a':
|
||||
case 'e':
|
||||
*buf = (char)unescape_ascii(c);
|
||||
rb_str_cat(undumped, buf, n);
|
||||
n++;
|
||||
break;
|
||||
case 'u':
|
||||
if (s+1 >= s_end) {
|
||||
rb_raise(rb_eRuntimeError, "invalid Unicode escape");
|
||||
}
|
||||
if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
|
||||
if (*penc != enc_utf8) {
|
||||
*penc = enc_utf8;
|
||||
rb_enc_associate(undumped, enc_utf8);
|
||||
ENC_CODERANGE_CLEAR(undumped);
|
||||
}
|
||||
c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
|
||||
if (c2 == '{') { /* handle \u{...} form */
|
||||
const char *hexstr = s + 2;
|
||||
int hex;
|
||||
static const char* const close_brace = "}";
|
||||
long pos;
|
||||
|
||||
if (hexstr >= s_end) {
|
||||
rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
|
||||
}
|
||||
/* find close brace */
|
||||
pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
|
||||
if (pos < 0) {
|
||||
rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
|
||||
}
|
||||
hex = scan_hex(hexstr, pos, &hexlen);
|
||||
if (hexlen == 0 || hexlen > 6) {
|
||||
rb_raise(rb_eRuntimeError, "invalid Unicode escape");
|
||||
}
|
||||
if (hex > 0x10ffff) {
|
||||
rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
|
||||
}
|
||||
if ((hex & 0xfffff800) == 0xd800) {
|
||||
rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
|
||||
}
|
||||
codelen = rb_enc_codelen(hex, *penc);
|
||||
rb_enc_mbcput(hex, buf, *penc);
|
||||
rb_str_cat(undumped, buf, codelen);
|
||||
n += rb_strlen_lit("u{}") + hexlen;
|
||||
}
|
||||
else { /* handle \uXXXX form */
|
||||
int hex = scan_hex(s+1, 4, &hexlen);
|
||||
if (hexlen != 4) {
|
||||
rb_raise(rb_eRuntimeError, "invalid Unicode escape");
|
||||
}
|
||||
codelen = rb_enc_codelen(hex, *penc);
|
||||
rb_enc_mbcput(hex, buf, *penc);
|
||||
rb_str_cat(undumped, buf, codelen);
|
||||
n += rb_strlen_lit("uXXXX");
|
||||
}
|
||||
break;
|
||||
case 'x':
|
||||
if (s+1 >= s_end) {
|
||||
rb_raise(rb_eRuntimeError, "invalid hex escape");
|
||||
}
|
||||
c2 = scan_hex(s+1, 2, &hexlen);
|
||||
if (hexlen != 2) {
|
||||
rb_raise(rb_eRuntimeError, "invalid hex escape");
|
||||
}
|
||||
*buf = (char)c2;
|
||||
rb_str_cat(undumped, buf, 1L);
|
||||
n += rb_strlen_lit("xXX");
|
||||
break;
|
||||
default:
|
||||
rb_str_cat(undumped, "\\", 1L); /* keep backslash */
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static VALUE rb_str_is_ascii_only_p(VALUE str);
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
* str.undump -> new_str
|
||||
*
|
||||
* Produces unescaped version of +str+.
|
||||
* See also String#dump because String#undump does inverse of String#dump.
|
||||
*
|
||||
* "\"hello \\n ''\"".undump #=> "hello \n ''"
|
||||
*/
|
||||
|
||||
static VALUE
|
||||
str_undump(VALUE str)
|
||||
{
|
||||
const char *s = RSTRING_PTR(str);
|
||||
const char *s_end = RSTRING_END(str);
|
||||
long len = RSTRING_LEN(str);
|
||||
rb_encoding *enc = rb_enc_get(str), *forced_enc;
|
||||
int n;
|
||||
unsigned int c;
|
||||
enum undump_source_format source_format;
|
||||
VALUE undumped = rb_enc_str_new(s, 0L, enc);
|
||||
VALUE forced_enc_str;
|
||||
long forced_enc_str_len;
|
||||
int w;
|
||||
|
||||
rb_must_asciicompat(str);
|
||||
if (rb_str_is_ascii_only_p(str) == Qfalse) {
|
||||
rb_raise(rb_eRuntimeError, "non-ASCII character detected");
|
||||
}
|
||||
if (!str_null_check(str, &w)) {
|
||||
rb_raise(rb_eRuntimeError, "string contains null byte");
|
||||
}
|
||||
|
||||
source_format = check_undump_source_format(s, s_end, len, enc,
|
||||
&forced_enc_str, &forced_enc_str_len);
|
||||
if (source_format == UNDUMP_SOURCE_INVALID) {
|
||||
rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
|
||||
}
|
||||
if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
|
||||
forced_enc = rb_find_encoding(forced_enc_str);
|
||||
if (forced_enc == NULL) {
|
||||
rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_str);
|
||||
}
|
||||
}
|
||||
|
||||
/* strip '"' at the start */
|
||||
s++;
|
||||
if (source_format == UNDUMP_SOURCE_SIMPLE) {
|
||||
/* strip '"' at the end */
|
||||
s_end--;
|
||||
} else { /* source_format == UNDUMP_SOURCE_FORCE_ENCODING */
|
||||
/* strip '".force_encoding("...")' */
|
||||
s_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
|
||||
}
|
||||
|
||||
for (; s < s_end; s += n) {
|
||||
c = rb_enc_codepoint_len(s, s_end, &n, enc);
|
||||
if (c == '\\') {
|
||||
if (s+1 >= s_end) {
|
||||
rb_raise(rb_eRuntimeError, "invalid escape");
|
||||
}
|
||||
n = undump_after_backslash(undumped, s+1, s_end, &enc);
|
||||
}
|
||||
else if (c == '"') {
|
||||
rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
|
||||
}
|
||||
else {
|
||||
rb_str_cat(undumped, s, n);
|
||||
}
|
||||
}
|
||||
|
||||
if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
|
||||
rb_enc_associate(undumped, forced_enc);
|
||||
ENC_CODERANGE_CLEAR(undumped);
|
||||
}
|
||||
OBJ_INFECT(undumped, str);
|
||||
return undumped;
|
||||
}
|
||||
|
||||
static void
|
||||
rb_str_check_dummy_enc(rb_encoding *enc)
|
||||
|
@ -10586,6 +10852,7 @@ Init_String(void)
|
|||
rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
|
||||
rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
|
||||
rb_define_method(rb_cString, "dump", rb_str_dump, 0);
|
||||
rb_define_method(rb_cString, "undump", str_undump, 0);
|
||||
|
||||
sym_ascii = ID2SYM(rb_intern("ascii"));
|
||||
sym_turkic = ID2SYM(rb_intern("turkic"));
|
||||
|
|
|
@ -753,6 +753,41 @@ CODE
|
|||
assert_equal(S('"\\u{10ABCD}"'), b.dump)
|
||||
end
|
||||
|
||||
def test_undump
  # control characters written as \xXX and one-letter escapes
  with_controls = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
  assert_equal(with_controls, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)

  # dumped source => expected result
  {
    '"\\x7F"' => "\u{7F}",
    '"\\u00AB"' => "\u{AB}",
    '"\\u0ABC"' => "\u{ABC}",
    '"\\uABCD"' => "\uABCD",
    '"\\u{ABCDE}"' => "\u{ABCDE}",
    '"\\u{10ABCD}"' => "\u{10ABCD}",
    '"\u00E4\u00F6\u00FC"' => "äöü",
    '"\xC3\xA4\xC3\xB6\xC3\xBC"' => "äöü",
  }.each do |dumped, expected|
    assert_equal(S(expected), S(dumped).undump)
  end

  # a \u escape makes the result UTF-8 whatever the source encoding is
  assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)

  # the "...".force_encoding("...") form
  assert_equal("abc".encode(Encoding::UTF_16LE),
               '"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)

  # an escaped backslash followed by '#'
  assert_equal('\#', '"\\\\#"'.undump)
  assert_equal('\#{', '"\\\\\#{"'.undump)

  # sources that must be rejected
  [
    '\u3042',
    '"".force_encoding()',
    '"".force_encoding("UNKNOWN")',
    %("\u00E4"),
    '""""',
    '"\u"',
    '"\u{"',
    '"\u{3042"',
    '"\x"',
    '"\\"',
    %("\0"),
  ].each do |malformed|
    assert_raise(RuntimeError) { S(malformed).undump }
  end
end
|
||||
|
||||
def test_dup
|
||||
for taint in [ false, true ]
|
||||
for frozen in [ false, true ]
|
||||
|
|
Loading…
Reference in a new issue