1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Optimize CGI.escapeHTML by reducing buffer extension

and switch-case branches.

Buffer allocation optimization using `ALLOCA_N` would be the main
benefit of patch. It eliminates the O(N) buffer extensions.

It also reduces the number of branches using escape table like
https://mattn.kaoriya.net/software/lang/c/20160817011915.htm.

Closes: https://github.com/ruby/ruby/pull/2226

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
Co-authored-by: Yasuhiro MATSUMOTO <mattn.jp@gmail.com>
This commit is contained in:
Takashi Kokubun 2019-06-05 19:28:51 +09:00
parent f3c877e8de
commit 0a29dc87e6
No known key found for this signature in database
GPG key ID: 6FFC433B12EE23DD
2 changed files with 75 additions and 48 deletions

View file

@ -0,0 +1,40 @@
prelude: require 'cgi/escape'
benchmark:
- name: escape_html_blank
prelude: str = ""
script: CGI.escapeHTML(str)
loop_count: 20000000
- name: escape_html_short_none
prelude: str = "abcde"
script: CGI.escapeHTML(str)
loop_count: 20000000
- name: escape_html_short_one
prelude: str = "abcd<"
script: CGI.escapeHTML(str)
loop_count: 20000000
- name: escape_html_short_all
prelude: str = "'&\"<>"
script: CGI.escapeHTML(str)
loop_count: 5000000
- name: escape_html_long_none
prelude: str = "abcde" * 300
script: CGI.escapeHTML(str)
loop_count: 1000000
- name: escape_html_long_all
prelude: str = "'&\"<>" * 10
script: CGI.escapeHTML(str)
loop_count: 1000000
- name: escape_html_real
prelude: | # http://example.com/
str = <<~HTML
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is established to be used for illustrative examples in documents. You may use this
domain in examples without prior coordination or asking for permission.</p>
<p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
HTML
script: CGI.escapeHTML(str)
loop_count: 1000000

View file

@ -11,27 +11,20 @@ RUBY_EXTERN const signed char ruby_digit36_to_number_table[];
static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
static ID id_accept_charset;
static void
html_escaped_cat(VALUE str, char c)
{
switch (c) {
case '\'':
rb_str_cat_cstr(str, "&#39;");
break;
case '&':
rb_str_cat_cstr(str, "&amp;");
break;
case '"':
rb_str_cat_cstr(str, "&quot;");
break;
case '<':
rb_str_cat_cstr(str, "&lt;");
break;
case '>':
rb_str_cat_cstr(str, "&gt;");
break;
}
}
#define HTML_ESCAPE_MAX_LEN 6
static const struct {
uint8_t len;
char str[HTML_ESCAPE_MAX_LEN+1];
} html_escape_table[UCHAR_MAX+1] = {
#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
HTML_ESCAPE('\'', "&#39;"),
HTML_ESCAPE('&', "&amp;"),
HTML_ESCAPE('"', "&quot;"),
HTML_ESCAPE('<', "&lt;"),
HTML_ESCAPE('>', "&gt;"),
#undef HTML_ESCAPE
};
static inline void
preserve_original_state(VALUE orig, VALUE dest)
@ -44,40 +37,34 @@ preserve_original_state(VALUE orig, VALUE dest)
static VALUE
optimized_escape_html(VALUE str)
{
long i, len, beg = 0;
VALUE dest = 0;
const char *cstr;
VALUE vbuf;
char *buf = ALLOCV_N(char, vbuf, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN);
const char *cstr = RSTRING_PTR(str);
const char *end = cstr + RSTRING_LEN(str);
len = RSTRING_LEN(str);
cstr = RSTRING_PTR(str);
for (i = 0; i < len; i++) {
switch (cstr[i]) {
case '\'':
case '&':
case '"':
case '<':
case '>':
if (!dest) {
dest = rb_str_buf_new(len);
}
rb_str_cat(dest, cstr + beg, i - beg);
beg = i + 1;
html_escaped_cat(dest, cstr[i]);
break;
char *dest = buf;
while (cstr < end) {
const unsigned char c = *cstr++;
uint8_t len = html_escape_table[c].len;
if (len) {
memcpy(dest, html_escape_table[c].str, len);
dest += len;
}
else {
*dest++ = c;
}
}
if (dest) {
rb_str_cat(dest, cstr + beg, len - beg);
preserve_original_state(str, dest);
return dest;
VALUE escaped;
if (RSTRING_LEN(str) < (dest - buf)) {
escaped = rb_str_new(buf, dest - buf);
preserve_original_state(str, escaped);
}
else {
return rb_str_dup(str);
escaped = rb_str_dup(str);
}
ALLOCV_END(vbuf);
return escaped;
}
static VALUE