# Source listing metadata: 578 lines, 23 KiB, Ruby
require 'uri'

module Hanami
  module Utils
    # HTML escape utilities
    #
    # Based on OWASP research and OWASP ESAPI code
    #
    # @since 0.4.0
    #
    # @see https://www.owasp.org
    # @see https://www.owasp.org/index.php/Cross-site_Scripting_%28XSS%29
    # @see https://www.owasp.org/index.php/XSS_(Cross_Site_Scripting)_Prevention_Cheat_Sheet
    # @see https://www.owasp.org/index.php/ESAPI
    # @see https://github.com/ESAPI/esapi-java-legacy
    module Escape
      # Hex base for base 10 integer conversion
      #
      # @since 0.4.0
      # @api private
      #
      # @see https://ruby-doc.org/core/Integer.html#method-i-to_s
      HEX_BASE = 16

      # Highest char code covered by the HEX_CODES lookup table
      #
      # @since 0.4.0
      # @api private
      LOW_HEX_CODE_LIMIT = 0xff

      # Replacement hex for non printable characters
      # (U+FFFD, the Unicode replacement character)
      #
      # @since 0.4.0
      # @api private
      REPLACEMENT_HEX = 'fffd'.freeze

      # Low hex codes lookup table
      #
      # Maps every char code in 0..255 to its hexadecimal representation.
      # ASCII digits and letters map to nil, which tells the caller that the
      # char doesn't need to be escaped.
      #
      # @since 0.4.0
      # @api private
      HEX_CODES = (0..255).each_with_object({}) do |code, table|
        alphanumeric = (0x30..0x39).cover?(code) || # 0-9
                       (0x41..0x5A).cover?(code) || # A-Z
                       (0x61..0x7A).cover?(code)    # a-z

        table[code] = alphanumeric ? nil : code.to_s(HEX_BASE)
      end.freeze

      # Non printable chars
      #
      # All the control codes in 0x00..0x1f except TAB (0x09), LF (0x0a) and
      # CR (0x0d), plus DEL and the C1 control codes (0x7f..0x9f).
      #
      # This is a Hash instead of a Set, to make lookup faster.
      #
      # @since 0.4.0
      # @api private
      #
      # @see https://gist.github.com/jodosha/ac5dd54416de744b9600
      NON_PRINTABLE_CHARS = [
        *(0x00..0x08), 0x0b, 0x0c, *(0x0e..0x1f), *(0x7f..0x9f)
      ].each_with_object({}) { |code, table| table[code] = true }.freeze

      # Lookup table for HTML escape
      #
      # @since 0.4.0
      # @api private
      #
      # @see Hanami::Utils::Escape.html
      HTML_CHARS = {
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
        "'" => '&apos;',
        '/' => '&#x2F;'
      }.freeze

      # Lookup table for safe chars for HTML attributes.
      #
      # This is a Hash instead of a Set, to make lookup faster.
      #
      # @since 0.4.0
      # @api private
      #
      # @see Hanami::Utils::Escape.html_attribute
      # @see https://gist.github.com/jodosha/ac5dd54416de744b9600
      HTML_ATTRIBUTE_SAFE_CHARS = {
        ',' => true, '.' => true, '-' => true, '_' => true
      }.freeze

      # Lookup table for HTML attribute escape
      #
      # Maps a char code to the name of its HTML entity.
      #
      # @since 0.4.0
      # @api private
      #
      # @see Hanami::Utils::Escape.html_attribute
      HTML_ENTITIES = {
        34 => 'quot', # quotation mark
        38 => 'amp', # ampersand
        60 => 'lt', # less-than sign
        62 => 'gt', # greater-than sign
        160 => 'nbsp', # no-break space
        161 => 'iexcl', # inverted exclamation mark
        162 => 'cent', # cent sign
        163 => 'pound', # pound sign
        164 => 'curren', # currency sign
        165 => 'yen', # yen sign
        166 => 'brvbar', # broken bar
        167 => 'sect', # section sign
        168 => 'uml', # diaeresis
        169 => 'copy', # copyright sign
        170 => 'ordf', # feminine ordinal indicator
        171 => 'laquo', # left-pointing double angle quotation mark
        172 => 'not', # not sign
        173 => 'shy', # soft hyphen
        174 => 'reg', # registered sign
        175 => 'macr', # macron
        176 => 'deg', # degree sign
        177 => 'plusmn', # plus-minus sign
        178 => 'sup2', # superscript two
        179 => 'sup3', # superscript three
        180 => 'acute', # acute accent
        181 => 'micro', # micro sign
        182 => 'para', # pilcrow sign
        183 => 'middot', # middle dot
        184 => 'cedil', # cedilla
        185 => 'sup1', # superscript one
        186 => 'ordm', # masculine ordinal indicator
        187 => 'raquo', # right-pointing double angle quotation mark
        188 => 'frac14', # vulgar fraction one quarter
        189 => 'frac12', # vulgar fraction one half
        190 => 'frac34', # vulgar fraction three quarters
        191 => 'iquest', # inverted question mark
        192 => 'Agrave', # Latin capital letter a with grave
        193 => 'Aacute', # Latin capital letter a with acute
        194 => 'Acirc', # Latin capital letter a with circumflex
        195 => 'Atilde', # Latin capital letter a with tilde
        196 => 'Auml', # Latin capital letter a with diaeresis
        197 => 'Aring', # Latin capital letter a with ring above
        198 => 'AElig', # Latin capital letter ae
        199 => 'Ccedil', # Latin capital letter c with cedilla
        200 => 'Egrave', # Latin capital letter e with grave
        201 => 'Eacute', # Latin capital letter e with acute
        202 => 'Ecirc', # Latin capital letter e with circumflex
        203 => 'Euml', # Latin capital letter e with diaeresis
        204 => 'Igrave', # Latin capital letter i with grave
        205 => 'Iacute', # Latin capital letter i with acute
        206 => 'Icirc', # Latin capital letter i with circumflex
        207 => 'Iuml', # Latin capital letter i with diaeresis
        208 => 'ETH', # Latin capital letter eth
        209 => 'Ntilde', # Latin capital letter n with tilde
        210 => 'Ograve', # Latin capital letter o with grave
        211 => 'Oacute', # Latin capital letter o with acute
        212 => 'Ocirc', # Latin capital letter o with circumflex
        213 => 'Otilde', # Latin capital letter o with tilde
        214 => 'Ouml', # Latin capital letter o with diaeresis
        215 => 'times', # multiplication sign
        216 => 'Oslash', # Latin capital letter o with stroke
        217 => 'Ugrave', # Latin capital letter u with grave
        218 => 'Uacute', # Latin capital letter u with acute
        219 => 'Ucirc', # Latin capital letter u with circumflex
        220 => 'Uuml', # Latin capital letter u with diaeresis
        221 => 'Yacute', # Latin capital letter y with acute
        222 => 'THORN', # Latin capital letter thorn
        223 => 'szlig', # Latin small letter sharp s, German Eszett
        224 => 'agrave', # Latin small letter a with grave
        225 => 'aacute', # Latin small letter a with acute
        226 => 'acirc', # Latin small letter a with circumflex
        227 => 'atilde', # Latin small letter a with tilde
        228 => 'auml', # Latin small letter a with diaeresis
        229 => 'aring', # Latin small letter a with ring above
        230 => 'aelig', # Latin lowercase ligature ae
        231 => 'ccedil', # Latin small letter c with cedilla
        232 => 'egrave', # Latin small letter e with grave
        233 => 'eacute', # Latin small letter e with acute
        234 => 'ecirc', # Latin small letter e with circumflex
        235 => 'euml', # Latin small letter e with diaeresis
        236 => 'igrave', # Latin small letter i with grave
        237 => 'iacute', # Latin small letter i with acute
        238 => 'icirc', # Latin small letter i with circumflex
        239 => 'iuml', # Latin small letter i with diaeresis
        240 => 'eth', # Latin small letter eth
        241 => 'ntilde', # Latin small letter n with tilde
        242 => 'ograve', # Latin small letter o with grave
        243 => 'oacute', # Latin small letter o with acute
        244 => 'ocirc', # Latin small letter o with circumflex
        245 => 'otilde', # Latin small letter o with tilde
        246 => 'ouml', # Latin small letter o with diaeresis
        247 => 'divide', # division sign
        248 => 'oslash', # Latin small letter o with stroke
        249 => 'ugrave', # Latin small letter u with grave
        250 => 'uacute', # Latin small letter u with acute
        251 => 'ucirc', # Latin small letter u with circumflex
        252 => 'uuml', # Latin small letter u with diaeresis
        253 => 'yacute', # Latin small letter y with acute
        254 => 'thorn', # Latin small letter thorn
        255 => 'yuml', # Latin small letter y with diaeresis
        338 => 'OElig', # Latin capital ligature oe
        339 => 'oelig', # Latin small ligature oe
        352 => 'Scaron', # Latin capital letter s with caron
        353 => 'scaron', # Latin small letter s with caron
        376 => 'Yuml', # Latin capital letter y with diaeresis
        402 => 'fnof', # Latin small letter f with hook
        710 => 'circ', # modifier letter circumflex accent
        732 => 'tilde', # small tilde
        913 => 'Alpha', # Greek capital letter alpha
        914 => 'Beta', # Greek capital letter beta
        915 => 'Gamma', # Greek capital letter gamma
        916 => 'Delta', # Greek capital letter delta
        917 => 'Epsilon', # Greek capital letter epsilon
        918 => 'Zeta', # Greek capital letter zeta
        919 => 'Eta', # Greek capital letter eta
        920 => 'Theta', # Greek capital letter theta
        921 => 'Iota', # Greek capital letter iota
        922 => 'Kappa', # Greek capital letter kappa
        923 => 'Lambda', # Greek capital letter lambda
        924 => 'Mu', # Greek capital letter mu
        925 => 'Nu', # Greek capital letter nu
        926 => 'Xi', # Greek capital letter xi
        927 => 'Omicron', # Greek capital letter omicron
        928 => 'Pi', # Greek capital letter pi
        929 => 'Rho', # Greek capital letter rho
        931 => 'Sigma', # Greek capital letter sigma
        932 => 'Tau', # Greek capital letter tau
        933 => 'Upsilon', # Greek capital letter upsilon
        934 => 'Phi', # Greek capital letter phi
        935 => 'Chi', # Greek capital letter chi
        936 => 'Psi', # Greek capital letter psi
        937 => 'Omega', # Greek capital letter omega
        945 => 'alpha', # Greek small letter alpha
        946 => 'beta', # Greek small letter beta
        947 => 'gamma', # Greek small letter gamma
        948 => 'delta', # Greek small letter delta
        949 => 'epsilon', # Greek small letter epsilon
        950 => 'zeta', # Greek small letter zeta
        951 => 'eta', # Greek small letter eta
        952 => 'theta', # Greek small letter theta
        953 => 'iota', # Greek small letter iota
        954 => 'kappa', # Greek small letter kappa
        955 => 'lambda', # Greek small letter lambda
        956 => 'mu', # Greek small letter mu
        957 => 'nu', # Greek small letter nu
        958 => 'xi', # Greek small letter xi
        959 => 'omicron', # Greek small letter omicron
        960 => 'pi', # Greek small letter pi
        961 => 'rho', # Greek small letter rho
        962 => 'sigmaf', # Greek small letter final sigma
        963 => 'sigma', # Greek small letter sigma
        964 => 'tau', # Greek small letter tau
        965 => 'upsilon', # Greek small letter upsilon
        966 => 'phi', # Greek small letter phi
        967 => 'chi', # Greek small letter chi
        968 => 'psi', # Greek small letter psi
        969 => 'omega', # Greek small letter omega
        977 => 'thetasym', # Greek theta symbol
        978 => 'upsih', # Greek upsilon with hook symbol
        982 => 'piv', # Greek pi symbol
        8194 => 'ensp', # en space
        8195 => 'emsp', # em space
        8201 => 'thinsp', # thin space
        8204 => 'zwnj', # zero width non-joiner
        8205 => 'zwj', # zero width joiner
        8206 => 'lrm', # left-to-right mark
        8207 => 'rlm', # right-to-left mark
        8211 => 'ndash', # en dash
        8212 => 'mdash', # em dash
        8216 => 'lsquo', # left single quotation mark
        8217 => 'rsquo', # right single quotation mark
        8218 => 'sbquo', # single low-9 quotation mark
        8220 => 'ldquo', # left double quotation mark
        8221 => 'rdquo', # right double quotation mark
        8222 => 'bdquo', # double low-9 quotation mark
        8224 => 'dagger', # dagger
        8225 => 'Dagger', # double dagger
        8226 => 'bull', # bullet
        8230 => 'hellip', # horizontal ellipsis
        8240 => 'permil', # per mille sign
        8242 => 'prime', # prime
        8243 => 'Prime', # double prime
        8249 => 'lsaquo', # single left-pointing angle quotation mark
        8250 => 'rsaquo', # single right-pointing angle quotation mark
        8254 => 'oline', # overline
        8260 => 'frasl', # fraction slash
        8364 => 'euro', # euro sign
        8465 => 'image', # black-letter capital i
        8472 => 'weierp', # script capital p, Weierstrass p
        8476 => 'real', # black-letter capital r
        8482 => 'trade', # trademark sign
        8501 => 'alefsym', # alef symbol
        8592 => 'larr', # leftwards arrow
        8593 => 'uarr', # upwards arrow
        8594 => 'rarr', # rightwards arrow
        8595 => 'darr', # downwards arrow
        8596 => 'harr', # left right arrow
        8629 => 'crarr', # downwards arrow with corner leftwards
        8656 => 'lArr', # leftwards double arrow
        8657 => 'uArr', # upwards double arrow
        8658 => 'rArr', # rightwards double arrow
        8659 => 'dArr', # downwards double arrow
        8660 => 'hArr', # left right double arrow
        8704 => 'forall', # for all
        8706 => 'part', # partial differential
        8707 => 'exist', # there exists
        8709 => 'empty', # empty set
        8711 => 'nabla', # nabla
        8712 => 'isin', # element of
        8713 => 'notin', # not an element of
        8715 => 'ni', # contains as member
        8719 => 'prod', # n-ary product
        8721 => 'sum', # n-ary summation
        8722 => 'minus', # minus sign
        8727 => 'lowast', # asterisk operator
        8730 => 'radic', # square root
        8733 => 'prop', # proportional to
        8734 => 'infin', # infinity
        8736 => 'ang', # angle
        8743 => 'and', # logical and
        8744 => 'or', # logical or
        8745 => 'cap', # intersection
        8746 => 'cup', # union
        8747 => 'int', # integral
        8756 => 'there4', # therefore
        8764 => 'sim', # tilde operator
        8773 => 'cong', # congruent to
        8776 => 'asymp', # almost equal to
        8800 => 'ne', # not equal to
        8801 => 'equiv', # identical to, equivalent to
        8804 => 'le', # less-than or equal to
        8805 => 'ge', # greater-than or equal to
        8834 => 'sub', # subset of
        8835 => 'sup', # superset of
        8836 => 'nsub', # not a subset of
        8838 => 'sube', # subset of or equal to
        8839 => 'supe', # superset of or equal to
        8853 => 'oplus', # circled plus
        8855 => 'otimes', # circled times
        8869 => 'perp', # up tack
        8901 => 'sdot', # dot operator
        8968 => 'lceil', # left ceiling
        8969 => 'rceil', # right ceiling
        8970 => 'lfloor', # left floor
        8971 => 'rfloor', # right floor
        9001 => 'lang', # left-pointing angle bracket
        9002 => 'rang', # right-pointing angle bracket
        9674 => 'loz', # lozenge
        9824 => 'spades', # black spade suit
        9827 => 'clubs', # black club suit
        9829 => 'hearts', # black heart suit
        9830 => 'diams' # black diamond suit
      }.freeze

      # Allowed URL schemes
      #
      # @since 0.4.0
      # @api private
      #
      # @see Hanami::Utils::Escape.url
      DEFAULT_URL_SCHEMES = %w[http https mailto].freeze

      # The output of an escape.
      #
      # It's marked with this special class for two reasons:
      #
      #   * Don't double escape the same string (this is for `Hanami::Helpers` compatibility)
      #   * Leave open the possibility to developers to mark a string as safe with a higher API
      #     (eg. `#raw` in `Hanami::View` or `Hanami::Helpers`)
      #
      # @since 0.4.0
      # @api private
      class SafeString < ::String
        # Returns a copy of the receiver that is still a SafeString, so the
        # "safe" mark is preserved by `to_s`.
        #
        # @return [SafeString] the duped string
        #
        # @since 0.4.0
        # @api private
        #
        # @see http://www.ruby-doc.org/core/String.html#method-i-to_s
        def to_s
          dup
        end

        # Encode the string with the given encoding
        #
        # The result of `String#encode` is wrapped again, so that the
        # encoded string is still a SafeString.
        #
        # @return [SafeString] an encoded SafeString
        #
        # @since 0.4.0
        # @api private
        #
        # @see http://www.ruby-doc.org/core/String.html#method-i-encode
        def encode(*args)
          self.class.new(super)
        end
      end

      # Escape HTML contents
      #
      # This MUST be used only for tag contents.
      # Please use `html_attribute` for escaping HTML attributes.
      #
      # An input that is already a SafeString is returned without being
      # escaped again.
      #
      # @param input [String] the input
      #
      # @return [String] the escaped string
      #
      # @since 0.4.0
      #
      # @see https://www.owasp.org/index.php/XSS_%28Cross_Site_Scripting%29_Prevention_Cheat_Sheet OWASP XSS Cheat Sheet Rule #1
      #
      # @example Good practice
      #   <div><%= Hanami::Utils::Escape.html('<script>alert(1);</script>') %></div>
      #   <div>&lt;script&gt;alert(1);&lt;&#x2F;script&gt;</div>
      #
      # @example Bad practice
      #   # WRONG Use Escape.html_attribute
      #   <a title="<%= Hanami::Utils::Escape.html('...') %>">link</a>
      def self.html(input)
        input = encode(input)
        return input if input.is_a?(SafeString)

        # A bare String.new is tagged as binary: make the buffer UTF-8 upfront
        result = SafeString.new.force_encoding(Encoding::UTF_8)

        input.each_char do |chr|
          result << HTML_CHARS.fetch(chr, chr)
        end

        result
      end

      # Escape HTML attributes
      #
      # This can be used both for HTML attributes and contents.
      # Please note that this is more computationally expensive.
      # If you need to escape only HTML contents, please use `.html`.
      #
      # An input that is already a SafeString is returned without being
      # escaped again.
      #
      # @param input [String] the input
      #
      # @return [String] the escaped string
      #
      # @since 0.4.0
      #
      # @see https://www.owasp.org/index.php/XSS_%28Cross_Site_Scripting%29_Prevention_Cheat_Sheet OWASP XSS Cheat Sheet Rule #2
      #
      # @example Good practice
      #   <a title="<%= Hanami::Utils::Escape.html_attribute('...') %>">link</a>
      #
      # @example Good but expensive practice
      #   # Alternatively you can use Escape.html
      #   <p><%= Hanami::Utils::Escape.html_attribute('...') %></p>
      def self.html_attribute(input)
        input = encode(input)
        return input if input.is_a?(SafeString)

        # A bare String.new is tagged as binary: make the buffer UTF-8 upfront
        result = SafeString.new.force_encoding(Encoding::UTF_8)

        input.each_char do |chr|
          result << encode_char(chr, HTML_ATTRIBUTE_SAFE_CHARS)
        end

        result
      end

      # Escape URL for HTML attributes (href, src, etc..).
      #
      # It extracts from the given input the first valid URL that matches the
      # whitelisted schemes (default: http, https and mailto).
      #
      # It's possible to pass a second optional argument to specify different
      # schemes.
      #
      # @param input [String] the input
      # @param schemes [Array<String>] an array of whitelisted schemes
      #
      # @return [String] the escaped string
      #
      # @since 0.4.0
      #
      # @see Hanami::Utils::Escape::DEFAULT_URL_SCHEMES
      # @see https://ruby-doc.org/stdlib/libdoc/uri/rdoc/URI/RFC2396_Parser.html#method-i-extract
      #
      # @example Basic usage
      #   <%
      #     good_input = "http://hanamirb.org"
      #     evil_input = "javascript:alert('xss')"
      #
      #     escaped_good_input = Hanami::Utils::Escape.url(good_input) # => "http://hanamirb.org"
      #     escaped_evil_input = Hanami::Utils::Escape.url(evil_input) # => ""
      #   %>
      #
      #   <a href="<%= escaped_good_input %>">personal website</a>
      #   <a href="<%= escaped_evil_input %>">personal website</a>
      #
      # @example Custom scheme
      #   <%
      #     schemes = ['ftp', 'ftps']
      #
      #     accepted = "ftps://ftp.example.org"
      #     rejected = "http://www.example.org"
      #
      #     escaped_accepted = Hanami::Utils::Escape.url(accepted, schemes) # => "ftps://ftp.example.org"
      #     escaped_rejected = Hanami::Utils::Escape.url(rejected, schemes) # => ""
      #   %>
      #
      #   <a href="<%= escaped_accepted %>">FTP</a>
      #   <a href="<%= escaped_rejected %>">FTP</a>
      def self.url(input, schemes = DEFAULT_URL_SCHEMES)
        input = encode(input)
        return input if input.is_a?(SafeString)

        # URI.decode was removed in Ruby 3.0 and URI.extract is obsolete:
        # talk directly to the RFC 2396 parser, which implements both.
        parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

        SafeString.new(
          parser.extract(parser.unescape(input), schemes).first.to_s
        )
      end

      # The helpers below are private by convention (`@api private`).
      # They are defined with `def self.`, which a bare `private` keyword
      # doesn't affect, so they are left publicly callable in order to not
      # break existing callers.

      # Encode the given input into UTF-8
      #
      # The input is coerced with `to_s`, nil becomes an empty string.
      # When the input can't be converted to UTF-8, a copy of its bytes is
      # tagged as UTF-8 instead: the given object is never mutated.
      #
      # @param input [String,#to_s,nil] the input
      #
      # @return [String] an UTF-8 encoded string
      #
      # @since 0.4.0
      # @api private
      def self.encode(input)
        return '' if input.nil?

        input.to_s.encode(Encoding::UTF_8)
      rescue Encoding::UndefinedConversionError
        input.to_s.dup.force_encoding(Encoding::UTF_8)
      end

      # Encode the given UTF-8 char.
      #
      # Safe chars and ASCII alphanumerics are returned untouched.
      # The other chars are turned into a named entity when there is one in
      # HTML_ENTITIES, otherwise into a hexadecimal numeric reference.
      # Non printable chars are replaced by REPLACEMENT_HEX.
      #
      # @param char [String] an UTF-8 char
      # @param safe_chars [Hash] a table of safe chars
      #
      # @return [String] an HTML encoded string
      #
      # @since 0.4.0
      # @api private
      def self.encode_char(char, safe_chars = {})
        return char if safe_chars[char]

        code = char.ord
        hex  = hex_for_non_alphanumeric_code(code)
        return char if hex.nil? # alphanumeric: nothing to escape

        hex = REPLACEMENT_HEX if NON_PRINTABLE_CHARS[code]

        entity = HTML_ENTITIES[code]
        entity ? "&#{entity};" : "&#x#{hex};"
      end

      # Transforms the given char code into its hexadecimal representation
      #
      # @param input [Integer] a char code
      #
      # @return [String,nil] the hex code, nil for ASCII alphanumeric codes
      #
      # @since 0.4.0
      # @api private
      def self.hex_for_non_alphanumeric_code(input)
        # HEX_CODES covers 0..LOW_HEX_CODE_LIMIT, both ends included
        if input <= LOW_HEX_CODE_LIMIT
          HEX_CODES[input]
        else
          input.to_s(HEX_BASE)
        end
      end
    end
  end
end
|