module Doc
# Regular expressions (regexps) are patterns which describe the
# contents of a string. They're used for testing whether a string contains a
# given pattern, or extracting the portions that match. They are created
# with the /pat/ and
# %r{pat} literals or the Regexp.new
# constructor.
#
# A regexp is usually delimited with forward slashes (/). For
# example:
#
# /hay/ =~ 'haystack' #=> 0
# /y/.match('haystack') #=> #
#
# If a string contains the pattern it is said to match. A literal
# string matches itself.
#
# # 'haystack' does not contain the pattern 'needle', so doesn't match.
# /needle/.match('haystack') #=> nil
# # 'haystack' does contain the pattern 'hay', so it matches
# /hay/.match('haystack') #=> #
#
# Specifically, /st/ requires that the string contains the letter
# _s_ followed by the letter _t_, so it matches _haystack_, also.
#
# == Metacharacters and Escapes
#
# The following are metacharacters (, ),
# [, ], {, }, ., ?,
# +, *. They have a specific meaning when appearing in a
# pattern. To match them literally they must be backslash-escaped. To match
# a backslash literally backslash-escape that: \\\\\\.
#
# /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> #
#
# Patterns behave like double-quoted strings so can contain the same
# backslash escapes.
#
# /\s\u{6771 4eac 90fd}/.match("Go to 東京都")
# #=> #
#
# Arbitrary Ruby expressions can be embedded into patterns with the
# #{...} construct.
#
# place = "東京都"
# /#{place}/.match("Go to 東京都")
# #=> #
#
# == Character Classes
#
# A character class is delimited with square brackets ([,
# ]) and lists characters that may appear at that point in the
# match. /[ab]/ means _a_ or _b_, as opposed to /ab/ which
# means _a_ followed by _b_.
#
# /W[aeiou]rd/.match("Word") #=> #
#
# Within a character class the hyphen (-) is a metacharacter
# denoting an inclusive range of characters. [abcd] is equivalent
# to [a-d]. A range can be followed by another range, so
# [abcdwxyz] is equivalent to [a-dw-z]. The order in which
# ranges or individual characters appear inside a character class is
# irrelevant.
#
# /[0-9a-f]/.match('9f') #=> #
# /[9f]/.match('9f') #=> #
#
# If the first character of a character class is a caret (^) the
# class is inverted: it matches any character _except_ those named.
#
# /[^a-eg-z]/.match('f') #=> #
#
# A character class may contain another character class. By itself this
# isn't useful because [a-z[0-9]] describes the same set as
# [a-z0-9]. However, character classes also support the &&
# operator which performs set intersection on its arguments. The two can be
# combined as follows:
#
# /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z))
# # This is equivalent to:
# /[abh-w]/
#
# The following metacharacters also behave like character classes:
#
# * /./ - Any character except a newline.
# * /./m - Any character (the +m+ modifier enables multiline mode)
# * /\w/ - A word character ([a-zA-Z0-9_])
# * /\W/ - A non-word character ([^a-zA-Z0-9_])
# * /\d/ - A digit character ([0-9])
# * /\D/ - A non-digit character ([^0-9])
# * /\h/ - A hexdigit character ([0-9a-fA-F])
# * /\H/ - A non-hexdigit character ([^0-9a-fA-F])
# * /\s/ - A whitespace character: /[ \t\r\n\f]/
# * /\S/ - A non-whitespace character: /[^ \t\r\n\f]/
#
# POSIX bracket expressions are also similar to character classes.
# They provide a portable alternative to the above, with the added benefit
# that they encompass non-ASCII characters. For instance, /\d/
# matches only the ASCII decimal digits (0-9); whereas /[[:digit:]]/
# matches any character in the Unicode _Nd_ category.
#
# * /[[:alnum:]]/ - Alphabetic and numeric character
# * /[[:alpha:]]/ - Alphabetic character
# * /[[:blank:]]/ - Space or tab
# * /[[:cntrl:]]/ - Control character
# * /[[:digit:]]/ - Digit
# * /[[:graph:]]/ - Non-blank character (excludes spaces, control
# characters, and similar)
# * /[[:lower:]]/ - Lowercase alphabetical character
# * /[[:print:]]/ - Like [:graph:], but includes the space character
# * /[[:punct:]]/ - Punctuation character
# * /[[:space:]]/ - Whitespace character ([:blank:], newline,
# carriage return, etc.)
# * /[[:upper:]]/ - Uppercase alphabetical
# * /[[:xdigit:]]/ - Digit allowed in a hexadecimal number (i.e.,
# 0-9a-fA-F)
#
# Ruby also supports the following non-POSIX character classes:
#
# * /[[:word:]]/ - A character in one of the following Unicode
# general categories _Letter_, _Mark_, _Number_,
# Connector_Punctuation
# * /[[:ascii:]]/ - A character in the ASCII character set
#
# # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO"
# /[[:digit:]]/.match("\u06F2") #=> #
# /[[:upper:]][[:lower:]]/.match("Hello") #=> #
# /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> #
#
# == Repetition
#
# The constructs described so far match a single character. They can be
# followed by a repetition metacharacter to specify how many times they need
# to occur. Such metacharacters are called quantifiers.
#
# * * - Zero or more times
# * + - One or more times
# * ? - Zero or one times (optional)
# * {n} - Exactly n times
# * {n,} - n or more times
# * {,m} - m or less times
# * {n,m} - At least n and
# at most m times
#
# # At least one uppercase character ('H'), at least one lowercase
# # character ('e'), two 'l' characters, then one 'o'
# "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> #
#
# Repetition is greedy by default: as many occurrences as possible
# are matched while still allowing the overall match to succeed. By
# contrast, lazy matching makes the minimal amount of matches
# necessary for overall success. A greedy metacharacter can be made lazy by
# following it with ?.
#
# # Both patterns below match the string. The fist uses a greedy
# # quantifier so '.+' matches ''; the second uses a lazy
# # quantifier so '.+?' matches ''.
# /<.+>/.match("") #=> #">
# /<.+?>/.match("") #=> #">
#
# A quantifier followed by + matches possessively: once it
# has matched it does not backtrack. They behave like greedy quantifiers,
# but having matched they refuse to "give up" their match even if this
# jeopardises the overall match.
#
# == Capturing
#
# Parentheses can be used for capturing. The text enclosed by the
# nth group of parentheses can be subsequently referred to
# with n. Within a pattern use the backreference
# \n; outside of the pattern use
# MatchData[n].
#
# # 'at' is captured by the first group of parentheses, then referred to
# # later with \1
# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")
# #=> #
# # Regexp#match returns a MatchData object which makes the captured
# # text available with its #[] method.
# /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at'
#
# Capture groups can be referred to by name when defined with the
# (?<name>) or (?'name')
# constructs.
#
# /\$(?\d+)\.(?\d+)/.match("$3.67")
# => #
# /\$(?\d+)\.(?\d+)/.match("$3.67")[:dollars] #=> "3"
#
# Named groups can be backreferenced with \k<name>,
# where _name_ is the group name.
#
# /(?[aeiou]).\k.\k/.match('ototomy')
# #=> #
#
# *Note*: A regexp can't use named backreferences and numbered
# backreferences simultaneously.
#
# When named capture groups are used with a literal regexp on the left-hand
# side of an expression and the =~ operator, the captured text is
# also assigned to local variables with corresponding names.
#
# /\$(?\d+)\.(?\d+)/ =~ "$3.67" #=> 0
# dollars #=> "3"
#
# == Grouping
#
# Parentheses also group the terms they enclose, allowing them to be
# quantified as one atomic whole.
#
# # The pattern below matches a vowel followed by 2 word characters:
# # 'aen'
# /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> #
# # Whereas the following pattern matches a vowel followed by a word
# # character, twice, i.e. [aeiou]\w[aeiou]\w: 'enor'.
# /([aeiou]\w){2}/.match("Caenorhabditis elegans")
# #=> #
#
# The (?:...) construct provides grouping without
# capturing. That is, it combines the terms it contains into an atomic whole
# without creating a backreference. This benefits performance at the slight
# expense of readabilty.
#
# # The group of parentheses captures 'n' and the second 'ti'. The
# # second group is referred to later with the backreference \2
# /I(n)ves(ti)ga\2ons/.match("Investigations")
# #=> #
# # The first group of parentheses is now made non-capturing with '?:',
# # so it still matches 'n', but doesn't create the backreference. Thus,
# # the backreference \1 now refers to 'ti'.
# /I(?:n)ves(ti)ga\1ons/.match("Investigations")
# #=> #
#
# === Atomic Grouping
#
# Grouping can be made atomic with
# (?>pat). This causes the subexpression pat
# to be matched independently of the rest of the expression such that what
# it matches becomes fixed for the remainder of the match, unless the entire
# subexpression must be abandoned and subsequently revisited. In this
# way pat is treated as a non-divisible whole. Atomic grouping is
# typically used to optimise patterns so as to prevent the regular
# expression engine from backtracking needlesly.
#
# # The " in the pattern below matches the first character of
# # the string, then .* matches Quote". This causes the
# # overall match to fail, so the text matched by .* is
# # backtracked by one position, which leaves the final character of the
# # string available to match "
# /".*"/.match('"Quote"') #=> #
# # If .* is grouped atomically, it refuses to backtrack
# # Quote", even though this means that the overall match fails
# /"(?>.*)"/.match('"Quote"') #=> nil
#
# == Subexpression Calls
#
# The \g<name> syntax matches the previous
# subexpression named _name_, which can be a group name or number, again.
# This differs from backreferences in that it re-executes the group rather
# than simply trying to re-match the same text.
#
# # Matches a ( character and assigns it to the paren
# # group, tries to call that the paren sub-expression again
# # but fails, then matches a literal ).
# /\A(?\(\g*\))*\z/ =~ '()'
#
#
# /\A(?\(\g*\))*\z/ =~ '(())' #=> 0
# # ^1
# # ^2
# # ^3
# # ^4
# # ^5
# # ^6
# # ^7
# # ^8
# # ^9
# # ^10
#
# 1. Matches at the beginning of the string, i.e. before the first
# character.
# 2. Enters a named capture group called paren
# 3. Matches a literal (, the first character in the string
# 4. Calls the paren group again, i.e. recurses back to the
# second step
# 5. Re-enters the paren group
# 6. Matches a literal (, the second character in the
# string
# 7. Try to call paren a third time, but fail because
# doing so would prevent an overall successful match
# 8. Match a literal ), the third character in the string.
# Marks the end of the second recursive call
# 9. Match a literal ), the fourth character in the string
# 10. Match the end of the string
#
# == Alternation
#
# The vertical bar metacharacter (|) combines two expressions into
# a single one that matches either of the expressions. Each expression is an
# alternative.
#
# /\w(and|or)\w/.match("Feliformia") #=> #
# /\w(and|or)\w/.match("furandi") #=> #
# /\w(and|or)\w/.match("dissemblance") #=> nil
#
# == Character Properties
#
# The \p{} construct matches characters with the named property,
# much like POSIX bracket classes.
#
# * /\p{Alnum}/ - Alphabetic and numeric character
# * /\p{Alpha}/ - Alphabetic character
# * /\p{Blank}/ - Space or tab
# * /\p{Cntrl}/ - Control character
# * /\p{Digit}/ - Digit
# * /\p{Graph}/ - Non-blank character (excludes spaces, control
# characters, and similar)
# * /\p{Lower}/ - Lowercase alphabetical character
# * /\p{Print}/ - Like \p{Graph}, but includes the space character
# * /\p{Punct}/ - Punctuation character
# * /\p{Space}/ - Whitespace character ([:blank:], newline,
# carriage return, etc.)
# * /\p{Upper}/ - Uppercase alphabetical
# * /\p{XDigit}/ - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F)
# * /\p{Word}/ - A member of one of the following Unicode general
# category Letter, Mark, Number,
# Connector\_Punctuation
# * /\p{ASCII}/ - A character in the ASCII character set
# * /\p{Any}/ - Any Unicode character (including unassigned
# characters)
# * /\p{Assigned}/ - An assigned character
#
# A Unicode character's General Category value can also be matched
# with \p{Ab} where Ab is the category's
# abbreviation as described below:
#
# * /\p{L}/ - 'Letter'
# * /\p{Ll}/ - 'Letter: Lowercase'
# * /\p{Lm}/ - 'Letter: Mark'
# * /\p{Lo}/ - 'Letter: Other'
# * /\p{Lt}/ - 'Letter: Titlecase'
# * /\p{Lu}/ - 'Letter: Uppercase
# * /\p{Lo}/ - 'Letter: Other'
# * /\p{M}/ - 'Mark'
# * /\p{Mn}/ - 'Mark: Nonspacing'
# * /\p{Mc}/ - 'Mark: Spacing Combining'
# * /\p{Me}/ - 'Mark: Enclosing'
# * /\p{N}/ - 'Number'
# * /\p{Nd}/ - 'Number: Decimal Digit'
# * /\p{Nl}/ - 'Number: Letter'
# * /\p{No}/ - 'Number: Other'
# * /\p{P}/ - 'Punctuation'
# * /\p{Pc}/ - 'Punctuation: Connector'
# * /\p{Pd}/ - 'Punctuation: Dash'
# * /\p{Ps}/ - 'Punctuation: Open'
# * /\p{Pe}/ - 'Punctuation: Close'
# * /\p{Pi}/ - 'Punctuation: Initial Quote'
# * /\p{Pf}/ - 'Punctuation: Final Quote'
# * /\p{Po}/ - 'Punctuation: Other'
# * /\p{S}/ - 'Symbol'
# * /\p{Sm}/ - 'Symbol: Math'
# * /\p{Sc}/ - 'Symbol: Currency'
# * /\p{Sc}/ - 'Symbol: Currency'
# * /\p{Sk}/ - 'Symbol: Modifier'
# * /\p{So}/ - 'Symbol: Other'
# * /\p{Z}/ - 'Separator'
# * /\p{Zs}/ - 'Separator: Space'
# * /\p{Zl}/ - 'Separator: Line'
# * /\p{Zp}/ - 'Separator: Paragraph'
# * /\p{C}/ - 'Other'
# * /\p{Cc}/ - 'Other: Control'
# * /\p{Cf}/ - 'Other: Format'
# * /\p{Cn}/ - 'Other: Not Assigned'
# * /\p{Co}/ - 'Other: Private Use'
# * /\p{Cs}/ - 'Other: Surrogate'
#
# Lastly, \p{} matches a character's Unicode script. The
# following scripts are supported: Arabic, Armenian,
# Balinese, Bengali, Bopomofo, Braille,
# Buginese, Buhid, Canadian_Aboriginal, Carian,
# Cham, Cherokee, Common, Coptic,
# Cuneiform, Cypriot, Cyrillic, Deseret,
# Devanagari, Ethiopic, Georgian, Glagolitic,
# Gothic, Greek, Gujarati, Gurmukhi, Han,
# Hangul, Hanunoo, Hebrew, Hiragana,
# Inherited, Kannada, Katakana, Kayah_Li,
# Kharoshthi, Khmer, Lao, Latin, Lepcha,
# Limbu, Linear_B, Lycian, Lydian,
# Malayalam, Mongolian, Myanmar, New_Tai_Lue,
# Nko, Ogham, Ol_Chiki, Old_Italic,
# Old_Persian, Oriya, Osmanya, Phags_Pa,
# Phoenician, Rejang, Runic, Saurashtra,
# Shavian, Sinhala, Sundanese, Syloti_Nagri,
# Syriac, Tagalog, Tagbanwa, Tai_Le,
# Tamil, Telugu, Thaana, Thai, Tibetan,
# Tifinagh, Ugaritic, Vai, and Yi.
#
# # Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and
# # belongs to the Arabic script.
# /\p{Arabic}/.match("\u06E9") #=> #
#
# All character properties can be inverted by prefixing their name with a
# caret (^).
#
# # Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so
# # this match succeeds
# /\p{^Ll}/.match("A") #=> #
#
# == Anchors
#
# Anchors are metacharacter that match the zero-width positions between
# characters, anchoring the match to a specific position.
#
# * ^ - Matches beginning of line
# * $ - Matches end of line
# * \A - Matches beginning of string.
# * \Z - Matches end of string. If string ends with a newline,
# it matches just before newline
# * \z - Matches end of string
# * \G - Matches point where last match finished
# * \b - Matches word boundaries when outside brackets; backspace
# (0x08) inside brackets
# * \B - Matches non-word boundaries
# * (?=pat) - Positive lookahead assertion:
# ensures that the following characters match pat, but doesn't
# include those characters in the matched text
# * (?!pat) - Negative lookahead assertion:
# ensures that the following characters do not match pat, but
# doesn't include those characters in the matched text
# * (?<=pat) - Positive lookbehind
# assertion: ensures that the preceding characters match pat, but
# doesn't include those characters in the matched text
# * (?pat) - Negative lookbehind
# assertion: ensures that the preceding characters do not match
# pat, but doesn't include those characters in the matched text
#
# # If a pattern isn't anchored it can begin at any point in the string
# /real/.match("surrealist") #=> #
# # Anchoring the pattern to the beginning of the string forces the
# # match to start there. 'real' doesn't occur at the beginning of the
# # string, so now the match fails
# /\Areal/.match("surrealist") #=> nil
# # The match below fails because although 'Demand' contains 'and', the
# pattern does not occur at a word boundary.
# /\band/.match("Demand")
# # Whereas in the following example 'and' has been anchored to a
# # non-word boundary so instead of matching the first 'and' it matches
# # from the fourth letter of 'demand' instead
# /\Band.+/.match("Supply and demand curve") #=> #
# # The pattern below uses positive lookahead and positive lookbehind to
# # match text appearing in tags without including the tags in the
# # match
# /(?<=)\w+(?=<\/b>)/.match("Fortune favours the bold")
# #=> #
#
# == Options
#
# The end delimiter for a regexp can be followed by one or more single-letter
# options which control how the pattern can match.
#
# * /pat/i - Ignore case
# * /pat/m - Treat a newline as a character matched by .
# * /pat/x - Ignore whitespace and comments in the pattern
# * /pat/o - Perform #{} interpolation only once
#
# i, m, and x can also be applied on the
# subexpression level with the
# (?on-off) construct, which
# enables options on, and disables options off for the
# expression enclosed by the parentheses.
#
# /a(?i:b)c/.match('aBc') #=> #
# /a(?i:b)c/.match('abc') #=> #
#
# == Free-Spacing Mode and Comments
#
# As mentioned above, the x option enables free-spacing
# mode. Literal white space inside the pattern is ignored, and the
# octothorpe (#) character introduces a comment until the end of
# the line. This allows the components of the pattern to be organised in a
# potentially more readable fashion.
#
# # A contrived pattern to match a number with optional decimal places
# float_pat = /\A
# [[:digit:]]+ # 1 or more digits before the decimal point
# (\. # Decimal point
# [[:digit:]]+ # 1 or more digits after the decimal point
# )? # The decimal point and following digits are optional
# \Z/x
# float_pat.match('3.14') #=> #
#
# *Note*: To match whitespace in an x pattern use an escape such as
# \s or \p{Space}.
#
# Comments can be included in a non-x pattern with the
# (?#comment) construct, where comment is
# arbitrary text ignored by the regexp engine.
#
# == Encoding
#
# Regular expressions are assumed to use the source encoding. This can be
# overridden with one of the following modifiers.
#
# * /pat/u - UTF-8
# * /pat/e - EUC-JP
# * /pat/s - Windows-31J
# * /pat/n - ASCII-8BIT
#
# A regexp can be matched against a string when they either share an
# encoding, or the regexp's encoding is _US-ASCII_ and the string's encoding
# is ASCII-compatible.
#
# If a match between incompatible encodings is attempted an
# Encoding::CompatibilityError exception is raised.
#
# The Regexp#fixed_encoding? predicate indicates whether the regexp
# has a fixed encoding, that is one incompatible with ASCII. A
# regexp's encoding can be explicitly fixed by supplying
# Regexp::FIXEDENCODING as the second argument of
# Regexp.new:
#
# r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING)
# r =~"a\u3042"
# #=> Encoding::CompatibilityError: incompatible encoding regexp match
# (ISO-8859-1 regexp with UTF-8 string)
#
# == Performance
#
# Certain pathological combinations of constructs can lead to abysmally bad
# performance.
#
# Consider a string of 25 as, a d, 4 as, and a
# c.
#
# s = 'a' * 25 + 'd' 'a' * 4 + 'c'
# #=> "aaaaaaaaaaaaaaaaaaaaaaaaadadadadac"
#
# The following patterns match instantly as you would expect:
#
# /(b|a)/ =~ s #=> 0
# /(b|a+)/ =~ s #=> 0
# /(b|a+)*\/ =~ s #=> 0
#
# However, the following pattern takes appreciably longer:
#
# /(b|a+)*c/ =~ s #=> 32
#
# This happens because an atom in the regexp is quantified by both an
# immediate + and an enclosing * with nothing to
# differentiate which is in control of any particular character. The
# nondeterminism that results produces super-linear performance. (Consult
# Mastering Regular Expressions (3rd ed.), pp 222, by
# Jeffery Friedl, for an in-depth analysis). This particular case
# can be fixed by use of atomic grouping, which prevents the unnecessary
# backtracking:
#
# (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start)
# #=> 24.702736882
# (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start)
# #=> 0.000166571
#
# A similar case is typified by the following example, which takes
# approximately 60 seconds to execute for me:
#
# # Match a string of 29 as against a pattern of 29 optional
# # as followed by 29 mandatory as.
# Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29
#
# The 29 optional as match the string, but this prevents the 29
# mandatory as that follow from matching. Ruby must then backtrack
# repeatedly so as to satisfy as many of the optional matches as it can
# while still matching the mandatory 29. It is plain to us that none of the
# optional matches can succeed, but this fact unfortunately eludes Ruby.
#
# One approach for improving performance is to anchor the match to the
# beginning of the string, thus significantly reducing the amount of
# backtracking needed.
#
# Regexp.new('\A' 'a?' * 29 + 'a' * 29).match('a' * 29)
# #=> #
#
#
class Regexp; end
end