diff --git a/include/ruby/internal/ctype.h b/include/ruby/internal/ctype.h index ba9eb20390..0f7ca6c516 100644 --- a/include/ruby/internal/ctype.h +++ b/include/ruby/internal/ctype.h @@ -29,34 +29,161 @@ #include "ruby/internal/attr/artificial.h" #include "ruby/internal/attr/const.h" #include "ruby/internal/attr/constexpr.h" +#include "ruby/internal/attr/nonnull.h" #include "ruby/internal/dllexport.h" +/** + * @name Old character classification macros + * + * What is this #ISPRINT business? Well, according to our VCS and some + * internet surfing, it appears that the initial intent of these macros was to + * mimic code that appears in common in several GNU projects. As far as @shyouhei + * can tell, they seem to originate from GNU regex (the standalone one rather than + * Gnulib or Glibc), and date back to at least 1995. + * + * Let me lawfully quote from a GNU coreutils commit + * https://git.savannah.gnu.org/cgit/coreutils.git/commit/?id=49803907f5dbd7646184a8912c9db9b09dcd0f22 + * + * > Jim Meyering writes: + * > + * > "... Some ctype macros are valid only for character codes that + * > isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + * > using /bin/cc or gcc but without giving an ansi option). So, all + * > ctype uses should be through macros like ISPRINT... If + * > STDC_HEADERS is defined, then autoconf has verified that the ctype + * > macros don't need to be guarded with references to isascii. ... + * > Defining isascii to 1 should let any compiler worth its salt + * > eliminate the && through constant folding." + * > + * > Bruno Haible adds: + * > + * > "... Furthermore, isupper(c) etc. have an undefined result if c is + * > outside the range -1 <= c <= 255. One is tempted to write isupper(c) + * > with c being of type `char', but this is wrong if c is an 8-bit + * > character >= 128 which gets sign-extended to a negative value. + * > The macro ISUPPER protects against this as well."
+ * + * So the intent was to work around old problematic systems that no longer exist. + * At the same time the problems described above no longer hurt us, because we + * decided to completely avoid using system-provided isupper etc. and to reinvent + * the wheel. These macros are entirely legacy; please ignore them. + * + * But let me also stress that GNU people are wise; they use those macros + * only inside of their own implementations and never let them be public. On + * the other hand ruby has thoughtlessly publicised them to 3rd party libraries + * since its beginning, which is a very bad idea. These macros too easily + * conflict with definitions elsewhere. + * + * New programs should stick to the `rb_` prefixed names. + * + * @note It seems we just mimic the API. We do not share their implementation + * with GPL-ed programs. + * + * @{ + */ #ifndef ISPRINT -# define ISASCII rb_isascii -# define ISPRINT rb_isprint -# define ISGRAPH rb_isgraph -# define ISSPACE rb_isspace -# define ISUPPER rb_isupper -# define ISLOWER rb_islower -# define ISALNUM rb_isalnum -# define ISALPHA rb_isalpha -# define ISDIGIT rb_isdigit -# define ISXDIGIT rb_isxdigit -# define ISBLANK rb_isblank -# define ISCNTRL rb_iscntrl -# define ISPUNCT rb_ispunct +# define ISASCII rb_isascii /**< @old{rb_isascii}*/ +# define ISPRINT rb_isprint /**< @old{rb_isprint}*/ +# define ISGRAPH rb_isgraph /**< @old{rb_isgraph}*/ +# define ISSPACE rb_isspace /**< @old{rb_isspace}*/ +# define ISUPPER rb_isupper /**< @old{rb_isupper}*/ +# define ISLOWER rb_islower /**< @old{rb_islower}*/ +# define ISALNUM rb_isalnum /**< @old{rb_isalnum}*/ +# define ISALPHA rb_isalpha /**< @old{rb_isalpha}*/ +# define ISDIGIT rb_isdigit /**< @old{rb_isdigit}*/ +# define ISXDIGIT rb_isxdigit /**< @old{rb_isxdigit}*/ +# define ISBLANK rb_isblank /**< @old{rb_isblank}*/ +# define ISCNTRL rb_iscntrl /**< @old{rb_iscntrl}*/ +# define ISPUNCT rb_ispunct /**< @old{rb_ispunct}*/ #endif -#define TOUPPER rb_toupper -#define 
TOLOWER rb_tolower -#define STRCASECMP st_locale_insensitive_strcasecmp -#define STRNCASECMP st_locale_insensitive_strncasecmp -#define STRTOUL ruby_strtoul +#define TOUPPER rb_toupper /**< @old{rb_toupper}*/ +#define TOLOWER rb_tolower /**< @old{rb_tolower}*/ +#define STRCASECMP st_locale_insensitive_strcasecmp /**< @old{st_locale_insensitive_strcasecmp}*/ +#define STRNCASECMP st_locale_insensitive_strncasecmp /**< @old{st_locale_insensitive_strncasecmp}*/ +#define STRTOUL ruby_strtoul /**< @old{ruby_strtoul}*/ + +/** @} */ RBIMPL_SYMBOL_EXPORT_BEGIN() -/* locale insensitive functions */ +/** @name locale insensitive functions + * @{ + */ + +/* In descriptions below, `the POSIX Locale` and `the "C" locale` are chosen + * deliberately, depending on whether the described function mimics POSIX or C99. */ + +RBIMPL_ATTR_NONNULL(()) +/** + * Our own locale-insensitive version of `strcasecmp(3)`. The "case" here + * always means that of the POSIX Locale. It doesn't depend on runtime locale + * settings. + * + * @param[in] s1 Comparison LHS. + * @param[in] s2 Comparison RHS. + * @retval -1 `s1` is "less" than `s2`. + * @retval 0 Both strings converted into lowercase would be identical. + * @retval 1 `s1` is "greater" than `s2`. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + */ int st_locale_insensitive_strcasecmp(const char *s1, const char *s2); + +RBIMPL_ATTR_NONNULL(()) +/** + * Our own locale-insensitive version of `strncasecmp(3)`. The "case" here + * always means that of the POSIX Locale. It doesn't depend on runtime locale + * settings. + * + * @param[in] s1 Comparison LHS. + * @param[in] s2 Comparison RHS. + * @param[in] n Comparison shall stop after first `n` bytes are scanned. + * @retval -1 `s1` is "less" than `s2`. + * @retval 0 Both strings converted into lowercase would be identical. 
+ * @retval 1 `s1` is "greater" than `s2`. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning This function is _not_ timing safe. + */ int st_locale_insensitive_strncasecmp(const char *s1, const char *s2, size_t n); + +RBIMPL_ATTR_NONNULL((1)) +/** + * Our own locale-insensitive version of `strtoul(3)`. The conversion is done + * as if the current locale is set to the "C" locale, regardless of the actual + * runtime locale settings. + * + * @note This is needed because `strtoul("i", 0, 36)` would return zero + * if it is locale sensitive and the current locale is `tr_TR`. + * @param[in] str String of digits, optionally preceded by whitespace + * (ignored) and an optional `+` or `-` sign. + * @param[out] endptr NULL, or an arbitrary pointer (overwritten on return). + * @param[in] base `2` to `36` inclusive for each base, or special case + * `0` to detect the base from the contents of the string. + * @return Converted integer, cast to unsigned long. + * @post If `endptr` is not NULL, `*endptr` is updated to point to the + * first byte where conversion failed. + * @note This function sets `errno` on failure. + * - `EINVAL`: Passed `base` is out of range. + * - `ERANGE`: Converted integer is out of range of `long`. + * @warning As far as @shyouhei reads ISO/IEC 9899:2018 section 7.22.1.4, a + * conforming `strtoul` implementation shall render `ERANGE` + * whenever it finds the input string represents a negative + * integer. Such a value can never be represented using `unsigned + * long`. However this implementation does not honour that + * language. It just casts such a negative value to the return + * type, resulting in a very big return value. This behaviour is at + * least questionable. But we can no longer change that at this + * point. 
+ * @note Not only does this function work under the "C" locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + */ unsigned long ruby_strtoul(const char *str, char **endptr, int base); RBIMPL_SYMBOL_EXPORT_END() @@ -68,6 +195,16 @@ RBIMPL_SYMBOL_EXPORT_END() RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isascii(3)`. + * + * @param[in] c Byte in question to query. + * @retval false `c` is out of range of the ASCII character set. + * @retval true `c` is within the ASCII character set. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isascii(int c) { @@ -77,6 +214,20 @@ rb_isascii(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isupper(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "upper". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isupper(int c) { @@ -86,6 +237,20 @@ rb_isupper(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `islower(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "lower". + * @retval false Anything else. 
+ * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_islower(int c) { @@ -95,6 +260,21 @@ rb_islower(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isalpha(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in either IEEE 1003.1 section 7.3.1.1 + * "upper" or "lower". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isalpha(int c) { @@ -104,6 +284,20 @@ rb_isalpha(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isdigit(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "digit". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. 
This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isdigit(int c) { @@ -113,6 +307,21 @@ rb_isdigit(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isalnum(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in any of IEEE 1003.1 section 7.3.1.1 + * "upper", "lower", or "digit". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isalnum(int c) { @@ -122,6 +331,20 @@ rb_isalnum(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isxdigit(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "xdigit". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isxdigit(int c) { @@ -131,6 +354,20 @@ rb_isxdigit(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isblank(3)`. 
+ * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "blank". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isblank(int c) { @@ -140,6 +377,20 @@ rb_isblank(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isspace(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "space". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isspace(int c) { @@ -149,6 +400,20 @@ rb_isspace(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `iscntrl(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "cntrl". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. 
This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_iscntrl(int c) { @@ -158,6 +423,21 @@ rb_iscntrl(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Identical to rb_isgraph(), except it also returns true for `' '`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in any of IEEE 1003.1 section 7.3.1.1 + * "upper", "lower", "digit", "punct", or is a `' '`. + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isprint(int c) { @@ -167,6 +447,20 @@ rb_isprint(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `ispunct(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "punct". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. 
+ */ static inline int rb_ispunct(int c) { @@ -176,6 +470,21 @@ rb_ispunct(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `isgraph(3)`. + * + * @param[in] c Byte in question to query. + * @retval true `c` is listed in any of IEEE 1003.1 section 7.3.1.1 + * "upper", "lower", "digit", or "punct". + * @retval false Anything else. + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_isgraph(int c) { @@ -185,6 +494,22 @@ rb_isgraph(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `tolower(3)`. + * + * @param[in] c Byte in question to convert. + * @retval c The byte is not listed in IEEE 1003.1 section + * 7.3.1.1 "upper". + * @retval otherwise Byte converted using the map defined in IEEE 1003.1 + * section 7.3.1 "tolower". + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_tolower(int c) { @@ -194,10 +519,27 @@ rb_tolower(int c) RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() +/** + * Our own locale-insensitive version of `toupper(3)`. + * + * @param[in] c Byte in question to convert. 
+ * @retval c The byte is not listed in IEEE 1003.1 section + * 7.3.1.1 "lower". + * @retval otherwise Byte converted using the map defined in IEEE 1003.1 + * section 7.3.1 "toupper". + * @note Not only does this function work under the POSIX Locale, but + * it also assumes its execution character set to be what ruby calls + * an ASCII-compatible character set, which does not include, for + * instance, EBCDIC or UTF-16LE. + * @warning `c` is an int. This means that when you pass a `char` value + * here, it experiences "integer promotion" as defined in ISO/IEC + * 9899:2018 section 6.3.1.1 paragraph 1. + */ static inline int rb_toupper(int c) { return rb_islower(c) ? (c&0x5f) : c; } +/** @} */ #endif /* RBIMPL_CTYPE_H */ diff --git a/util.c b/util.c index 0c8de209cb..1b11ecb3f5 100644 --- a/util.c +++ b/util.c @@ -97,6 +97,8 @@ NO_SANITIZE("unsigned-integer-overflow", extern unsigned long ruby_scan_digits(c unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow) { + RBIMPL_ASSERT_OR_ASSUME(base >= 2); + RBIMPL_ASSERT_OR_ASSUME(base <= 36); const char *start = str; unsigned long ret = 0, x; @@ -136,6 +138,11 @@ ruby_strtoul(const char *str, char **endptr, int base) unsigned long ret; const char *subject_found = str; + if (base < 0) { + errno = EINVAL; + return 0; + } + if (base == 1 || 36 < base) { errno = EINVAL; return 0;