// UTF-8 helpers of namespace string_util (src/utils/string.cpp).
//
// Both public functions are declared in include/utils/string.hpp:
//   size_t char_len(const string& value);
//   string utf8_truncate(string&& value, size_t len);
//
// They are used by the label token replacement (src/drawtypes/label.cpp) so
// that a token's `max` is enforced in codepoints instead of bytes:
//   if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
//     replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
//   }
//
// NOTE(review): the `tok.min` padding branch at that call site still compares
// replacement.length() (bytes) against tok.min - presumably it should use
// char_len as well; confirm against the label code.

// Provides std::count_if and std::find_if
#include <algorithm>

/**
 * Tells whether a byte starts a new codepoint in utf8 encoded text.
 *
 * utf-8 bytes of the form 10xxxxxx are continuation bytes; every other byte
 * begins a codepoint.
 *
 * 0xc0 = 11000000
 * 0x80 = 10000000
 */
static inline bool utf8_starts_codepoint(char c) {
  return (c & 0xc0) != 0x80;
}

/**
 * Counts the number of codepoints in a utf8 encoded string.
 *
 * The bytes are not validated; the result is simply the number of bytes that
 * are not continuation bytes.
 */
size_t char_len(const string& value) {
  // count_if yields a signed difference_type which can never be negative here
  return static_cast<size_t>(std::count_if(value.begin(), value.end(), utf8_starts_codepoint));
}

/**
 * Truncates a utf8 string at len number of codepoints.
 *
 * This isn't 100% matching the user-perceived character count, but it should
 * be close enough and avoids having to pull in something like ICU to count
 * actual grapheme clusters.
 *
 * Codepoints are counted the same way char_len counts them, so the result
 * always holds min(len, char_len(value)) codepoints. Stray continuation bytes
 * at the very beginning of the string are kept and not counted.
 */
string utf8_truncate(string&& value, size_t len) {
  auto end = value.end();

  // Position on the first byte that starts a codepoint. For well-formed input
  // (and for an empty string) this is simply the beginning.
  auto it = std::find_if(value.begin(), end, utf8_starts_codepoint);

  for (size_t i = 0; i < len && it != end; ++i) {
    // Step over the byte starting this codepoint, then jump forward over its
    // continuation bytes to the start of the next codepoint (or the end)
    ++it;
    it = std::find_if(it, end, utf8_starts_codepoint);
  }

  // Everything from the (len + 1)-th codepoint onwards is dropped
  value.erase(it, end);

  return forward<string>(value);
}

/**
 * Join all strings in vector into a single string separated by delim
 */