fix(label): Truncate label replacements based on codepoint count

This helps ensure that truncating a string never cuts through the
middle of a UTF-8 multi-byte sequence. Codepoints don't correspond 100% to
user-perceived characters, but they should be pretty close in most cases.
This commit is contained in:
Chase Geigle 2017-02-14 13:39:07 -06:00 committed by Michael Carlberg
parent 1d06df25a9
commit 73faa18cf0
3 changed files with 48 additions and 2 deletions

View File

@ -81,6 +81,9 @@ namespace string_util {
string rtrim(string&& value, const char& needle = ' ');
string trim(string&& value, const char& needle = ' ');
size_t char_len(const string& value);
string utf8_truncate(string&& value, size_t len);
string join(const vector<string>& strs, const string& delim);
vector<string>& split_into(const string& s, char delim, vector<string>& container);
vector<string> split(const string& s, char delim);

View File

@ -48,8 +48,8 @@ namespace drawtypes {
for (auto&& tok : m_tokens) {
if (token == tok.token) {
if (tok.max != 0_z && replacement.length() > tok.max) {
replacement = replacement.erase(tok.max) + tok.suffix;
if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
} else if (tok.min != 0_z && replacement.length() < tok.min) {
replacement.insert(0_z, tok.min - replacement.length(), ' ');
}

View File

@ -1,3 +1,4 @@
#include <algorithm>
#include <cstring>
#include <iomanip>
#include <sstream>
@ -145,6 +146,48 @@ namespace string_util {
return rtrim(ltrim(forward<string>(value), needle), needle);
}
/**
 * Returns the number of codepoints found in a utf8 encoded string.
 *
 * The result is obtained purely from the bit pattern of each byte; the
 * input is not validated.
 */
size_t char_len(const string& value) {
  // In well-formed utf-8 every codepoint has exactly one byte that is
  // not a continuation byte (10xxxxxx), so tallying those bytes gives
  // the codepoint count.
  //
  // 0xc0 = 11000000 (mask selecting the two high bits)
  // 0x80 = 10000000 (high bits of a continuation byte)
  size_t codepoints = 0;
  for (char byte : value) {
    if ((byte & 0xc0) != 0x80) {
      ++codepoints;
    }
  }
  return codepoints;
}
/**
 * Truncates a utf8 encoded string to at most len codepoints.
 *
 * Codepoints only approximate user-perceived characters (one grapheme
 * cluster may span several codepoints), but counting them keeps the cut
 * from landing inside a multi-byte sequence without having to pull in
 * something like ICU.
 *
 * A well-formed string holding len codepoints or fewer is returned
 * unchanged. Stray continuation bytes at the very start of a malformed
 * string are kept together with the first codepoint.
 */
string utf8_truncate(string&& value, size_t len) {
  // utf-8 bytes of the form 10xxxxxx are continuation bytes; any other
  // byte starts a new codepoint.
  //
  // 0xc0 = 11000000
  // 0x80 = 10000000
  const auto starts_codepoint = [](char c) { return (c & 0xc0) != 0x80; };

  const auto end = value.end();
  auto cut = value.begin();
  for (size_t kept = 0; kept < len && cut != end; ++kept) {
    // Step over the first byte of the current codepoint, then over every
    // continuation byte that follows it.
    cut = std::find_if(cut + 1, end, starts_codepoint);
  }

  // An empty input or a large enough len leaves cut == end, which makes
  // this a no-op.
  value.erase(cut, end);

  // value is a plain rvalue reference rather than a forwarding reference,
  // so std::move (not forward) is the right cast to hand it back.
  return std::move(value);
}
/**
* Join all strings in vector into a single string separated by delim
*/