fix(label): Truncate label replacements based on codepoint count

This helps ensure that a string is never truncated in the middle of a UTF-8 multi-byte sequence. A codepoint count doesn't correspond 100% to user-perceived characters, but it should be pretty close in most cases.
2025-02-17 15:55:20 -05:00 · 2017-02-14 13:39:07 -06:00 · 2017-02-14 13:39:07 -06:00 · 73faa18cf0
commit 73faa18cf0
parent 1d06df25a9
3 changed files with 48 additions and 2 deletions
--- a/include/utils/string.hpp
+++ b/include/utils/string.hpp
@ -81,6 +81,9 @@ namespace string_util {
  string rtrim(string&& value, const char& needle = ' ');
  string trim(string&& value, const char& needle = ' ');

+  size_t char_len(const string& value);
+  string utf8_truncate(string&& value, size_t len);
+
  string join(const vector<string>& strs, const string& delim);
  vector<string>& split_into(const string& s, char delim, vector<string>& container);
  vector<string> split(const string& s, char delim);
--- a/src/drawtypes/label.cpp
+++ b/src/drawtypes/label.cpp
@ -48,8 +48,8 @@ namespace drawtypes {

    for (auto&& tok : m_tokens) {
      if (token == tok.token) {
-        if (tok.max != 0_z && replacement.length() > tok.max) {
-          replacement = replacement.erase(tok.max) + tok.suffix;
+        if (tok.max != 0_z && string_util::char_len(replacement) > tok.max) {
+          replacement = string_util::utf8_truncate(std::move(replacement), tok.max) + tok.suffix;
        } else if (tok.min != 0_z && replacement.length() < tok.min) {
          replacement.insert(0_z, tok.min - replacement.length(), ' ');
        }
--- a/src/utils/string.cpp
+++ b/src/utils/string.cpp
@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstring>
 #include <iomanip>
 #include <sstream>
@ -145,6 +146,48 @@ namespace string_util {
    return rtrim(ltrim(forward<string>(value), needle), needle);
  }

+  /**
+   * Counts the number of codepoints in a utf8 encoded string.
+   *
+   * Every byte of value is inspected and each byte that is not a
+   * continuation byte (10xxxxxx) is counted. For well-formed utf8 this is
+   * the number of codepoints; the input is not validated, so stray
+   * continuation bytes are ignored and every other byte counts as one.
+   *
+   * Returns 0 for an empty string.
+   */
+  size_t char_len(const string& value) {
+    // utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
+    // simply count the number of bytes not of this form.
+    //
+    // 0xc0 = 11000000 (mask that keeps only the top two bits)
+    // 0x80 = 10000000 (top two bits of a continuation byte)
+    return std::count_if(value.begin(), value.end(), [](char c) { return (c & 0xc0) != 0x80; });
+  }
+
+  /**
+   * Truncates a utf8 string at len number of codepoints. This isn't 100%
+   * matching the user-perceived character count, but it should be close
+   * enough and avoids having to pull in something like ICU to count actual
+   * grapheme clusters.
+   *
+   * The cut always lands on a byte that is not a continuation byte (or on
+   * the end of the string), so a multi-byte sequence is never split. If
+   * value holds len codepoints or fewer, nothing is erased; a len of 0
+   * yields an empty string.
+   *
+   * value is edited in place and handed back through
+   * forward<string>(value). The input is not validated as utf8.
+   */
+  string utf8_truncate(string&& value, size_t len) {
+    if (value.empty()) {
+      return "";
+    }

+    // utf-8 bytes of the form 10xxxxxx are continuation bytes, so we
+    // simply jump forward to bytes not of that form and truncate starting
+    // at that byte if we've counted too many codepoints
+    //
+    // 0xc0 = 11000000 (mask that keeps only the top two bits)
+    // 0x80 = 10000000 (top two bits of a continuation byte)
+    auto it = value.begin();
+    auto end = value.end();
+    for (size_t i = 0; i < len; ++i) {
+      if (it == end)  // ran out of bytes before reaching len codepoints
+        break;
+      ++it;  // step past the first byte of the current codepoint
+      it = std::find_if(it, end, [](char c) { return (c & 0xc0) != 0x80; });  // skip its continuation bytes
+    }
+    value.erase(it, end);  // no-op when it == end

+    return forward<string>(value);
+  }
+
  /**
   * Join all strings in vector into a single string separated by delim
   */