From 574bf2da828b4e3ab0ce3ce7fccd58879db60430 Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Thu, 1 Oct 2015 11:41:44 +0100
Subject: [PATCH 1/6] Make dmenu reading very marginally faster

Grow the result array by doubling instead of calling realloc for every line,
and avoid 3 or 4 strlen calls on a string whose length is already known.
---
 include/helper.h       |  2 +-
 source/dialogs/dmenu.c | 28 ++++++++++++++++++++--------
 source/helper.c        |  4 ++--
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/include/helper.h b/include/helper.h
index cdd4bc7f..6d6f1826 100644
--- a/include/helper.h
+++ b/include/helper.h
@@ -16,7 +16,7 @@ int helper_parse_setup ( char * string, char ***output, int *length, ... );
 /**
  * Implementation of fgets with custom separator.
  */
-char* fgets_s ( char* s, int n, FILE *iop, char sep );
+char* fgets_s ( char* s, unsigned int n, FILE *iop, char sep );
 
 /**
  * @param token The string for which we want a collation key.
diff --git a/source/dialogs/dmenu.c b/source/dialogs/dmenu.c
index 1ef5eb2d..b82ca50e 100644
--- a/source/dialogs/dmenu.c
+++ b/source/dialogs/dmenu.c
@@ -59,28 +59,40 @@ typedef struct _DmenuModePrivateData
 
 static char **get_dmenu ( unsigned int *length )
 {
-    char buffer[1024];
+    const unsigned int buf_size = 1024;
+    char buffer[buf_size];
     char **retv = NULL;
+    char *buffer_end = NULL;
+    unsigned int rvlength = 1;
 
     *length = 0;
 
-    while ( fgets_s ( buffer, 1024, stdin, (char) config.separator ) != NULL ) {
-        retv                  = g_realloc ( retv, ( ( *length ) + 2 ) * sizeof ( char* ) );
-        retv[( *length )]     = g_strdup ( buffer );
-        retv[( *length ) + 1] = NULL;
+    while ( ( buffer_end = fgets_s ( buffer, buf_size, stdin, (char) config.separator ) ) != NULL ) {
+        if (rvlength < (*length + 2)) {
+          rvlength *= 2;
+          retv      = g_realloc ( retv, ( rvlength ) * sizeof ( char* ) );
+        }
+        // fgets_s returns a pointer to the NUL it wrote, so buffer[blength] is always '\0'.
+        size_t blength = buffer_end - &(buffer[0]);
+        if ( blength > 0 && buffer[blength - 1] == '\0' ) blength--; // skip a trailing byte that is already blanked
+        char *copy = g_malloc( blength + 1 );
+        memcpy(copy, buffer, blength + 1); // +1 copies the terminator too
 
         // Filter out line-end.
-        if ( retv[( *length )][strlen ( buffer ) - 1] == '\n' ) {
-            retv[( *length )][strlen ( buffer ) - 1] = '\0';
+        if ( blength > 0 && copy[blength - 1] == '\n' ) {
+            copy[blength - 1] = '\0';
         }
 
+        retv[( *length )]     = copy;
+        retv[( *length ) + 1] = NULL;
+
         ( *length )++;
         // Stop when we hit 2³¹ entries.
         if ( ( *length ) == INT_MAX ) {
             return retv;
         }
     }
-
+    retv = g_realloc ( retv, ( *length + 1 ) * sizeof ( char* ) ); retv[*length] = NULL; // trim; stays NULL-terminated even with no input
     return retv;
 }
 
diff --git a/source/helper.c b/source/helper.c
index 94b90a28..fd3ee577 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -52,7 +52,7 @@ void cmd_set_arguments ( int argc, char **argv )
 /**
  *  `fgets` implementation with custom separator.
  */
-char* fgets_s ( char* s, int n, FILE *iop, char sep )
+char* fgets_s ( char* s, unsigned int n, FILE *iop, char sep )
 {
     // Map these to registers.
     register int c = EOF;
@@ -72,7 +72,7 @@ char* fgets_s ( char* s, int n, FILE *iop, char sep )
     *cs = '\0';
     // if last read was end of file and current index is start, we are done:
     // Return NULL.
-    return ( c == EOF && cs == s ) ? NULL : s;
+    return ( c == EOF && cs == s ) ? NULL : cs;
 }
 
 /**

From af6a4b83ebdb83f24b6913b372e207bcf245ea0c Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Thu, 1 Oct 2015 12:16:41 +0100
Subject: [PATCH 2/6] Make normal filtering of plain ASCII lines faster

This patch adds a field lines_not_ascii to the MenuState structure. The
nth entry is 0 unless the nth member of MenuState.lines has a non-ascii
codepoint in it.

All comparison functions (menu_match_cb type) take an additional
argument to tell them if the thing they are matching is not_ascii. They
can use this to determine whether to collate and case-fold the
input (for non-ascii strings), or whether to use strstr/strcasestr (for
ascii strings).

The change is not yet implemented for fuzzy (flex) matching: fuzzy_token_match
accepts the new argument but ignores it. It should be simple enough to add.

For my large input of 400,000 lines, this reduces typical filtering time
to about ten microseconds from about 2 seconds.
---
 include/helper.h        |  9 ++++++++-
 include/rofi.h          |  2 +-
 source/dialogs/combi.c  |  6 +++---
 source/dialogs/dmenu.c  |  2 +-
 source/dialogs/window.c |  9 +++++----
 source/helper.c         | 36 ++++++++++++++++++++++++++----------
 source/rofi.c           | 14 +++++++++++++-
 7 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/include/helper.h b/include/helper.h
index 6d6f1826..d75bfe35 100644
--- a/include/helper.h
+++ b/include/helper.h
@@ -102,7 +102,7 @@ int find_arg ( const char * const key );
  *
  * @returns 1 when matches, 0 otherwise
  */
-int token_match ( char **tokens, const char *input, int case_sensitive,
+int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive,
                   __attribute__( ( unused ) ) unsigned int index,
                   __attribute__( ( unused ) ) Switcher * data );
 
@@ -152,4 +152,11 @@ char helper_parse_char ( const char *arg );
  * Set the application arguments.
  */
 void cmd_set_arguments ( int argc, char **argv );
+
+/**
+ * @param str a UTF8 string
+ * @return 1 if the string contains any non-ascii codepoints
+ */
+int is_not_ascii ( const char *str );
+
 #endif // ROFI_HELPER_H
diff --git a/include/rofi.h b/include/rofi.h
index bf7ee333..1aff2350 100644
--- a/include/rofi.h
+++ b/include/rofi.h
@@ -78,7 +78,7 @@ typedef enum
  *
  * @returns 1 when it matches, 0 if not.
  */
-typedef int ( *menu_match_cb )( char **tokens, const char *input, int case_sensitive, unsigned int index, Switcher *data );
+typedef int ( *menu_match_cb )( char **tokens, const char *input, int not_ascii, int case_sensitive, unsigned int index, Switcher *data );
 
 /**
  * @param sw the Switcher to show.
diff --git a/source/dialogs/combi.c b/source/dialogs/combi.c
index 1002de08..8f67cd2f 100644
--- a/source/dialogs/combi.c
+++ b/source/dialogs/combi.c
@@ -169,7 +169,7 @@ static SwitcherMode combi_mode_result ( int mretv, char **input, unsigned int se
     }
     return MODE_EXIT;
 }
-static int combi_mode_match ( char **tokens, const char *input,
+static int combi_mode_match ( char **tokens, const char *input, int not_ascii,
                               int case_sensitive, unsigned int index, Switcher *sw )
 {
     CombiModePrivateData *pd = sw->private_data;
@@ -178,13 +178,13 @@ static int combi_mode_match ( char **tokens, const char *input,
         if ( index >= pd->starts[i] && index < ( pd->starts[i] + pd->lengths[i] ) ) {
             if ( tokens && input[0] && tokens[0][0] == '!' ) {
                 if ( tokens[0][1] == pd->switchers[i]->name[0] ) {
-                    return pd->switchers[i]->token_match ( &tokens[1], input, case_sensitive,
+                    return pd->switchers[i]->token_match ( &tokens[1], input, not_ascii, case_sensitive,
                                                            index - pd->starts[i], pd->switchers[i] );
                 }
                 return 0;
             }
             else {
-                return pd->switchers[i]->token_match ( tokens, input, case_sensitive,
+                return pd->switchers[i]->token_match ( tokens, input, not_ascii, case_sensitive,
                                                        index - pd->starts[i], pd->switchers[i] );
             }
         }
diff --git a/source/dialogs/dmenu.c b/source/dialogs/dmenu.c
index b82ca50e..53fe2e4f 100644
--- a/source/dialogs/dmenu.c
+++ b/source/dialogs/dmenu.c
@@ -316,7 +316,7 @@ int dmenu_switcher_dialog ( void )
         char         **tokens = tokenize ( select, config.case_sensitive );
         unsigned int i        = 0;
         for ( i = 0; i < cmd_list_length; i++ ) {
-            if ( token_match ( tokens, cmd_list[i], config.case_sensitive, 0, NULL ) ) {
+            if ( token_match ( tokens, cmd_list[i], is_not_ascii(cmd_list[i]), config.case_sensitive, 0, NULL ) ) {
                 pd->selected_line = i;
                 break;
             }
diff --git a/source/dialogs/window.c b/source/dialogs/window.c
index 6a8821dd..cc633786 100644
--- a/source/dialogs/window.c
+++ b/source/dialogs/window.c
@@ -322,6 +322,7 @@ typedef struct _SwitcherModePrivateData
 } SwitcherModePrivateData;
 
 static int window_match ( char **tokens, __attribute__( ( unused ) ) const char *input,
+                          __attribute__( ( unused) ) int not_ascii,
                           int case_sensitive, unsigned int index, Switcher *sw )
 {
     SwitcherModePrivateData *rmpd = (SwitcherModePrivateData *) sw->private_data;
@@ -338,19 +339,19 @@ static int window_match ( char **tokens, __attribute__( ( unused ) ) const char
             // e.g. when searching 'title element' and 'class element'
             char *ftokens[2] = { tokens[j], NULL };
             if ( !test && c->title[0] != '\0' ) {
-                test = token_match ( ftokens, c->title, case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->title, is_not_ascii(c->title), case_sensitive, 0, NULL );
             }
 
             if ( !test && c->class[0] != '\0' ) {
-                test = token_match ( ftokens, c->class, case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->class, is_not_ascii(c->title), case_sensitive, 0, NULL );
             }
 
             if ( !test && c->role[0] != '\0' ) {
-                test = token_match ( ftokens, c->role, case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->role, is_not_ascii(c->title), case_sensitive, 0, NULL );
             }
 
             if ( !test && c->name[0] != '\0' ) {
-                test = token_match ( ftokens, c->name, case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->name, is_not_ascii(c->title), case_sensitive, 0, NULL );
             }
 
             if ( test == 0 ) {
diff --git a/source/helper.c b/source/helper.c
index fd3ee577..3173055f 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -310,11 +310,13 @@ int find_arg_char ( const char * const key, char *val )
  * Shared 'token_match' function.
  * Matches tokenized.
  */
-static int fuzzy_token_match ( char **tokens, const char *input, int case_sensitive )
+static int fuzzy_token_match ( char **tokens, const char *input, __attribute__( (unused) ) int not_ascii,  int case_sensitive )
 {
     int  match  = 1;
     char *compk = token_collate_key ( input, case_sensitive );
     // Do a tokenized match.
+    // TODO: this doesn't work for unicode input, because it may split a codepoint which is over two bytes.
+    // TODO this does not use the non-ascii speed-up either.
     if ( tokens ) {
         for ( int j = 0; match && tokens[j]; j++ ) {
             char *t        = compk;
@@ -331,28 +333,33 @@ static int fuzzy_token_match ( char **tokens, const char *input, int case_sensit
     g_free ( compk );
     return match;
 }
-static int normal_token_match ( char **tokens, const char *input, int case_sensitive )
+static int normal_token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive )
 {
     int  match  = 1;
-    char *compk = token_collate_key ( input, case_sensitive );
+    char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input;
 
     // Do a tokenized match.
+
     if ( tokens ) {
-        for ( int j = 0; match && tokens[j]; j++ ) {
-            match = ( strstr ( compk, tokens[j] ) != NULL );
-        }
+      char *(*comparison)(const char *, const char *);
+      comparison = (case_sensitive || not_ascii) ? strstr : strcasestr;
+      for ( int j = 0; match && tokens[j]; j++ ) {
+        match = (comparison( compk, tokens[j] ) != NULL );
+      }
     }
-    g_free ( compk );
+
+    if (not_ascii) g_free ( compk );
+
     return match;
 }
-int token_match ( char **tokens, const char *input, int case_sensitive,
+int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive,
                   __attribute__( ( unused ) ) unsigned int index,
                   __attribute__( ( unused ) ) Switcher *data )
 {
     if ( config.fuzzy ) {
-        return fuzzy_token_match ( tokens, input, case_sensitive );
+        return fuzzy_token_match ( tokens, input, not_ascii, case_sensitive );
     }
-    return normal_token_match ( tokens, input, case_sensitive );
+    return normal_token_match ( tokens, input, not_ascii, case_sensitive );
 }
 
 int execute_generator ( const char * cmd )
@@ -478,3 +485,12 @@ void config_sanity_check (  )
         config.menu_bg_alt = config.menu_bg;
     }
 }
+
+int is_not_ascii ( const char * str )
+{
+   while (*str > 0) {
+     str++;
+   }
+   if (*str) return 1;
+   return 0;
+}
diff --git a/source/rofi.c b/source/rofi.c
index d22eb54c..c05579f3 100644
--- a/source/rofi.c
+++ b/source/rofi.c
@@ -244,6 +244,7 @@ typedef struct MenuState
     unsigned int *selected_line;
     MenuReturn   retv;
     char         **lines;
+    int          *lines_not_ascii;
     int          line_height;
 }MenuState;
 
@@ -307,6 +308,8 @@ static void menu_free_state ( MenuState *state )
     g_free ( state->boxes );
     g_free ( state->line_map );
     g_free ( state->distance );
+
+    g_free ( state->lines_not_ascii );
 }
 
 /**
@@ -724,7 +727,7 @@ static void menu_refilter ( MenuState *state )
 
         // input changed
         for ( unsigned int i = 0; i < state->num_lines; i++ ) {
-            int match = state->sw->token_match ( tokens, state->lines[i], config.case_sensitive, i, state->sw );
+            int match = state->sw->token_match ( tokens, state->lines[i], state->lines_not_ascii[i], config.case_sensitive, i, state->sw );
 
             // If each token was matched, add it to list.
             if ( match ) {
@@ -749,6 +752,7 @@ static void menu_refilter ( MenuState *state )
         }
         state->filtered_lines = state->num_lines;
     }
+
     state->selected = MIN ( state->selected, state->filtered_lines - 1 );
 
     if ( config.auto_select == TRUE && state->filtered_lines == 1 && state->num_lines > 1 ) {
@@ -1013,6 +1017,14 @@ MenuReturn menu ( Switcher *sw, char **input, char *prompt, unsigned int *select
     };
     // Request the lines to show.
     state.lines = sw->get_data ( &( state.num_lines ), sw );
+    state.lines_not_ascii = g_malloc0_n( state.num_lines, sizeof( int ) );
+
+    // Flag the lines that contain non-ascii bytes, so matching can take the cheaper ascii path.
+    // Bounded by num_lines (the size of lines_not_ascii); does not rely on lines being non-NULL or NULL-terminated.
+    for ( unsigned int line = 0; line < state.num_lines; line++ ) {
+      state.lines_not_ascii[line] = is_not_ascii(state.lines[line]);
+    }
+
     if ( next_pos ) {
         *next_pos = *selected_line;
     }

From 0a953a03b407dddcb6d73dc978ddb6e81a741981 Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Thu, 1 Oct 2015 13:45:23 +0100
Subject: [PATCH 3/6] Make fuzzy matching fast and unicode-happy

---
 source/helper.c | 72 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 12 deletions(-)

diff --git a/source/helper.c b/source/helper.c
index 3173055f..40b0a09f 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -37,6 +37,7 @@
 #include <sys/types.h>
 #include <sys/file.h>
 #include <sys/stat.h>
+#include <ctype.h>
 #include "helper.h"
 #include "rofi.h"
 
@@ -306,6 +307,35 @@ int find_arg_char ( const char * const key, char *val )
     return FALSE;
 }
 
+/*
+ * auxiliary to `fuzzy-token-match' below;
+ */
+static void advance_unicode_glyph( char** token_in, char** input_in ) {
+  // determine the end of the glyph from token
+
+  char *token = *token_in;
+  char *input = *input_in;
+
+  while (*token < 0) {
+    token++;
+  }
+
+  // now we know the glyph length, we can scan for that substring in input
+  // temporarily add a null-terminator in case:
+  char glyph_end = *token;
+  *token = 0;
+  char *match = strstr(input, *token_in);
+  *token = glyph_end;
+
+  if ( match ) {
+    *token_in = token;
+    *input_in = match;
+  } else {
+    // wind input along to the end so that we fail
+    while ( **input_in ) (*input_in)++;
+  }
+}
+
 /**
  * Shared 'token_match' function.
  * Matches tokenized.
@@ -313,43 +343,61 @@ int find_arg_char ( const char * const key, char *val )
 static int fuzzy_token_match ( char **tokens, const char *input, __attribute__( (unused) ) int not_ascii,  int case_sensitive )
 {
     int  match  = 1;
-    char *compk = token_collate_key ( input, case_sensitive );
+    // Returns 1 when, for every token, its characters occur in order (not necessarily adjacent) in input.
     // Do a tokenized match.
+
     // TODO: this doesn't work for unicode input, because it may split a codepoint which is over two bytes.
-    // TODO this does not use the non-ascii speed-up either.
+    //       mind you, it didn't work before I fiddled with it.
+
+    // this could perhaps be a bit more efficient by iterating over all the tokens at once.
+
+    fprintf(stderr, "fz match %s %d\n", input, not_ascii);
+
     if ( tokens ) {
+        char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input;
         for ( int j = 0; match && tokens[j]; j++ ) {
             char *t        = compk;
-            int  token_len = strlen ( tokens[j] );
-            for ( int id = 0; match && t != NULL && id < token_len; id++ ) {
-                match = ( ( t = strchr ( t, tokens[j][id] ) ) != NULL );
-                // next should match the next character.
-                if ( t != NULL ) {
-                    t++;
+            char *token    = tokens[j];
+            // Walk compk once, consuming token bytes in order as they are found.
+            while (*t && *token) {
+              if ( *token > 0 ) // i.e. we are at an ascii codepoint
+                {
+                  if ( ( case_sensitive && (*t == *token)) ||
+                       (!case_sensitive && (tolower((unsigned char) *t) == tolower((unsigned char) *token))) )
+                    token++;
                 }
+              else
+                {
+                  // we are not at an ascii codepoint, and so we need to do something
+                  // complicated
+                  advance_unicode_glyph( &token, &t );
+                }
+              if ( *t ) t++; // advance_unicode_glyph may leave t on the terminator; never step past it
             }
+
+            match = !(*token);
         }
+        if (not_ascii) g_free ( compk );
     }
-    g_free ( compk );
+
     return match;
 }
 static int normal_token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive )
 {
     int  match  = 1;
-    char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input;
 
     // Do a tokenized match.
 
     if ( tokens ) {
+      char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input;
       char *(*comparison)(const char *, const char *);
       comparison = (case_sensitive || not_ascii) ? strstr : strcasestr;
       for ( int j = 0; match && tokens[j]; j++ ) {
         match = (comparison( compk, tokens[j] ) != NULL );
       }
+      if (not_ascii) g_free ( compk );
     }
 
-    if (not_ascii) g_free ( compk );
-
     return match;
 }
 int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive,

From d1a5b0d36258b3659b9b4ba8d0ef6b784e2a2af7 Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Thu, 1 Oct 2015 13:46:50 +0100
Subject: [PATCH 4/6] whoops, rogue print statement

---
 source/helper.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/source/helper.c b/source/helper.c
index 40b0a09f..4db21ab4 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -351,8 +351,6 @@ static int fuzzy_token_match ( char **tokens, const char *input, __attribute__(
 
     // this could perhaps be a bit more efficient by iterating over all the tokens at once.
 
-    fprintf(stderr, "fz match %s %d\n", input, not_ascii);
-
     if ( tokens ) {
         char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input;
         for ( int j = 0; match && tokens[j]; j++ ) {

From 3ba2da9cae0c48995e9a547defe029fe6f6fdd0c Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Thu, 1 Oct 2015 18:18:01 +0100
Subject: [PATCH 5/6] Fix mistakes from copy paste

---
 source/dialogs/window.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/dialogs/window.c b/source/dialogs/window.c
index cc633786..df50f76b 100644
--- a/source/dialogs/window.c
+++ b/source/dialogs/window.c
@@ -343,15 +343,15 @@ static int window_match ( char **tokens, __attribute__( ( unused ) ) const char
             }
 
             if ( !test && c->class[0] != '\0' ) {
-                test = token_match ( ftokens, c->class, is_not_ascii(c->title), case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->class, is_not_ascii(c->class), case_sensitive, 0, NULL );
             }
 
             if ( !test && c->role[0] != '\0' ) {
-                test = token_match ( ftokens, c->role, is_not_ascii(c->title), case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->role, is_not_ascii(c->role), case_sensitive, 0, NULL );
             }
 
             if ( !test && c->name[0] != '\0' ) {
-                test = token_match ( ftokens, c->name, is_not_ascii(c->title), case_sensitive, 0, NULL );
+                test = token_match ( ftokens, c->name, is_not_ascii(c->name), case_sensitive, 0, NULL );
             }
 
             if ( test == 0 ) {

From c2d8cb1f45f3f592a87223cae5e2c495bb204934 Mon Sep 17 00:00:00 2001
From: Tom Hinton <tom.hinton@cse.org.uk>
Date: Fri, 2 Oct 2015 09:58:56 +0100
Subject: [PATCH 6/6] Move ascii testing gizmos to macros at the top.

Still not sure about best way to do this.
---
 source/helper.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/helper.c b/source/helper.c
index 4db21ab4..69c8f975 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -44,6 +44,10 @@
 static int  stored_argc   = 0;
 static char **stored_argv = NULL;
 
+// Classify bytes through unsigned char so the result does not depend on whether plain char is signed.
+#define NON_ASCII_NON_NULL( x ) ( ( (unsigned char) ( x ) ) >= 0x80 )      /* bytes 0x80..0xFF */
+#define ASCII_NON_NULL( x ) ( ( (unsigned char) ( x ) ) - 1u < 0x7Fu )    /* bytes 0x01..0x7F; x evaluated once */
+
 void cmd_set_arguments ( int argc, char **argv )
 {
     stored_argc = argc;
@@ -316,7 +320,7 @@ static void advance_unicode_glyph( char** token_in, char** input_in ) {
   char *token = *token_in;
   char *input = *input_in;
 
-  while (*token < 0) {
+  while (NON_ASCII_NON_NULL(*token)) {
     token++;
   }
 
@@ -534,7 +538,7 @@ void config_sanity_check (  )
 
 int is_not_ascii ( const char * str )
 {
-   while (*str > 0) {
+   while (ASCII_NON_NULL(*str)) {
      str++;
    }
    if (*str) return 1;