From 574bf2da828b4e3ab0ce3ce7fccd58879db60430 Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Thu, 1 Oct 2015 11:41:44 +0100 Subject: [PATCH 1/6] Make dmenu reading very marginally faster A slight reduction in use of realloc and avoidance of 3 or 4 strlens for a string we know the length of --- include/helper.h | 2 +- source/dialogs/dmenu.c | 28 ++++++++++++++++++++-------- source/helper.c | 4 ++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/helper.h b/include/helper.h index cdd4bc7f..6d6f1826 100644 --- a/include/helper.h +++ b/include/helper.h @@ -16,7 +16,7 @@ int helper_parse_setup ( char * string, char ***output, int *length, ... ); /** * Implementation of fgets with custom separator. */ -char* fgets_s ( char* s, int n, FILE *iop, char sep ); +char* fgets_s ( char* s, unsigned int n, FILE *iop, char sep ); /** * @param token The string for which we want a collation key. diff --git a/source/dialogs/dmenu.c b/source/dialogs/dmenu.c index 1ef5eb2d..b82ca50e 100644 --- a/source/dialogs/dmenu.c +++ b/source/dialogs/dmenu.c @@ -59,28 +59,40 @@ typedef struct _DmenuModePrivateData static char **get_dmenu ( unsigned int *length ) { - char buffer[1024]; + const unsigned int buf_size = 1024; + char buffer[buf_size]; char **retv = NULL; + char *buffer_end = NULL; + unsigned int rvlength = 1; *length = 0; - while ( fgets_s ( buffer, 1024, stdin, (char) config.separator ) != NULL ) { - retv = g_realloc ( retv, ( ( *length ) + 2 ) * sizeof ( char* ) ); - retv[( *length )] = g_strdup ( buffer ); - retv[( *length ) + 1] = NULL; + while ( ( buffer_end = fgets_s ( buffer, buf_size, stdin, (char) config.separator ) ) != NULL ) { + if (rvlength < (*length + 2)) { + rvlength *= 2; + retv = g_realloc ( retv, ( rvlength ) * sizeof ( char* ) ); + } + + size_t blength = buffer_end - &(buffer[0]); + + char *copy = g_malloc( blength + 1 ); + memcpy(copy, buffer, blength); // Filter out line-end. 
- if ( retv[( *length )][strlen ( buffer ) - 1] == '\n' ) { - retv[( *length )][strlen ( buffer ) - 1] = '\0'; + if ( copy[blength] == '\n' ) { + copy[blength] = '\0'; } + retv[( *length )] = copy; + retv[( *length ) + 1] = NULL; + ( *length )++; // Stop when we hit 2³¹ entries. if ( ( *length ) == INT_MAX ) { return retv; } } - + retv = g_realloc ( retv, ( *length + 1 ) * sizeof ( char* ) ); return retv; } diff --git a/source/helper.c b/source/helper.c index 94b90a28..fd3ee577 100644 --- a/source/helper.c +++ b/source/helper.c @@ -52,7 +52,7 @@ void cmd_set_arguments ( int argc, char **argv ) /** * `fgets` implementation with custom separator. */ -char* fgets_s ( char* s, int n, FILE *iop, char sep ) +char* fgets_s ( char* s, unsigned int n, FILE *iop, char sep ) { // Map these to registers. register int c = EOF; @@ -72,7 +72,7 @@ char* fgets_s ( char* s, int n, FILE *iop, char sep ) *cs = '\0'; // if last read was end of file and current index is start, we are done: // Return NULL. - return ( c == EOF && cs == s ) ? NULL : s; + return ( c == EOF && cs == s ) ? NULL : cs; } /** From af6a4b83ebdb83f24b6913b372e207bcf245ea0c Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Thu, 1 Oct 2015 12:16:41 +0100 Subject: [PATCH 2/6] Make normal filtering of plain ASCII lines faster This patch adds a field lines_not_ascii to the MenuState structure. The nth entry is 0 unless the nth member of MenuState.lines has a non-ascii codepoint in it. All comparison functions (menu_match_cb type) take an additional argument to tell them if the thing they are matching is not_ascii. They can use this to determine whether to collate and case-fold the input (for non-ascii strings), or whether to use strstr/strcasestr (for ascii strings). The change is not currently implemented for flex matching, due to my laziness. However, it should be a simple enough matter to add. For my large input of 400,000 lines, this reduces typical filtering time to about ten microseconds from about 2 seconds. 
--- include/helper.h | 9 ++++++++- include/rofi.h | 2 +- source/dialogs/combi.c | 6 +++--- source/dialogs/dmenu.c | 2 +- source/dialogs/window.c | 9 +++++---- source/helper.c | 36 ++++++++++++++++++++++++++---------- source/rofi.c | 14 +++++++++++++- 7 files changed, 57 insertions(+), 21 deletions(-) diff --git a/include/helper.h b/include/helper.h index 6d6f1826..d75bfe35 100644 --- a/include/helper.h +++ b/include/helper.h @@ -102,7 +102,7 @@ int find_arg ( const char * const key ); * * @returns 1 when matches, 0 otherwise */ -int token_match ( char **tokens, const char *input, int case_sensitive, +int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive, __attribute__( ( unused ) ) unsigned int index, __attribute__( ( unused ) ) Switcher * data ); @@ -152,4 +152,11 @@ char helper_parse_char ( const char *arg ); * Set the application arguments. */ void cmd_set_arguments ( int argc, char **argv ); + +/** + * @param str a UTF8 string + * @return 1 if the string contains any non-ascii codepoints + */ +int is_not_ascii ( const char *str ); + #endif // ROFI_HELPER_H diff --git a/include/rofi.h b/include/rofi.h index bf7ee333..1aff2350 100644 --- a/include/rofi.h +++ b/include/rofi.h @@ -78,7 +78,7 @@ typedef enum * * @returns 1 when it matches, 0 if not. */ -typedef int ( *menu_match_cb )( char **tokens, const char *input, int case_sensitive, unsigned int index, Switcher *data ); +typedef int ( *menu_match_cb )( char **tokens, const char *input, int not_ascii, int case_sensitive, unsigned int index, Switcher *data ); /** * @param sw the Switcher to show. 
diff --git a/source/dialogs/combi.c b/source/dialogs/combi.c index 1002de08..8f67cd2f 100644 --- a/source/dialogs/combi.c +++ b/source/dialogs/combi.c @@ -169,7 +169,7 @@ static SwitcherMode combi_mode_result ( int mretv, char **input, unsigned int se } return MODE_EXIT; } -static int combi_mode_match ( char **tokens, const char *input, +static int combi_mode_match ( char **tokens, const char *input, int not_ascii, int case_sensitive, unsigned int index, Switcher *sw ) { CombiModePrivateData *pd = sw->private_data; @@ -178,13 +178,13 @@ static int combi_mode_match ( char **tokens, const char *input, if ( index >= pd->starts[i] && index < ( pd->starts[i] + pd->lengths[i] ) ) { if ( tokens && input[0] && tokens[0][0] == '!' ) { if ( tokens[0][1] == pd->switchers[i]->name[0] ) { - return pd->switchers[i]->token_match ( &tokens[1], input, case_sensitive, + return pd->switchers[i]->token_match ( &tokens[1], input, not_ascii, case_sensitive, index - pd->starts[i], pd->switchers[i] ); } return 0; } else { - return pd->switchers[i]->token_match ( tokens, input, case_sensitive, + return pd->switchers[i]->token_match ( tokens, input, not_ascii, case_sensitive, index - pd->starts[i], pd->switchers[i] ); } } diff --git a/source/dialogs/dmenu.c b/source/dialogs/dmenu.c index b82ca50e..53fe2e4f 100644 --- a/source/dialogs/dmenu.c +++ b/source/dialogs/dmenu.c @@ -316,7 +316,7 @@ int dmenu_switcher_dialog ( void ) char **tokens = tokenize ( select, config.case_sensitive ); unsigned int i = 0; for ( i = 0; i < cmd_list_length; i++ ) { - if ( token_match ( tokens, cmd_list[i], config.case_sensitive, 0, NULL ) ) { + if ( token_match ( tokens, cmd_list[i], is_not_ascii(cmd_list[i]), config.case_sensitive, 0, NULL ) ) { pd->selected_line = i; break; } diff --git a/source/dialogs/window.c b/source/dialogs/window.c index 6a8821dd..cc633786 100644 --- a/source/dialogs/window.c +++ b/source/dialogs/window.c @@ -322,6 +322,7 @@ typedef struct _SwitcherModePrivateData } 
SwitcherModePrivateData; static int window_match ( char **tokens, __attribute__( ( unused ) ) const char *input, + __attribute__( ( unused) ) int not_ascii, int case_sensitive, unsigned int index, Switcher *sw ) { SwitcherModePrivateData *rmpd = (SwitcherModePrivateData *) sw->private_data; @@ -338,19 +339,19 @@ static int window_match ( char **tokens, __attribute__( ( unused ) ) const char // e.g. when searching 'title element' and 'class element' char *ftokens[2] = { tokens[j], NULL }; if ( !test && c->title[0] != '\0' ) { - test = token_match ( ftokens, c->title, case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->title, is_not_ascii(c->title), case_sensitive, 0, NULL ); } if ( !test && c->class[0] != '\0' ) { - test = token_match ( ftokens, c->class, case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->class, is_not_ascii(c->title), case_sensitive, 0, NULL ); } if ( !test && c->role[0] != '\0' ) { - test = token_match ( ftokens, c->role, case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->role, is_not_ascii(c->title), case_sensitive, 0, NULL ); } if ( !test && c->name[0] != '\0' ) { - test = token_match ( ftokens, c->name, case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->name, is_not_ascii(c->title), case_sensitive, 0, NULL ); } if ( test == 0 ) { diff --git a/source/helper.c b/source/helper.c index fd3ee577..3173055f 100644 --- a/source/helper.c +++ b/source/helper.c @@ -310,11 +310,13 @@ int find_arg_char ( const char * const key, char *val ) * Shared 'token_match' function. * Matches tokenized. */ -static int fuzzy_token_match ( char **tokens, const char *input, int case_sensitive ) +static int fuzzy_token_match ( char **tokens, const char *input, __attribute__( (unused) ) int not_ascii, int case_sensitive ) { int match = 1; char *compk = token_collate_key ( input, case_sensitive ); // Do a tokenized match. + // TODO: this doesn't work for unicode input, because it may split a codepoint which is over two bytes. 
+ // TODO this does not use the non-ascii speed-up either. if ( tokens ) { for ( int j = 0; match && tokens[j]; j++ ) { char *t = compk; @@ -331,28 +333,33 @@ static int fuzzy_token_match ( char **tokens, const char *input, int case_sensit g_free ( compk ); return match; } -static int normal_token_match ( char **tokens, const char *input, int case_sensitive ) +static int normal_token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive ) { int match = 1; - char *compk = token_collate_key ( input, case_sensitive ); + char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input; // Do a tokenized match. + if ( tokens ) { - for ( int j = 0; match && tokens[j]; j++ ) { - match = ( strstr ( compk, tokens[j] ) != NULL ); - } + char *(*comparison)(const char *, const char *); + comparison = (case_sensitive || not_ascii) ? strstr : strcasestr; + for ( int j = 0; match && tokens[j]; j++ ) { + match = (comparison( compk, tokens[j] ) != NULL ); + } } - g_free ( compk ); + + if (not_ascii) g_free ( compk ); + return match; } -int token_match ( char **tokens, const char *input, int case_sensitive, +int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive, __attribute__( ( unused ) ) unsigned int index, __attribute__( ( unused ) ) Switcher *data ) { if ( config.fuzzy ) { - return fuzzy_token_match ( tokens, input, case_sensitive ); + return fuzzy_token_match ( tokens, input, not_ascii, case_sensitive ); } - return normal_token_match ( tokens, input, case_sensitive ); + return normal_token_match ( tokens, input, not_ascii, case_sensitive ); } int execute_generator ( const char * cmd ) @@ -478,3 +485,12 @@ void config_sanity_check ( ) config.menu_bg_alt = config.menu_bg; } } + +int is_not_ascii ( const char * str ) +{ + while (*str > 0) { + str++; + } + if (*str) return 1; + return 0; +} diff --git a/source/rofi.c b/source/rofi.c index d22eb54c..c05579f3 100644 --- a/source/rofi.c +++ b/source/rofi.c @@ 
-244,6 +244,7 @@ typedef struct MenuState unsigned int *selected_line; MenuReturn retv; char **lines; + int *lines_not_ascii; int line_height; }MenuState; @@ -307,6 +308,8 @@ static void menu_free_state ( MenuState *state ) g_free ( state->boxes ); g_free ( state->line_map ); g_free ( state->distance ); + + g_free ( state->lines_not_ascii ); } /** @@ -724,7 +727,7 @@ static void menu_refilter ( MenuState *state ) // input changed for ( unsigned int i = 0; i < state->num_lines; i++ ) { - int match = state->sw->token_match ( tokens, state->lines[i], config.case_sensitive, i, state->sw ); + int match = state->sw->token_match ( tokens, state->lines[i], state->lines_not_ascii[i], config.case_sensitive, i, state->sw ); // If each token was matched, add it to list. if ( match ) { @@ -749,6 +752,7 @@ static void menu_refilter ( MenuState *state ) } state->filtered_lines = state->num_lines; } + state->selected = MIN ( state->selected, state->filtered_lines - 1 ); if ( config.auto_select == TRUE && state->filtered_lines == 1 && state->num_lines > 1 ) { @@ -1013,6 +1017,14 @@ MenuReturn menu ( Switcher *sw, char **input, char *prompt, unsigned int *select }; // Request the lines to show. state.lines = sw->get_data ( &( state.num_lines ), sw ); + state.lines_not_ascii = g_malloc0_n( state.num_lines, sizeof( int ) ); + + // find out which lines contain non-ascii codepoints, so we can be faster in some cases. 
+ + for (unsigned int line = 0; state.lines[line]; line++) { + state.lines_not_ascii[line] = is_not_ascii(state.lines[line]); + } + if ( next_pos ) { *next_pos = *selected_line; } From 0a953a03b407dddcb6d73dc978ddb6e81a741981 Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Thu, 1 Oct 2015 13:45:23 +0100 Subject: [PATCH 3/6] Make fuzzy matching fast and unicode-happy --- source/helper.c | 72 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/source/helper.c b/source/helper.c index 3173055f..40b0a09f 100644 --- a/source/helper.c +++ b/source/helper.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "helper.h" #include "rofi.h" @@ -306,6 +307,35 @@ int find_arg_char ( const char * const key, char *val ) return FALSE; } +/* + * auxiliary to `fuzzy-token-match' below; + */ +static void advance_unicode_glyph( char** token_in, char** input_in ) { + // determine the end of the glyph from token + + char *token = *token_in; + char *input = *input_in; + + while (*token < 0) { + token++; + } + + // now we know the glyph length, we can scan for that substring in input + // temporarily add a null-terminator in case: + char glyph_end = *token; + *token = 0; + char *match = strstr(input, *token_in); + *token = glyph_end; + + if ( match ) { + *token_in = token; + *input_in = match; + } else { + // wind input along to the end so that we fail + while ( **input_in ) (*input_in)++; + } +} + /** * Shared 'token_match' function. * Matches tokenized. @@ -313,43 +343,61 @@ int find_arg_char ( const char * const key, char *val ) static int fuzzy_token_match ( char **tokens, const char *input, __attribute__( (unused) ) int not_ascii, int case_sensitive ) { int match = 1; - char *compk = token_collate_key ( input, case_sensitive ); + // Do a tokenized match. + // TODO: this doesn't work for unicode input, because it may split a codepoint which is over two bytes. 
- // TODO this does not use the non-ascii speed-up either. + // mind you, it didn't work before I fiddled with it. + + // this could perhaps be a bit more efficient by iterating over all the tokens at once. + + fprintf(stderr, "fz match %s %d\n", input, not_ascii); + if ( tokens ) { + char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input; for ( int j = 0; match && tokens[j]; j++ ) { char *t = compk; - int token_len = strlen ( tokens[j] ); - for ( int id = 0; match && t != NULL && id < token_len; id++ ) { - match = ( ( t = strchr ( t, tokens[j][id] ) ) != NULL ); - // next should match the next character. - if ( t != NULL ) { - t++; + char *token = tokens[j]; + + while (*t && *token) { + if ( *token > 0 ) // i.e. we are at an ascii codepoint + { + if ( ( case_sensitive && (*t == *token)) || + (!case_sensitive && (tolower(*t) == tolower(*token))) ) + token++; } + else + { + // we are not at an ascii codepoint, and so we need to do something + // complicated + advance_unicode_glyph( &token, &t ); + } + t++; } + + match = !(*token); } + if (not_ascii) g_free ( compk ); } - g_free ( compk ); + return match; } static int normal_token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive ) { int match = 1; - char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input; // Do a tokenized match. if ( tokens ) { + char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input; char *(*comparison)(const char *, const char *); comparison = (case_sensitive || not_ascii) ? 
strstr : strcasestr; for ( int j = 0; match && tokens[j]; j++ ) { match = (comparison( compk, tokens[j] ) != NULL ); } + if (not_ascii) g_free ( compk ); } - if (not_ascii) g_free ( compk ); - return match; } int token_match ( char **tokens, const char *input, int not_ascii, int case_sensitive, From d1a5b0d36258b3659b9b4ba8d0ef6b784e2a2af7 Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Thu, 1 Oct 2015 13:46:50 +0100 Subject: [PATCH 4/6] whoops, rogue print statement --- source/helper.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/helper.c b/source/helper.c index 40b0a09f..4db21ab4 100644 --- a/source/helper.c +++ b/source/helper.c @@ -351,8 +351,6 @@ static int fuzzy_token_match ( char **tokens, const char *input, __attribute__( // this could perhaps be a bit more efficient by iterating over all the tokens at once. - fprintf(stderr, "fz match %s %d\n", input, not_ascii); - if ( tokens ) { char *compk = not_ascii ? token_collate_key ( input, case_sensitive ) : (char *) input; for ( int j = 0; match && tokens[j]; j++ ) { From 3ba2da9cae0c48995e9a547defe029fe6f6fdd0c Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Thu, 1 Oct 2015 18:18:01 +0100 Subject: [PATCH 5/6] Fix mistakes from copy paste --- source/dialogs/window.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/dialogs/window.c b/source/dialogs/window.c index cc633786..df50f76b 100644 --- a/source/dialogs/window.c +++ b/source/dialogs/window.c @@ -343,15 +343,15 @@ static int window_match ( char **tokens, __attribute__( ( unused ) ) const char } if ( !test && c->class[0] != '\0' ) { - test = token_match ( ftokens, c->class, is_not_ascii(c->title), case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->class, is_not_ascii(c->class), case_sensitive, 0, NULL ); } if ( !test && c->role[0] != '\0' ) { - test = token_match ( ftokens, c->role, is_not_ascii(c->title), case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->role, is_not_ascii(c->role), 
case_sensitive, 0, NULL ); } if ( !test && c->name[0] != '\0' ) { - test = token_match ( ftokens, c->name, is_not_ascii(c->title), case_sensitive, 0, NULL ); + test = token_match ( ftokens, c->name, is_not_ascii(c->name), case_sensitive, 0, NULL ); } if ( test == 0 ) { From c2d8cb1f45f3f592a87223cae5e2c495bb204934 Mon Sep 17 00:00:00 2001 From: Tom Hinton Date: Fri, 2 Oct 2015 09:58:56 +0100 Subject: [PATCH 6/6] Move ascii testing gizmos to macros at the top. Still not sure about best way to do this. --- source/helper.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/helper.c b/source/helper.c index 4db21ab4..69c8f975 100644 --- a/source/helper.c +++ b/source/helper.c @@ -44,6 +44,10 @@ static int stored_argc = 0; static char **stored_argv = NULL; +// TODO: is this safe? Not portably: both macros assume plain char is signed, so that bytes >= 0x80 compare negative; where plain char is unsigned every non-ASCII byte is misclassified as ASCII. +#define NON_ASCII_NON_NULL( x ) ( ((x) < 0) ) +#define ASCII_NON_NULL( x ) ( ((x) > 0) ) + void cmd_set_arguments ( int argc, char **argv ) { stored_argc = argc; @@ -316,7 +320,7 @@ static void advance_unicode_glyph( char** token_in, char** input_in ) { char *token = *token_in; char *input = *input_in; - while (*token < 0) { + while (NON_ASCII_NON_NULL(*token)) { token++; } @@ -534,7 +538,7 @@ void config_sanity_check ( ) int is_not_ascii ( const char * str ) { - while (*str > 0) { + while (ASCII_NON_NULL(*str)) { str++; } if (*str) return 1;