From 46541694a0806dc6c90c3bb19d4ab95d234abc27 Mon Sep 17 00:00:00 2001 From: William Casarin Date: Mon, 13 Jan 2025 18:11:38 -0800 Subject: [PATCH] nostrdb: search: sort search terms from largest to smallest Add a helper for sorting search words from largest to smallest. This should help search performance. For example, let's say our search index is like so: "the pokemon is cool" the the the ... * 1000 Our root word search would have to start 1000 new recursive queries. By sorting by the largest word: pokemon pokemon pokemon ... * 10 We only have to do 10 recursive searches, assuming larger words are less common, which will likely be the case most of the time Signed-off-by: William Casarin --- nostrdb/src/nostrdb.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nostrdb/src/nostrdb.c b/nostrdb/src/nostrdb.c index 3b501b0f..73f10819 100644 --- a/nostrdb/src/nostrdb.c +++ b/nostrdb/src/nostrdb.c @@ -3990,6 +3990,32 @@ void ndb_text_search_config_set_limit(struct ndb_text_search_config *cfg, int li cfg->limit = limit; } +static int compare_search_words(const void *pa, const void *pb) +{ + struct ndb_word *a, *b; + + a = (struct ndb_word *)pa; + b = (struct ndb_word *)pb; + + if (a->word_len == b->word_len) { + return 0; + } else if (a->word_len > b->word_len) { + // biggest words should be at the front of the list, + // so we say it's "smaller" here + return -1; + } else { + return 1; + } +} + +// Sort search words from largest to smallest. Larger words are less likely +// in the index, allowing our scan to walk fewer words at the root when +// recursively matching. +void sort_largest_to_smallest(struct ndb_search_words *words) +{ + qsort(words->words, words->num_words, sizeof(words->words[0]), compare_search_words); +} + int ndb_text_search(struct ndb_txn *txn, const char *query, struct ndb_text_search_results *results, struct ndb_text_search_config *config) @@ -4038,6 +4064,11 @@ int ndb_text_search(struct ndb_txn *txn, const char *query, return 0; } + // TODO: sort words from largest to smallest. This should complete the + // query quicker because the larger words are likely to have fewer + // entries in the search index. + sort_largest_to_smallest(&search_words); + // for each word, we recursively find all of the submatches while (results->num_results < limit) { last_result = NULL;