nostrdb: search: sort search terms from largest to smallest

Add a helper for sorting search words from largest to smallest. This should help search performance.

For example, let's say our search index is like so:

  "the pokemon is cool"

  the the the ... * 1000

Our root word search would have to start 1000 new recursive queries. By sorting by the largest word:

  pokemon pokemon pokemon ... * 10

we only have to do 10 recursive searches, assuming larger words are less common, which will likely be the case most of the time.

Signed-off-by: William Casarin <jb55@jb55.com>
2025-01-13 18:11:38 -08:00
parent 04d4ff4e99
commit 46541694a0
1 changed files with 31 additions and 0 deletions
--- a/nostrdb/src/nostrdb.c
+++ b/nostrdb/src/nostrdb.c
@@ -3990,6 +3990,32 @@ void ndb_text_search_config_set_limit(struct ndb_text_search_config *cfg, int li
 	cfg->limit = limit;
 }

+// qsort comparator ordering search words by descending length: returns -1
+// when a is longer than b, 1 when it is shorter and 0 when the lengths are
+// equal, so the longest words end up at the front of the sorted array.
+static int compare_search_words(const void *pa, const void *pb)
+{
+	// keep the const qualifier qsort hands us; we only read word_len
+	const struct ndb_word *a = pa;
+	const struct ndb_word *b = pb;
+
+	if (a->word_len > b->word_len) {
+		// biggest words should be at the front of the list,
+		// so we say it's "smaller" here
+		return -1;
+	}
+	// a is not longer than b: it either sorts after b or ties with it
+	return a->word_len < b->word_len ? 1 : 0;
+}
+
+// Put the longest search words first. Longer words are assumed to be rarer
+// in the index, so the recursive match has fewer root entries to walk.
+void sort_largest_to_smallest(struct ndb_search_words *words)
+{
+	qsort(words->words, words->num_words, sizeof(*words->words),
+	      compare_search_words);
+}
+
 int ndb_text_search(struct ndb_txn *txn, const char *query,
 		    struct ndb_text_search_results *results,
 		    struct ndb_text_search_config *config)
@@ -4038,6 +4064,11 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
 		return 0;
 	}

+	// Sort words from largest to smallest. This should complete the
+	// query quicker because the larger words are likely to have fewer
+	// entries in the search index.
+	sort_largest_to_smallest(&search_words);
+
 	// for each word, we recursively find all of the submatches
 	while (results->num_results < limit) {
 		last_result = NULL;