nostrdb: Implement nip50 fulltext searching
This adds support for nip50 fulltext searches. This allows you to use the nostrdb query interface for executing fulltext searches instead of the typical `ndb_text_search` api. The benefits of this include a standardized query interface that also further filters on other fields in the filter. Changelog-Added: Add nip50 search filters and queries Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
committed by
Daniel D’Aquino
parent
094cf5e8cc
commit
37f9c93705
@@ -230,6 +230,7 @@ enum ndb_query_plan {
|
|||||||
NDB_PLAN_AUTHOR_KINDS,
|
NDB_PLAN_AUTHOR_KINDS,
|
||||||
NDB_PLAN_CREATED,
|
NDB_PLAN_CREATED,
|
||||||
NDB_PLAN_TAGS,
|
NDB_PLAN_TAGS,
|
||||||
|
NDB_PLAN_SEARCH,
|
||||||
};
|
};
|
||||||
|
|
||||||
// A id + u64 + timestamp
|
// A id + u64 + timestamp
|
||||||
@@ -334,27 +335,27 @@ static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize,
|
|||||||
|
|
||||||
static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
|
static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
|
||||||
int wordlen, const char *word,
|
int wordlen, const char *word,
|
||||||
|
uint64_t since,
|
||||||
int *keysize)
|
int *keysize)
|
||||||
{
|
{
|
||||||
uint64_t timestamp, note_id;
|
uint64_t note_id;
|
||||||
timestamp = 0;
|
|
||||||
note_id = 0;
|
note_id = 0;
|
||||||
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
||||||
timestamp, note_id, keysize);
|
since, note_id, keysize);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
|
static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
|
||||||
int wordlen, const char *word,
|
int wordlen, const char *word,
|
||||||
|
uint64_t until,
|
||||||
int *keysize)
|
int *keysize)
|
||||||
{
|
{
|
||||||
uint64_t timestamp, note_id;
|
uint64_t note_id;
|
||||||
timestamp = INT32_MAX;
|
|
||||||
note_id = INT32_MAX;
|
note_id = INT32_MAX;
|
||||||
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
|
||||||
timestamp, note_id, keysize);
|
until, note_id, keysize);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize);
|
typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, uint64_t timestamp, int *keysize);
|
||||||
|
|
||||||
/** From LMDB: Compare two items lexically */
|
/** From LMDB: Compare two items lexically */
|
||||||
static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
|
static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
|
||||||
@@ -3044,6 +3045,43 @@ static int query_is_full(struct ndb_query_results *results, int limit)
|
|||||||
return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit;
|
return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ndb_query_plan_execute_search(struct ndb_txn *txn,
|
||||||
|
struct ndb_filter *filter,
|
||||||
|
struct ndb_query_results *results,
|
||||||
|
int limit)
|
||||||
|
{
|
||||||
|
const char *search;
|
||||||
|
int i;
|
||||||
|
struct ndb_text_search_results text_results;
|
||||||
|
struct ndb_text_search_result *text_result;
|
||||||
|
struct ndb_text_search_config config;
|
||||||
|
struct ndb_query_result result;
|
||||||
|
|
||||||
|
ndb_default_text_search_config(&config);
|
||||||
|
|
||||||
|
if (!(search = ndb_filter_find_search(filter)))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (!ndb_text_search_with(txn, search, &text_results, &config, filter))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
for (i = 0; i < text_results.num_results; i++) {
|
||||||
|
if (query_is_full(results, limit))
|
||||||
|
break;
|
||||||
|
|
||||||
|
text_result = &text_results.results[i];
|
||||||
|
|
||||||
|
result.note = text_result->note;
|
||||||
|
result.note_size = text_result->note_size;
|
||||||
|
result.note_id = text_result->key.note_id;
|
||||||
|
|
||||||
|
if (!push_query_result(results, &result))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
static int ndb_query_plan_execute_ids(struct ndb_txn *txn,
|
static int ndb_query_plan_execute_ids(struct ndb_txn *txn,
|
||||||
struct ndb_filter *filter,
|
struct ndb_filter *filter,
|
||||||
struct ndb_query_results *results,
|
struct ndb_query_results *results,
|
||||||
@@ -3456,15 +3494,18 @@ next:
|
|||||||
|
|
||||||
static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter)
|
static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter)
|
||||||
{
|
{
|
||||||
struct ndb_filter_elements *ids, *kinds, *authors, *tags;
|
struct ndb_filter_elements *ids, *kinds, *authors, *tags, *search;
|
||||||
|
|
||||||
ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS);
|
ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS);
|
||||||
|
search = ndb_filter_find_elements(filter, NDB_FILTER_SEARCH);
|
||||||
kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS);
|
kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS);
|
||||||
authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS);
|
authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS);
|
||||||
tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS);
|
tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS);
|
||||||
|
|
||||||
// this is rougly similar to the heuristic in strfry's dbscan
|
// this is rougly similar to the heuristic in strfry's dbscan
|
||||||
if (ids) {
|
if (search) {
|
||||||
|
return NDB_PLAN_SEARCH;
|
||||||
|
} else if (ids) {
|
||||||
return NDB_PLAN_IDS;
|
return NDB_PLAN_IDS;
|
||||||
} else if (kinds && authors && authors->count <= 10) {
|
} else if (kinds && authors && authors->count <= 10) {
|
||||||
return NDB_PLAN_AUTHOR_KINDS;
|
return NDB_PLAN_AUTHOR_KINDS;
|
||||||
@@ -3483,6 +3524,7 @@ static const char *ndb_query_plan_name(int plan_id)
|
|||||||
{
|
{
|
||||||
switch (plan_id) {
|
switch (plan_id) {
|
||||||
case NDB_PLAN_IDS: return "ids";
|
case NDB_PLAN_IDS: return "ids";
|
||||||
|
case NDB_PLAN_SEARCH: return "search";
|
||||||
case NDB_PLAN_KINDS: return "kinds";
|
case NDB_PLAN_KINDS: return "kinds";
|
||||||
case NDB_PLAN_TAGS: return "tags";
|
case NDB_PLAN_TAGS: return "tags";
|
||||||
case NDB_PLAN_CREATED: return "created";
|
case NDB_PLAN_CREATED: return "created";
|
||||||
@@ -3518,6 +3560,11 @@ static int ndb_query_filter(struct ndb_txn *txn, struct ndb_filter *filter,
|
|||||||
return 0;
|
return 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case NDB_PLAN_SEARCH:
|
||||||
|
if (!ndb_query_plan_execute_search(txn, filter, &results, limit))
|
||||||
|
return 0;
|
||||||
|
break;
|
||||||
|
|
||||||
// We have just kinds, just scan the kind index
|
// We have just kinds, just scan the kind index
|
||||||
case NDB_PLAN_KINDS:
|
case NDB_PLAN_KINDS:
|
||||||
if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit))
|
if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit))
|
||||||
@@ -4031,6 +4078,7 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
struct ndb_word *search_word;
|
struct ndb_word *search_word;
|
||||||
struct ndb_note *note;
|
struct ndb_note *note;
|
||||||
struct cursor cur;
|
struct cursor cur;
|
||||||
|
uint64_t since, until, timestamp_op, *pint, note_size;
|
||||||
ndb_text_search_key_order_fn key_order_fn;
|
ndb_text_search_key_order_fn key_order_fn;
|
||||||
MDB_dbi text_db;
|
MDB_dbi text_db;
|
||||||
MDB_cursor *cursor;
|
MDB_cursor *cursor;
|
||||||
@@ -4038,17 +4086,36 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
int i, j, keysize, saved_size, limit;
|
int i, j, keysize, saved_size, limit;
|
||||||
MDB_cursor_op op, order_op;
|
MDB_cursor_op op, order_op;
|
||||||
|
|
||||||
|
note_size = 0;
|
||||||
|
note = 0;
|
||||||
saved = NULL;
|
saved = NULL;
|
||||||
ndb_text_search_results_init(results);
|
ndb_text_search_results_init(results);
|
||||||
ndb_search_words_init(&search_words);
|
ndb_search_words_init(&search_words);
|
||||||
|
|
||||||
// search config
|
until = UINT64_MAX;
|
||||||
|
since = 0;
|
||||||
limit = MAX_TEXT_SEARCH_RESULTS;
|
limit = MAX_TEXT_SEARCH_RESULTS;
|
||||||
|
|
||||||
|
// until, since from filter
|
||||||
|
if (filter != NULL) {
|
||||||
|
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_UNTIL)))
|
||||||
|
until = *pint;
|
||||||
|
|
||||||
|
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_SINCE)))
|
||||||
|
since = *pint;
|
||||||
|
|
||||||
|
if ((pint = ndb_filter_get_int(filter, NDB_FILTER_LIMIT)))
|
||||||
|
limit = *pint;
|
||||||
|
}
|
||||||
|
|
||||||
order_op = MDB_PREV;
|
order_op = MDB_PREV;
|
||||||
key_order_fn = ndb_make_text_search_key_high;
|
key_order_fn = ndb_make_text_search_key_high;
|
||||||
|
timestamp_op = until;
|
||||||
if (config) {
|
if (config) {
|
||||||
if (config->order == NDB_ORDER_ASCENDING) {
|
if (config->order == NDB_ORDER_ASCENDING) {
|
||||||
order_op = MDB_NEXT;
|
order_op = MDB_NEXT;
|
||||||
|
// set the min timestamp value to since when ascending
|
||||||
|
timestamp_op = since;
|
||||||
key_order_fn = ndb_make_text_search_key_low;
|
key_order_fn = ndb_make_text_search_key_low;
|
||||||
}
|
}
|
||||||
limit = min(limit, config->limit);
|
limit = min(limit, config->limit);
|
||||||
@@ -4067,9 +4134,11 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: sort words from largest to smallest. This should complete the
|
// This should complete the query quicker because the larger words are
|
||||||
// query quicker because the larger words are likely to have fewer
|
// likely to have fewer entries in the search index. This is not always
|
||||||
// entries in the search index.
|
// true. Words with higher frequency (like bitcoin on nostr in 2024)
|
||||||
|
// may be slower. TODO: Skip word recursion by leveraging a minimal
|
||||||
|
// perfect hashmap of parsed words on a note
|
||||||
sort_largest_to_smallest(&search_words);
|
sort_largest_to_smallest(&search_words);
|
||||||
|
|
||||||
// for each word, we recursively find all of the submatches
|
// for each word, we recursively find all of the submatches
|
||||||
@@ -4099,7 +4168,9 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
// match
|
// match
|
||||||
if (!key_order_fn(buffer, sizeof(buffer),
|
if (!key_order_fn(buffer, sizeof(buffer),
|
||||||
search_words.words[0].word_len,
|
search_words.words[0].word_len,
|
||||||
search_words.words[0].word, &keysize))
|
search_words.words[0].word,
|
||||||
|
timestamp_op,
|
||||||
|
&keysize))
|
||||||
{
|
{
|
||||||
// word is too big to fit in 1024-sized key
|
// word is too big to fit in 1024-sized key
|
||||||
continue;
|
continue;
|
||||||
@@ -4172,10 +4243,12 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
|
|
||||||
// save the first key match, since we will continue from
|
// save the first key match, since we will continue from
|
||||||
// this on the next root word result
|
// this on the next root word result
|
||||||
if (j == 0 && !saved) {
|
if (j == 0) {
|
||||||
memcpy(saved_buf, k.mv_data, k.mv_size);
|
if (!saved) {
|
||||||
saved = saved_buf;
|
memcpy(saved_buf, k.mv_data, k.mv_size);
|
||||||
saved_size = k.mv_size;
|
saved = saved_buf;
|
||||||
|
saved_size = k.mv_size;
|
||||||
|
}
|
||||||
|
|
||||||
// since we will be trying to match the same
|
// since we will be trying to match the same
|
||||||
// note_id on all subsequent word matches,
|
// note_id on all subsequent word matches,
|
||||||
@@ -4185,15 +4258,20 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
|
|||||||
// remaining word queries
|
// remaining word queries
|
||||||
if (filter) {
|
if (filter) {
|
||||||
if ((note = ndb_get_note_by_key(txn,
|
if ((note = ndb_get_note_by_key(txn,
|
||||||
result->key.note_id, NULL)))
|
result->key.note_id,
|
||||||
|
¬e_size)))
|
||||||
{
|
{
|
||||||
if (!ndb_filter_matches(filter, note)) {
|
if (!ndb_filter_matches(filter, note)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
result->note = note;
|
||||||
|
result->note_size = note_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
result->note = note;
|
||||||
|
result->note_size = note_size;
|
||||||
last_candidate = *result;
|
last_candidate = *result;
|
||||||
last_result = &last_candidate;
|
last_result = &last_candidate;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -314,6 +314,10 @@ struct ndb_text_search_key
|
|||||||
struct ndb_text_search_result {
|
struct ndb_text_search_result {
|
||||||
struct ndb_text_search_key key;
|
struct ndb_text_search_key key;
|
||||||
int prefix_chars;
|
int prefix_chars;
|
||||||
|
|
||||||
|
// This is only set if we passed a filter for nip50 searches
|
||||||
|
struct ndb_note *note;
|
||||||
|
uint64_t note_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ndb_text_search_results {
|
struct ndb_text_search_results {
|
||||||
|
|||||||
Reference in New Issue
Block a user