ndb: switch to nostrdb notes

This is a refactor of the codebase to use a more memory-efficient representation of notes. It should also be much faster at decoding since we're using a custom C json parser now.

Changelog-Changed: Improved memory usage and performance when processing events
2023-07-26 08:46:44 -07:00
parent 55bbe8f855
commit cebd1f48ca
110 changed files with 2153 additions and 1799 deletions
--- a/nostrdb/NdbNote.swift
+++ b/nostrdb/NdbNote.swift
@@ -34,7 +34,7 @@ enum NdbData {
    }
 }

-class NdbNote: Equatable, Hashable {
+class NdbNote: Encodable, Equatable, Hashable {
    // we can have owned notes, but we can also have lmdb virtual-memory mapped notes so its optional
    private let owned: Bool
    let count: Int
@@ -52,6 +52,14 @@ class NdbNote: Equatable, Hashable {
        self.note = note
        self.owned = owned_size != nil
        self.count = owned_size ?? 0
+
+        if let owned_size {
+            NdbNote.total_ndb_size += Int(owned_size)
+            NdbNote.notes_created += 1
+
+            print("\(NdbNote.notes_created) ndb_notes, \(NdbNote.total_ndb_size) bytes")
+        }
+
    }

    var content: String {
@@ -67,13 +75,17 @@ class NdbNote: Equatable, Hashable {
    }

    /// NDBTODO: make this into data
-    var id: String {
-        hex_encode(Data(buffer: UnsafeBufferPointer(start: ndb_note_id(note), count: 32)))
+    var id: NoteId {
+        .init(Data(bytes: ndb_note_id(note), count: 32))
+    }
+
+    var sig: Signature {
+        .init(Data(bytes: ndb_note_sig(note), count: 64))
    }
    
    /// NDBTODO: make this into data
-    var pubkey: String {
-        hex_encode(Data(buffer: UnsafeBufferPointer(start: ndb_note_pubkey(note), count: 32)))
+    var pubkey: Pubkey {
+        .init(Data(bytes: ndb_note_pubkey(note), count: 32))
    }
    
    var created_at: UInt32 {
@@ -90,6 +102,10 @@ class NdbNote: Equatable, Hashable {

    deinit {
        if self.owned {
+            NdbNote.total_ndb_size -= Int(count)
+            NdbNote.notes_created -= 1
+
+            print("\(NdbNote.notes_created) ndb_notes, \(NdbNote.total_ndb_size) bytes")
            free(note)
        }
    }
@@ -102,58 +118,100 @@ class NdbNote: Equatable, Hashable {
        hasher.combine(id)
    }

-    static let max_note_size: Int = 2 << 18
+    private enum CodingKeys: String, CodingKey {
+        case id, sig, tags, pubkey, created_at, kind, content
+    }
+
+    // Implement the `Encodable` protocol
+    func encode(to encoder: Encoder) throws {
+        var container = encoder.container(keyedBy: CodingKeys.self)
+
+        try container.encode(hex_encode(id.id), forKey: .id)
+        try container.encode(hex_encode(sig.data), forKey: .sig)
+        try container.encode(pubkey, forKey: .pubkey)
+        try container.encode(created_at, forKey: .created_at)
+        try container.encode(kind, forKey: .kind)
+        try container.encode(content, forKey: .content)
+        try container.encode(tags, forKey: .tags)
+    }
+
+    static var total_ndb_size: Int = 0
+    static var notes_created: Int = 0

    init?(content: String, keypair: Keypair, kind: UInt32 = 1, tags: [[String]] = [], createdAt: UInt32 = UInt32(Date().timeIntervalSince1970)) {

        var builder = ndb_builder()
        let buflen = MAX_NOTE_SIZE
        let buf = malloc(buflen)
-        let idbuf = malloc(buflen)

        ndb_builder_init(&builder, buf, Int32(buflen))

-        guard var pk_raw = hex_decode(keypair.pubkey) else { return nil }
+        var pk_raw = keypair.pubkey.bytes

        ndb_builder_set_pubkey(&builder, &pk_raw)
        ndb_builder_set_kind(&builder, UInt32(kind))
        ndb_builder_set_created_at(&builder, createdAt)

+        var ok = true
        for tag in tags {
            ndb_builder_new_tag(&builder);
            for elem in tag {
-                _ = elem.withCString { eptr in
-                    ndb_builder_push_tag_str(&builder, eptr, Int32(elem.utf8.count))
+                ok = elem.withCString({ eptr in
+                    return ndb_builder_push_tag_str(&builder, eptr, Int32(elem.utf8.count)) > 0
+                })
+                if !ok {
+                    return nil
                }
            }
        }

-        _ = content.withCString { cptr in
-            ndb_builder_set_content(&builder, content, Int32(content.utf8.count));
+        ok = content.withCString { cptr in
+            return ndb_builder_set_content(&builder, cptr, Int32(content.utf8.count)) > 0
+        }
+        if !ok {
+            return nil
        }

        var n = UnsafeMutablePointer<ndb_note>?(nil)

-        let keypair = keypair.privkey.map { sec in
+
+        var the_kp: ndb_keypair? = nil
+
+        if let sec = keypair.privkey {
            var kp = ndb_keypair()
-            return sec.withCString { secptr in
-                ndb_decode_key(secptr, &kp)
-                return kp
+            memcpy(&kp.secret.0, sec.id.bytes, 32);
+
+            if ndb_create_keypair(&kp) <= 0 {
+                print("bad keypair")
+            } else {
+                the_kp = kp
            }
        }

        var len: Int32 = 0
-        if var keypair {
-            len = ndb_builder_finalize(&builder, &n, &keypair)
+        if var the_kp {
+            len = ndb_builder_finalize(&builder, &n, &the_kp)
        } else {
            len = ndb_builder_finalize(&builder, &n, nil)
        }

-        free(idbuf)
+        if len <= 0 {
+            free(buf)
+            return nil
+        }
+
+        //guard let n else { return nil }

        self.owned = true
        self.count = Int(len)
-        self.note = realloc(n, Int(len)).assumingMemoryBound(to: ndb_note.self)
+        //self.note = n
+        let r = realloc(buf, Int(len))
+        guard let r else {
+            free(buf)
+            return nil
+        }
+
+        self.note = r.assumingMemoryBound(to: ndb_note.self)
    }

    static func owned_from_json(json: String, bufsize: Int = 2 << 18) -> NdbNote? {
@@ -204,13 +262,8 @@ extension NdbNote {
        return !too_big
    }

-    
-    //var is_valid_id: Bool {
-     //   return calculate_event_id(ev: self) == self.id
-    //}
-
-    func get_blocks(content: String) -> Blocks {
-        return parse_note_content_ndb(note: self)
+    func get_blocks(privkey: Privkey?) -> Blocks {
+        return parse_note_content(content: .init(note: self, privkey: privkey))
    }

    func get_inner_event(cache: EventCache) -> NostrEvent? {
@@ -218,28 +271,41 @@ extension NdbNote {
            return nil
        }

-        if self.content == "", let ref = self.referenced_ids.first {
+        if self.content_len == 0, let id = self.referenced_ids.first {
            // TODO: raw id cache lookups
-            let id = ref.id.string()
            return cache.lookup(id)
        }

-        // TODO: how to handle inner events?
-        return nil
-        //return self.inner_event
+        return self.inner_event
    }

    // TODO: References iterator
-    public var referenced_ids: LazyFilterSequence<References> {
-        References.ids(tags: self.tags)
+    public var referenced_ids: References<NoteId> {
+        References<NoteId>(tags: self.tags)
    }

-    public var referenced_pubkeys: LazyFilterSequence<References> {
-        References.pubkeys(tags: self.tags)
+    public var referenced_noterefs: References<NoteRef> {
+        References<NoteRef>(tags: self.tags)
    }

-    public var referenced_hashtags: LazyFilterSequence<References> {
-        References.hashtags(tags: self.tags)
+    public var referenced_follows: References<FollowRef> {
+        References<FollowRef>(tags: self.tags)
+    }
+
+    public var referenced_pubkeys: References<Pubkey> {
+        References<Pubkey>(tags: self.tags)
+    }
+
+    public var referenced_hashtags: References<Hashtag> {
+        References<Hashtag>(tags: self.tags)
+    }
+
+    public var referenced_params: References<ReplaceableParam> {
+        References<ReplaceableParam>(tags: self.tags)
+    }
+
+    public var references: References<RefId> {
+        References<RefId>(tags: self.tags)
    }

    func event_refs(_ privkey: Privkey?) -> [EventRef] {
@@ -262,7 +328,7 @@ extension NdbNote {
    func blocks(_ privkey: Privkey?) -> Blocks {
        if let bs = _blocks { return bs }

-        let blocks = get_blocks(content: self.get_content(privkey))
+        let blocks = get_blocks(privkey: privkey)
        self._blocks = blocks
        return blocks
    }
@@ -273,11 +339,8 @@ extension NdbNote {
            return decrypted_content
        }

-        guard let key = privkey else {
-            return nil
-        }
-
-        guard let our_pubkey = privkey_to_pubkey(privkey: key) else {
+        guard let privkey,
+              let our_pubkey = privkey_to_pubkey(privkey: privkey) else {
            return nil
        }

@@ -285,37 +348,21 @@ extension NdbNote {
        var pubkey = self.pubkey
        // This is our DM, we need to use the pubkey of the person we're talking to instead

-        if our_pubkey == pubkey {
-            guard let refkey = self.referenced_pubkeys.first else {
-                return nil
-            }
-
-            pubkey = refkey.ref_id.string()
+        if our_pubkey == pubkey, let pk = self.referenced_pubkeys.first {
+            pubkey = pk
        }

        // NDBTODO: pass data to pubkey
-        let dec = decrypt_dm(key, pubkey: pubkey, content: self.content, encoding: .base64)
+        let dec = decrypt_dm(privkey, pubkey: pubkey, content: self.content, encoding: .base64)
        self.decrypted_content = dec

        return dec
    }

-    /*
-
-    var description: String {
-        return "NostrEvent { id: \(id) pubkey \(pubkey) kind \(kind) tags \(tags) content '\(content)' }"
-    }
-
-    // Not sure I should implement this
-    private func get_referenced_ids(key: String) -> [ReferencedId] {
-        return damus.get_referenced_ids(tags: self.tags, key: key)
-    }
-     */
-
-    public func direct_replies(_ privkey: Privkey?) -> [ReferencedId] {
+    public func direct_replies(_ privkey: Privkey?) -> [NoteId] {
        return event_refs(privkey).reduce(into: []) { acc, evref in
            if let direct_reply = evref.is_direct_reply {
-                acc.append(direct_reply)
+                acc.append(direct_reply.note_id)
            }
        }
    }
@@ -324,83 +371,62 @@ extension NdbNote {
    public func thread_id(privkey: Privkey?) -> NoteId {
        for ref in event_refs(privkey) {
            if let thread_id = ref.is_thread_id {
-                return thread_id.ref_id
+                return thread_id.note_id
            }
        }

        return self.id
    }

-    public func last_refid() -> ReferencedId? {
-        return self.referenced_ids.last?.to_referenced_id()
+    public func last_refid() -> NoteId? {
+        return self.referenced_ids.last
    }

    // NDBTODO: id -> data
+    /*
    public func references(id: String, key: AsciiCharacter) -> Bool {
+        var matcher: (Reference) -> Bool = { ref in ref.ref_id.matches_str(id) }
+        if id.count == 64, let decoded = hex_decode(id) {
+            matcher = { ref in ref.ref_id.matches_id(decoded) }
+        }
        for ref in References(tags: self.tags) {
-            if ref.key == key && ref.id.string() == id {
+            if ref.key == key && matcher(ref) {
                return true
            }
        }

        return false
    }
+     */

    func is_reply(_ privkey: Privkey?) -> Bool {
        return event_is_reply(self.event_refs(privkey))
    }

-    func note_language(_ privkey: Privkey?) async -> String? {
-        let t = Task.detached {
-            // Rely on Apple's NLLanguageRecognizer to tell us which language it thinks the note is in
-            // and filter on only the text portions of the content as URLs and hashtags confuse the language recognizer.
-            let originalBlocks = self.blocks(privkey).blocks
-            let originalOnlyText = originalBlocks.compactMap { $0.is_text }.joined(separator: " ")
+    func note_language(_ privkey: Privkey?) -> String? {
+        assert(!Thread.isMainThread, "This function must not be run on the main thread.")

-            // Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
-            let languageRecognizer = NLLanguageRecognizer()
-            languageRecognizer.processString(originalOnlyText)
+        // Rely on Apple's NLLanguageRecognizer to tell us which language it thinks the note is in
+        // and filter on only the text portions of the content as URLs and hashtags confuse the language recognizer.
+        let originalBlocks = self.blocks(privkey).blocks
+        let originalOnlyText = originalBlocks.compactMap { $0.is_text }.joined(separator: " ")

-            guard let locale = languageRecognizer.languageHypotheses(withMaximum: 1).first(where: { $0.value >= 0.5 })?.key.rawValue else {
-                let nstr: String? = nil
-                return nstr
-            }
+        // Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
+        let languageRecognizer = NLLanguageRecognizer()
+        languageRecognizer.processString(originalOnlyText)

-            // Remove the variant component and just take the language part as translation services typically only supports the variant-less language.
-            // Moreover, speakers of one variant can generally understand other variants.
-            return localeToLanguage(locale)
+        guard let locale = languageRecognizer.languageHypotheses(withMaximum: 1).first(where: { $0.value >= 0.5 })?.key.rawValue else {
+            let nstr: String? = nil
+            return nstr
        }

-        return await t.value
-    }
-
-    /*
-
-    func calculate_id() {
-        self.id = calculate_event_id(ev: self)
-    }
-
-    func sign(privkey: String) {
-        self.sig = sign_event(privkey: privkey, ev: self)
+        // Remove the variant component and just take the language part as translation services typically only supports the variant-less language.
+        // Moreover, speakers of one variant can generally understand other variants.
+        return localeToLanguage(locale)
    }

    var age: TimeInterval {
        let event_date = Date(timeIntervalSince1970: TimeInterval(created_at))
        return Date.now.timeIntervalSince(event_date)
    }
-     */
-}
-
-extension LazyFilterSequence {
-    var first: Element? {
-        self.first(where: { _ in true })
-    }
-
-    var last: Element? {
-        var ev: Element? = nil
-        for e in self {
-            ev = e
-        }
-        return ev
-    }
 }
--- a/nostrdb/NdbTagElem.swift
+++ b/nostrdb/NdbTagElem.swift
@@ -31,8 +31,7 @@ struct NdbStrIter: IteratorProtocol {
    }
 }

-struct NdbTagElem: Sequence, Hashable {
-
+struct NdbTagElem: Sequence, Hashable, Equatable {
    let note: NdbNote
    let tag: UnsafeMutablePointer<ndb_tag>
    let index: Int32
@@ -71,6 +70,13 @@ struct NdbTagElem: Sequence, Hashable {
        return str.flag == NDB_PACKED_ID
    }

+    var isEmpty: Bool {
+        if str.flag == NDB_PACKED_ID {
+            return false
+        }
+        return str.str[0] == 0
+    }
+
    var count: Int {
        if str.flag == NDB_PACKED_ID {
            return 32
@@ -79,11 +85,24 @@ struct NdbTagElem: Sequence, Hashable {
        }
    }

+    var single_char: AsciiCharacter? {
+        let c = str.str[0]
+        guard c != 0 && str.str[1] == 0 else { return nil }
+        return AsciiCharacter(c)
+    }
+
    func matches_char(_ c: AsciiCharacter) -> Bool {
        return str.str[0] == c.cchar && str.str[1] == 0
    }

-    func matches_str(_ s: String) -> Bool {
+    func matches_id(_ d: Data) -> Bool {
+        if str.flag == NDB_PACKED_ID, d.count == 32 {
+            return memcmp(d.bytes, str.id, 32) == 0
+        }
+        return false
+    }
+
+    func matches_str(_ s: String, tag_len: Int? = nil) -> Bool {
        if str.flag == NDB_PACKED_ID,
           s.utf8.count == 64,
           var decoded = hex_decode(s), decoded.count == 32
@@ -91,18 +110,19 @@ struct NdbTagElem: Sequence, Hashable {
            return memcmp(&decoded, str.id, 32) == 0
        }

-        let len = strlen(str.str)
-        guard len == s.utf8.count else { return false }
-        return s.withCString { cstr in memcmp(str.str, cstr, len) == 0 }
-    }
+        // Ensure the Swift string's utf8 count matches the C string's length.
+        guard (tag_len ?? strlen(str.str)) == s.utf8.count else {
+            return false
+        }

-    var ndbstr: ndb_str {
-        return ndb_tag_str(note.note, tag, index)
+        // Compare directly using the utf8 view.
+        return s.utf8.withContiguousStorageIfAvailable { buffer in
+            memcmp(buffer.baseAddress, str.str, buffer.count) == 0
+        } ?? false
    }

    func data() -> NdbData {
-        let s = ndb_tag_str(note.note, tag, index)
-        return NdbData(note: note, str: s)
+        return NdbData(note: note, str: self.str)
    }

    func id() -> Data? {
--- a/nostrdb/NdbTagsIterator.swift
+++ b/nostrdb/NdbTagsIterator.swift
@@ -35,7 +35,7 @@ struct TagsIterator: IteratorProtocol {
    }
 }

-struct TagsSequence: Sequence {
+struct TagsSequence: Encodable, Sequence {
    let note: NdbNote

    var count: UInt16 {
@@ -48,10 +48,19 @@ struct TagsSequence: Sequence {
        }
    }

+    func encode(to encoder: Encoder) throws {
+        var container = encoder.unkeyedContainer()
+        
+        // Iterate and create the [[String]] for encoding
+        for tag in self {
+            try container.encode(tag.map { $0.string() })
+        }
+    }
+
    // no O(1) indexing on top-level tag lists unfortunately :(
    // bit it's very fast to iterate over each tag since the number of tags
    // are stored and the elements are fixed size.
-    subscript(index: Int) -> Iterator.Element? {
+    subscript(index: Int) -> Iterator.Element {
        var i = 0
        for element in self {
            if i == index {
@@ -59,11 +68,9 @@ struct TagsSequence: Sequence {
            }
            i += 1
        }
-        return nil
-    }
-
-    func references() -> References {
-        return References(tags: self)
+        precondition(false, "sequence subscript oob")
+        // it seems like the compiler needs this or it gets bitchy
+        return .init(note: .init(note: .allocate(capacity: 1), owned_size: nil), tag: .allocate(capacity: 1))
    }

    func makeIterator() -> TagsIterator {
--- a/nostrdb/Test/NdbTests.swift
+++ b/nostrdb/Test/NdbTests.swift
@@ -18,13 +18,27 @@ final class NdbTests: XCTestCase {
        // Put teardown code here. This method is called after the invocation of each test method in the class.
    }

+    func test_decode_eose() throws {
+        let json = "[\"EOSE\",\"DC268DBD-55DA-458A-B967-540925AF3497\"]"
+        let resp = decode_nostr_event(txt: json)
+        XCTAssertNotNil(resp)
+    }
+
+    func test_decode_command_result() throws {
+        let json = "[\"OK\",\"b1d8f68d39c07ce5c5ea10c235100d529b2ed2250140b36a35d940b712dc6eff\",true,\"\"]"
+        let resp = decode_nostr_event(txt: json)
+        XCTAssertNotNil(resp)
+
+    }
+
    func test_ndb_note() throws {
        let note = NdbNote.owned_from_json(json: test_contact_list_json)
        XCTAssertNotNil(note)
        guard let note else { return }

-        let id = "20d0ff27d6fcb13de8366328c5b1a7af26bcac07f2e558fbebd5e9242e608c09"
-        let pubkey = "32e1827635450ebb3c5a7d12c1f8e7b2b514439ac10a67eef3d9fd9c5c68e245"
+        let id = NoteId(hex: "20d0ff27d6fcb13de8366328c5b1a7af26bcac07f2e558fbebd5e9242e608c09")!
+        let pubkey = Pubkey(hex: "32e1827635450ebb3c5a7d12c1f8e7b2b514439ac10a67eef3d9fd9c5c68e245")!
+
        XCTAssertEqual(note.id, id)
        XCTAssertEqual(note.pubkey, pubkey)

--- a/nostrdb/nostrdb.c
+++ b/nostrdb/nostrdb.c
@@ -521,6 +521,8 @@ static int ndb_builder_make_json_str(struct ndb_builder *builder,
 {
 	// let's not care about de-duping these. we should just unescape
 	// in-place directly into the strings table. 
+	if (written)
+		*written = len;

 	const char *p, *end, *start;
 	unsigned char *builder_start;