ndb: switch to nostrdb notes

This is a refactor of the codebase to use a more memory-efficient
representation of notes. It should also be much faster at decoding since
we're using a custom C json parser now.

Changelog-Changed: Improved memory usage and performance when processing events
This commit is contained in:
William Casarin
2023-07-26 08:46:44 -07:00
parent 55bbe8f855
commit cebd1f48ca
110 changed files with 2153 additions and 1799 deletions

View File

@@ -34,7 +34,7 @@ enum NdbData {
}
}
class NdbNote: Equatable, Hashable {
class NdbNote: Encodable, Equatable, Hashable {
// we can have owned notes, but we can also have lmdb virtual-memory mapped notes, so it's optional
private let owned: Bool
let count: Int
@@ -52,6 +52,14 @@ class NdbNote: Equatable, Hashable {
self.note = note
self.owned = owned_size != nil
self.count = owned_size ?? 0
if let owned_size {
NdbNote.total_ndb_size += Int(owned_size)
NdbNote.notes_created += 1
print("\(NdbNote.notes_created) ndb_notes, \(NdbNote.total_ndb_size) bytes")
}
}
var content: String {
@@ -67,13 +75,17 @@ class NdbNote: Equatable, Hashable {
}
/// NDBTODO: make this into data
var id: String {
hex_encode(Data(buffer: UnsafeBufferPointer(start: ndb_note_id(note), count: 32)))
var id: NoteId {
    // nostrdb exposes the id as 32 raw bytes; wrap them in the typed NoteId.
    let raw = Data(bytes: ndb_note_id(note), count: 32)
    return NoteId(raw)
}
var sig: Signature {
    // The signature is read as 64 raw bytes from the underlying note.
    let raw = Data(bytes: ndb_note_sig(note), count: 64)
    return Signature(raw)
}
/// NDBTODO: make this into data
var pubkey: String {
hex_encode(Data(buffer: UnsafeBufferPointer(start: ndb_note_pubkey(note), count: 32)))
var pubkey: Pubkey {
    // The author's public key is read as 32 raw bytes from the underlying note.
    let raw = Data(bytes: ndb_note_pubkey(note), count: 32)
    return Pubkey(raw)
}
var created_at: UInt32 {
@@ -90,6 +102,10 @@ class NdbNote: Equatable, Hashable {
deinit {
    // Notes that are not owned (e.g. memory-mapped ones) must not be freed here.
    guard owned else { return }

    // Undo the debug accounting for this note before releasing its buffer.
    NdbNote.total_ndb_size -= count
    NdbNote.notes_created -= 1
    print("\(NdbNote.notes_created) ndb_notes, \(NdbNote.total_ndb_size) bytes")

    free(note)
}
@@ -102,58 +118,100 @@ class NdbNote: Equatable, Hashable {
hasher.combine(id)
}
static let max_note_size: Int = 2 << 18
/// Keys written by `encode(to:)`; the raw values are the case names themselves.
private enum CodingKeys: String, CodingKey {
    case id
    case sig
    case tags
    case pubkey
    case created_at
    case kind
    case content
}
/// `Encodable` conformance: writes the note's fields into a keyed container.
/// The id and signature are written as hex strings; the remaining fields
/// are handed to the encoder as-is.
func encode(to encoder: Encoder) throws {
    var fields = encoder.container(keyedBy: CodingKeys.self)

    let idHex = hex_encode(self.id.id)
    try fields.encode(idHex, forKey: .id)

    let sigHex = hex_encode(self.sig.data)
    try fields.encode(sigHex, forKey: .sig)

    try fields.encode(self.pubkey, forKey: .pubkey)
    try fields.encode(self.created_at, forKey: .created_at)
    try fields.encode(self.kind, forKey: .kind)
    try fields.encode(self.content, forKey: .content)
    try fields.encode(self.tags, forKey: .tags)
}
/// Debug counter: total bytes held by owned notes. Incremented by the
/// initializer that receives an `owned_size` and decremented in `deinit`.
/// NOTE(review): the builder-based initializer marks notes as owned but does
/// not appear to bump these counters, so they may drift — confirm.
/// NOTE(review): plain static with no visible synchronization — confirm
/// which threads create and destroy notes.
static var total_ndb_size: Int = 0
/// Debug counter: number of owned notes, adjusted alongside `total_ndb_size`.
static var notes_created: Int = 0
init?(content: String, keypair: Keypair, kind: UInt32 = 1, tags: [[String]] = [], createdAt: UInt32 = UInt32(Date().timeIntervalSince1970)) {
var builder = ndb_builder()
let buflen = MAX_NOTE_SIZE
let buf = malloc(buflen)
let idbuf = malloc(buflen)
ndb_builder_init(&builder, buf, Int32(buflen))
guard var pk_raw = hex_decode(keypair.pubkey) else { return nil }
var pk_raw = keypair.pubkey.bytes
ndb_builder_set_pubkey(&builder, &pk_raw)
ndb_builder_set_kind(&builder, UInt32(kind))
ndb_builder_set_created_at(&builder, createdAt)
var ok = true
for tag in tags {
ndb_builder_new_tag(&builder);
for elem in tag {
_ = elem.withCString { eptr in
ndb_builder_push_tag_str(&builder, eptr, Int32(elem.utf8.count))
ok = elem.withCString({ eptr in
return ndb_builder_push_tag_str(&builder, eptr, Int32(elem.utf8.count)) > 0
})
if !ok {
return nil
}
}
}
_ = content.withCString { cptr in
ndb_builder_set_content(&builder, content, Int32(content.utf8.count));
ok = content.withCString { cptr in
return ndb_builder_set_content(&builder, cptr, Int32(content.utf8.count)) > 0
}
if !ok {
return nil
}
var n = UnsafeMutablePointer<ndb_note>?(nil)
let keypair = keypair.privkey.map { sec in
var the_kp: ndb_keypair? = nil
if let sec = keypair.privkey {
var kp = ndb_keypair()
return sec.withCString { secptr in
ndb_decode_key(secptr, &kp)
return kp
memcpy(&kp.secret.0, sec.id.bytes, 32);
if ndb_create_keypair(&kp) <= 0 {
print("bad keypair")
} else {
the_kp = kp
}
}
var len: Int32 = 0
if var keypair {
len = ndb_builder_finalize(&builder, &n, &keypair)
if var the_kp {
len = ndb_builder_finalize(&builder, &n, &the_kp)
} else {
len = ndb_builder_finalize(&builder, &n, nil)
}
free(idbuf)
if len <= 0 {
free(buf)
return nil
}
//guard let n else { return nil }
self.owned = true
self.count = Int(len)
self.note = realloc(n, Int(len)).assumingMemoryBound(to: ndb_note.self)
//self.note = n
let r = realloc(buf, Int(len))
guard let r else {
free(buf)
return nil
}
self.note = r.assumingMemoryBound(to: ndb_note.self)
}
static func owned_from_json(json: String, bufsize: Int = 2 << 18) -> NdbNote? {
@@ -204,13 +262,8 @@ extension NdbNote {
return !too_big
}
//var is_valid_id: Bool {
// return calculate_event_id(ev: self) == self.id
//}
func get_blocks(content: String) -> Blocks {
return parse_note_content_ndb(note: self)
func get_blocks(privkey: Privkey?) -> Blocks {
return parse_note_content(content: .init(note: self, privkey: privkey))
}
func get_inner_event(cache: EventCache) -> NostrEvent? {
@@ -218,28 +271,41 @@ extension NdbNote {
return nil
}
if self.content == "", let ref = self.referenced_ids.first {
if self.content_len == 0, let id = self.referenced_ids.first {
// TODO: raw id cache lookups
let id = ref.id.string()
return cache.lookup(id)
}
// TODO: how to handle inner events?
return nil
//return self.inner_event
return self.inner_event
}
// TODO: References iterator
public var referenced_ids: LazyFilterSequence<References> {
References.ids(tags: self.tags)
/// A `References<NoteId>` built from this note's tags.
public var referenced_ids: References<NoteId> {
    return .init(tags: tags)
}
public var referenced_pubkeys: LazyFilterSequence<References> {
References.pubkeys(tags: self.tags)
/// A `References<NoteRef>` built from this note's tags.
public var referenced_noterefs: References<NoteRef> {
    return .init(tags: tags)
}
public var referenced_hashtags: LazyFilterSequence<References> {
References.hashtags(tags: self.tags)
/// A `References<FollowRef>` built from this note's tags.
public var referenced_follows: References<FollowRef> {
    return .init(tags: tags)
}
/// A `References<Pubkey>` built from this note's tags.
public var referenced_pubkeys: References<Pubkey> {
    return .init(tags: tags)
}
/// A `References<Hashtag>` built from this note's tags.
public var referenced_hashtags: References<Hashtag> {
    return .init(tags: tags)
}
/// A `References<ReplaceableParam>` built from this note's tags.
public var referenced_params: References<ReplaceableParam> {
    return .init(tags: tags)
}
/// A `References<RefId>` built from this note's tags.
public var references: References<RefId> {
    return .init(tags: tags)
}
func event_refs(_ privkey: Privkey?) -> [EventRef] {
@@ -262,7 +328,7 @@ extension NdbNote {
func blocks(_ privkey: Privkey?) -> Blocks {
if let bs = _blocks { return bs }
let blocks = get_blocks(content: self.get_content(privkey))
let blocks = get_blocks(privkey: privkey)
self._blocks = blocks
return blocks
}
@@ -273,11 +339,8 @@ extension NdbNote {
return decrypted_content
}
guard let key = privkey else {
return nil
}
guard let our_pubkey = privkey_to_pubkey(privkey: key) else {
guard let privkey,
let our_pubkey = privkey_to_pubkey(privkey: privkey) else {
return nil
}
@@ -285,37 +348,21 @@ extension NdbNote {
var pubkey = self.pubkey
// This is our DM, we need to use the pubkey of the person we're talking to instead
if our_pubkey == pubkey {
guard let refkey = self.referenced_pubkeys.first else {
return nil
}
pubkey = refkey.ref_id.string()
if our_pubkey == pubkey, let pk = self.referenced_pubkeys.first {
pubkey = pk
}
// NDBTODO: pass data to pubkey
let dec = decrypt_dm(key, pubkey: pubkey, content: self.content, encoding: .base64)
let dec = decrypt_dm(privkey, pubkey: pubkey, content: self.content, encoding: .base64)
self.decrypted_content = dec
return dec
}
/*
var description: String {
return "NostrEvent { id: \(id) pubkey \(pubkey) kind \(kind) tags \(tags) content '\(content)' }"
}
// Not sure I should implement this
private func get_referenced_ids(key: String) -> [ReferencedId] {
return damus.get_referenced_ids(tags: self.tags, key: key)
}
*/
public func direct_replies(_ privkey: Privkey?) -> [ReferencedId] {
public func direct_replies(_ privkey: Privkey?) -> [NoteId] {
return event_refs(privkey).reduce(into: []) { acc, evref in
if let direct_reply = evref.is_direct_reply {
acc.append(direct_reply)
acc.append(direct_reply.note_id)
}
}
}
@@ -324,83 +371,62 @@ extension NdbNote {
public func thread_id(privkey: Privkey?) -> NoteId {
for ref in event_refs(privkey) {
if let thread_id = ref.is_thread_id {
return thread_id.ref_id
return thread_id.note_id
}
}
return self.id
}
public func last_refid() -> ReferencedId? {
return self.referenced_ids.last?.to_referenced_id()
/// Returns the last entry of `referenced_ids`, or `nil` when there is none.
public func last_refid() -> NoteId? {
    let ids = referenced_ids
    return ids.last
}
// NDBTODO: id -> data
/*
public func references(id: String, key: AsciiCharacter) -> Bool {
var matcher: (Reference) -> Bool = { ref in ref.ref_id.matches_str(id) }
if id.count == 64, let decoded = hex_decode(id) {
matcher = { ref in ref.ref_id.matches_id(decoded) }
}
for ref in References(tags: self.tags) {
if ref.key == key && ref.id.string() == id {
if ref.key == key && matcher(ref) {
return true
}
}
return false
}
*/
/// Reports whether this note's event refs (computed with `privkey`) mark it as a reply.
func is_reply(_ privkey: Privkey?) -> Bool {
    let refs = event_refs(privkey)
    return event_is_reply(refs)
}
func note_language(_ privkey: Privkey?) async -> String? {
let t = Task.detached {
// Rely on Apple's NLLanguageRecognizer to tell us which language it thinks the note is in
// and filter on only the text portions of the content as URLs and hashtags confuse the language recognizer.
let originalBlocks = self.blocks(privkey).blocks
let originalOnlyText = originalBlocks.compactMap { $0.is_text }.joined(separator: " ")
func note_language(_ privkey: Privkey?) -> String? {
assert(!Thread.isMainThread, "This function must not be run on the main thread.")
// Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
let languageRecognizer = NLLanguageRecognizer()
languageRecognizer.processString(originalOnlyText)
// Rely on Apple's NLLanguageRecognizer to tell us which language it thinks the note is in
// and filter on only the text portions of the content as URLs and hashtags confuse the language recognizer.
let originalBlocks = self.blocks(privkey).blocks
let originalOnlyText = originalBlocks.compactMap { $0.is_text }.joined(separator: " ")
guard let locale = languageRecognizer.languageHypotheses(withMaximum: 1).first(where: { $0.value >= 0.5 })?.key.rawValue else {
let nstr: String? = nil
return nstr
}
// Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
let languageRecognizer = NLLanguageRecognizer()
languageRecognizer.processString(originalOnlyText)
// Remove the variant component and just take the language part, as translation services typically only support the variant-less language.
// Moreover, speakers of one variant can generally understand other variants.
return localeToLanguage(locale)
guard let locale = languageRecognizer.languageHypotheses(withMaximum: 1).first(where: { $0.value >= 0.5 })?.key.rawValue else {
let nstr: String? = nil
return nstr
}
return await t.value
}
/*
func calculate_id() {
self.id = calculate_event_id(ev: self)
}
func sign(privkey: String) {
self.sig = sign_event(privkey: privkey, ev: self)
// Remove the variant component and just take the language part, as translation services typically only support the variant-less language.
// Moreover, speakers of one variant can generally understand other variants.
return localeToLanguage(locale)
}
var age: TimeInterval {
let event_date = Date(timeIntervalSince1970: TimeInterval(created_at))
return Date.now.timeIntervalSince(event_date)
}
*/
}
extension LazyFilterSequence {
    /// The first element that passes the filter, or `nil` if none does.
    var first: Element? {
        // Pulling one element from a fresh iterator stops at the first match.
        var iterator = makeIterator()
        return iterator.next()
    }

    /// The last element that passes the filter, or `nil` if none does.
    /// Walks the whole sequence, keeping only the most recent element.
    var last: Element? {
        return reduce(nil as Element?) { _, element in element }
    }
}

View File

@@ -31,8 +31,7 @@ struct NdbStrIter: IteratorProtocol {
}
}
struct NdbTagElem: Sequence, Hashable {
struct NdbTagElem: Sequence, Hashable, Equatable {
let note: NdbNote
let tag: UnsafeMutablePointer<ndb_tag>
let index: Int32
@@ -71,6 +70,13 @@ struct NdbTagElem: Sequence, Hashable {
return str.flag == NDB_PACKED_ID
}
/// Whether this element holds no data.
var isEmpty: Bool {
    // Packed ids are never considered empty; only plain strings are inspected.
    guard str.flag != NDB_PACKED_ID else { return false }
    // A string whose first byte is the NUL terminator is empty.
    return str.str[0] == 0
}
var count: Int {
if str.flag == NDB_PACKED_ID {
return 32
@@ -79,11 +85,24 @@ struct NdbTagElem: Sequence, Hashable {
}
}
/// The element as a single ASCII character, or `nil` when it is not exactly
/// one character long (or is a packed id).
var single_char: AsciiCharacter? {
    let s = str
    // A packed id is addressed through `id`, not `str`; reading it as a C
    // string could misinterpret raw id bytes as a one-character string.
    // This mirrors the flag check done by `isEmpty`, `count` and `matches_id`.
    guard s.flag != NDB_PACKED_ID else { return nil }

    let c = s.str[0]
    // Exactly one non-NUL byte followed by the terminator.
    guard c != 0 && s.str[1] == 0 else { return nil }
    return AsciiCharacter(c)
}
/// Returns `true` when this element is exactly the one-character string `c`.
/// Packed ids never match.
func matches_char(_ c: AsciiCharacter) -> Bool {
    let s = str
    // A packed id is addressed through `id`, not `str`; comparing its raw
    // bytes as a C string could produce a false match.
    guard s.flag != NDB_PACKED_ID else { return false }
    // First byte equals `c` and the next byte is the NUL terminator.
    return s.str[0] == c.cchar && s.str[1] == 0
}
func matches_str(_ s: String) -> Bool {
/// Returns `true` when this element is a packed id whose 32 bytes equal `d`.
func matches_id(_ d: Data) -> Bool {
    // Only a packed id can match, and only against exactly 32 bytes.
    guard str.flag == NDB_PACKED_ID, d.count == 32 else {
        return false
    }
    return memcmp(d.bytes, str.id, 32) == 0
}
func matches_str(_ s: String, tag_len: Int? = nil) -> Bool {
if str.flag == NDB_PACKED_ID,
s.utf8.count == 64,
var decoded = hex_decode(s), decoded.count == 32
@@ -91,18 +110,19 @@ struct NdbTagElem: Sequence, Hashable {
return memcmp(&decoded, str.id, 32) == 0
}
let len = strlen(str.str)
guard len == s.utf8.count else { return false }
return s.withCString { cstr in memcmp(str.str, cstr, len) == 0 }
}
// Ensure the Swift string's utf8 count matches the C string's length.
guard (tag_len ?? strlen(str.str)) == s.utf8.count else {
return false
}
var ndbstr: ndb_str {
return ndb_tag_str(note.note, tag, index)
// Compare directly using the utf8 view.
return s.utf8.withContiguousStorageIfAvailable { buffer in
memcmp(buffer.baseAddress, str.str, buffer.count) == 0
} ?? false
}
func data() -> NdbData {
let s = ndb_tag_str(note.note, tag, index)
return NdbData(note: note, str: s)
return NdbData(note: note, str: self.str)
}
func id() -> Data? {

View File

@@ -35,7 +35,7 @@ struct TagsIterator: IteratorProtocol {
}
}
struct TagsSequence: Sequence {
struct TagsSequence: Encodable, Sequence {
let note: NdbNote
var count: UInt16 {
@@ -48,10 +48,19 @@ struct TagsSequence: Sequence {
}
}
/// `Encodable` conformance: writes the tags as an unkeyed container in which
/// each tag is encoded as an array of its elements' strings.
func encode(to encoder: Encoder) throws {
    var tagList = encoder.unkeyedContainer()
    try self.forEach { tag in
        // Convert one tag's elements to strings before handing them to the encoder.
        let elements: [String] = tag.map { elem in elem.string() }
        try tagList.encode(elements)
    }
}
// no O(1) indexing on top-level tag lists unfortunately :(
// but it's very fast to iterate over each tag since the number of tags
// is stored and the elements are fixed size.
subscript(index: Int) -> Iterator.Element? {
subscript(index: Int) -> Iterator.Element {
var i = 0
for element in self {
if i == index {
@@ -59,11 +68,9 @@ struct TagsSequence: Sequence {
}
i += 1
}
return nil
}
func references() -> References {
return References(tags: self)
precondition(false, "sequence subscript oob")
// it seems like the compiler needs this or it gets bitchy
return .init(note: .init(note: .allocate(capacity: 1), owned_size: nil), tag: .allocate(capacity: 1))
}
func makeIterator() -> TagsIterator {

View File

@@ -18,13 +18,27 @@ final class NdbTests: XCTestCase {
// Put teardown code here. This method is called after the invocation of each test method in the class.
}
/// An EOSE relay message should decode to a non-nil response.
func test_decode_eose() throws {
    let eoseMessage = "[\"EOSE\",\"DC268DBD-55DA-458A-B967-540925AF3497\"]"
    XCTAssertNotNil(decode_nostr_event(txt: eoseMessage))
}
/// An OK (command result) relay message should decode to a non-nil response.
func test_decode_command_result() throws {
    let okMessage = "[\"OK\",\"b1d8f68d39c07ce5c5ea10c235100d529b2ed2250140b36a35d940b712dc6eff\",true,\"\"]"
    XCTAssertNotNil(decode_nostr_event(txt: okMessage))
}
func test_ndb_note() throws {
let note = NdbNote.owned_from_json(json: test_contact_list_json)
XCTAssertNotNil(note)
guard let note else { return }
let id = "20d0ff27d6fcb13de8366328c5b1a7af26bcac07f2e558fbebd5e9242e608c09"
let pubkey = "32e1827635450ebb3c5a7d12c1f8e7b2b514439ac10a67eef3d9fd9c5c68e245"
let id = NoteId(hex: "20d0ff27d6fcb13de8366328c5b1a7af26bcac07f2e558fbebd5e9242e608c09")!
let pubkey = Pubkey(hex: "32e1827635450ebb3c5a7d12c1f8e7b2b514439ac10a67eef3d9fd9c5c68e245")!
XCTAssertEqual(note.id, id)
XCTAssertEqual(note.pubkey, pubkey)

View File

@@ -521,6 +521,8 @@ static int ndb_builder_make_json_str(struct ndb_builder *builder,
{
// let's not care about de-duping these. we should just unescape
// in-place directly into the strings table.
if (written)
*written = len;
const char *p, *end, *start;
unsigned char *builder_start;