From 0bb65eec3d570e8a0f6bd5c6a72f10641b97c71e Mon Sep 17 00:00:00 2001 From: Terry Yiu <963907+tyiu@users.noreply.github.com> Date: Tue, 11 Jun 2024 00:40:57 -0400 Subject: [PATCH] Add trie insertion options for including non-prefixed, case insensitive, and diacritic insensitive matches --- Sources/SwiftTrie/Trie.swift | 109 +++++++++++++++++------- Tests/SwiftTrieTests/TrieTests.swift | 121 ++++++++++++++++++++++++--- 2 files changed, 188 insertions(+), 42 deletions(-) diff --git a/Sources/SwiftTrie/Trie.swift b/Sources/SwiftTrie/Trie.swift index 2cca4da..ebf4030 100644 --- a/Sources/SwiftTrie/Trie.swift +++ b/Sources/SwiftTrie/Trie.swift @@ -41,6 +41,25 @@ public class Trie { public init() { } } +/// The transformation options that can be applied to the original key when inserting a value into a trie +/// as additional keys that map to the value. +public struct TrieInsertionOptions: OptionSet { + public let rawValue: Int + + public init(rawValue: Int) { + self.rawValue = rawValue + } + + /// Inserts all permutations of non-prefixed substring versions of the original key. + public static let includeNonPrefixedMatches = TrieInsertionOptions(rawValue: 1 << 0) + + /// Inserts the localized lowercase version of the original key. + public static let includeCaseInsensitiveMatches = TrieInsertionOptions(rawValue: 1 << 1) + + /// Inserts the original key with all diacritics removed. + public static let includeDiacriticsInsensitiveMatches = TrieInsertionOptions(rawValue: 1 << 2) +} + public extension Trie { /// Finds the branch that matches the specified key and returns the values from all of its descendant nodes. /// Note: If `key` is an empty string, all values are returned. @@ -75,6 +94,7 @@ public extension Trie { return Array(currentNode.exactMatchValues) + (substringMatches.subtracting(currentNode.exactMatchValues)) } + // swiftlint:disable cyclomatic_complexity /// Inserts a value into this trie for the specified key. 
/// This function stores all substring endings of the key, not only the key itself. /// Runtime performance is O(n^2) and storage cost is O(n), where n is the number of characters in the key. @@ -82,40 +102,69 @@ public extension Trie { /// - Parameters: /// - key: The key to insert that maps to `value`. /// - value: The value that is mapped from `key`. - /// - includeNonPrefixedMatches: Whether the key and value should be inserted to allow for non-prefixed matches. - /// By default, it is `false`. If it is `true`, more memory will be used. - func insert(key: String, value: V, includeNonPrefixedMatches: Bool = false) { - // Create root branches for each character of the key to enable substring searches - // instead of only just prefix searches. - // Hence the nested loop. - for keyIndex in 0.. [String] { + let includeNonPrefixedMatches = options.contains(.includeNonPrefixedMatches) + let includeCaseInsensitiveMatches = options.contains(.includeCaseInsensitiveMatches) + let includeDiacriticsInsensitiveMatches = options.contains(.includeDiacriticsInsensitiveMatches) - // Find branch with matching prefix. - for char in key[key.index(key.startIndex, offsetBy: keyIndex)...] { - if let child = currentNode.children[char] { - currentNode = child - } else { - let child = Trie() - child.parent = currentNode - currentNode.children[char] = child - currentNode = child - } - } - - if keyIndex == 0 { - currentNode.exactMatchValues.insert(value) - - // If includeNonPrefixedMatches is true, the first character of the key can be the only root branch - // and we terminate the loop early. 
- if !includeNonPrefixedMatches { - return - } - } else { - currentNode.substringMatchValues.insert(value) + var keys = [originalKey] + if includeCaseInsensitiveMatches { + let localizedLowercase = originalKey.localizedLowercase + if localizedLowercase != originalKey { + keys.append(localizedLowercase) } } + if includeDiacriticsInsensitiveMatches, + let keyWithoutDiacritics = originalKey.applyingTransform(.stripDiacritics, reverse: false), + keyWithoutDiacritics != originalKey { + keys.append(keyWithoutDiacritics) + + if includeCaseInsensitiveMatches { + let localizedLowercaseWithoutDiacritics = keyWithoutDiacritics.localizedLowercase + if localizedLowercaseWithoutDiacritics != originalKey { + keys.append(localizedLowercaseWithoutDiacritics) + } + } + } + + for key in keys { + // Create root branches for each character of the key to enable substring searches + // instead of only just prefix searches. + // Hence the nested loop. + for keyIndex in 0..() - let keys = ["foobar", "food", "foo", "somethingelse", "duplicate", "duplicate"] + let keys = ["foobar", "food", "foo", "somethingelse", "duplicate", "duplicate", "first: second", "août"] keys.forEach { - trie.insert(key: $0, value: $0) + XCTAssertEqual(trie.insert(key: $0, value: $0), [$0]) } let allResults = trie.find(key: "") - XCTAssertEqual(Set(allResults), Set(["foobar", "food", "foo", "somethingelse", "duplicate"])) + XCTAssertEqual(Set(allResults), Set(keys)) let fooResults = trie.find(key: "foo") XCTAssertEqual(fooResults.first, "foo") @@ -29,11 +29,20 @@ final class TrieTests: XCTestCase { XCTAssertEqual(foodResults, ["food"]) let ooResults = trie.find(key: "oo") - XCTAssertEqual(Set(ooResults), Set([])) + XCTAssertEqual(ooResults, []) + + let multipleWordsResults = trie.find(key: "second") + XCTAssertEqual(multipleWordsResults, []) let notFoundResults = trie.find(key: "notfound") XCTAssertEqual(notFoundResults, []) + let caseSensitiveResults = trie.find(key: "FOO") + XCTAssertEqual(caseSensitiveResults, 
[]) + + let diacriticResults = trie.find(key: "aout") + XCTAssertEqual(diacriticResults, []) + // Sanity check that the root node has children. XCTAssertTrue(trie.hasChildren) @@ -44,13 +53,13 @@ final class TrieTests: XCTestCase { func testFindNonPrefixedMatches() throws { let trie = Trie() - let keys = ["foobar", "food", "foo", "somethingelse", "duplicate", "duplicate"] + let keys = ["foobar", "food", "foo", "somethingelse", "duplicate", "duplicate", "first: second", "août"] keys.forEach { - trie.insert(key: $0, value: $0, includeNonPrefixedMatches: true) + XCTAssertEqual(trie.insert(key: $0, value: $0, options: [.includeNonPrefixedMatches]), [$0]) } let allResults = trie.find(key: "") - XCTAssertEqual(Set(allResults), Set(["foobar", "food", "foo", "somethingelse", "duplicate"])) + XCTAssertEqual(Set(allResults), Set(keys)) let fooResults = trie.find(key: "foo") XCTAssertEqual(fooResults.first, "foo") @@ -62,12 +71,87 @@ final class TrieTests: XCTestCase { let ooResults = trie.find(key: "oo") XCTAssertEqual(Set(ooResults), Set(["foobar", "food", "foo"])) + let multipleWordsResults = trie.find(key: "second") + XCTAssertEqual(multipleWordsResults, ["first: second"]) + let aResults = trie.find(key: "a") - XCTAssertEqual(Set(aResults), Set(["foobar", "duplicate"])) + XCTAssertEqual(Set(aResults), Set(["foobar", "duplicate", "août"])) let notFoundResults = trie.find(key: "notfound") XCTAssertEqual(notFoundResults, []) + let caseSensitiveResults = trie.find(key: "FOO") + XCTAssertEqual(caseSensitiveResults, []) + + let diacriticResults = trie.find(key: "aout") + XCTAssertEqual(diacriticResults, []) + + // Sanity check that the root node has children. + XCTAssertTrue(trie.hasChildren) + + // Sanity check that the root node has no values. 
+ XCTAssertFalse(trie.hasValues) + } + + func testFindCaseInsensitive() throws { + let trie = Trie() + + let key = "FoObAr" + XCTAssertEqual(trie.insert(key: key, value: key, options: [.includeCaseInsensitiveMatches]), [key, "foobar"]) + + let allResults = trie.find(key: "") + XCTAssertEqual(Set(allResults), Set([key])) + + let fooResults = trie.find(key: "foo") + XCTAssertEqual(fooResults, [key]) + + // Sanity check that the root node has children. + XCTAssertTrue(trie.hasChildren) + + // Sanity check that the root node has no values. + XCTAssertFalse(trie.hasValues) + } + + func testFindDiacriticInsensitive() throws { + let trie = Trie() + + let key = "Laïcité" + XCTAssertEqual( + trie.insert(key: key, value: key, options: [.includeDiacriticsInsensitiveMatches]), + [key, "Laicite"] + ) + + let allResults = trie.find(key: "") + XCTAssertEqual(Set(allResults), Set([key])) + + let laiciteResults = trie.find(key: "Laicite") + XCTAssertEqual(laiciteResults, [key]) + + // Sanity check that the root node has children. + XCTAssertTrue(trie.hasChildren) + + // Sanity check that the root node has no values. + XCTAssertFalse(trie.hasValues) + } + + func testFindCaseAndDiacriticInsensitive() throws { + let trie = Trie() + + let key = "Laïcité" + XCTAssertEqual( + trie.insert( + key: key, + value: key, + options: [.includeCaseInsensitiveMatches, .includeDiacriticsInsensitiveMatches] + ), + [key, "laïcité", "Laicite", "laicite"]) + + let allResults = trie.find(key: "") + XCTAssertEqual(Set(allResults), Set([key])) + + let laiciteResults = trie.find(key: "laicite") + XCTAssertEqual(laiciteResults, [key]) + // Sanity check that the root node has children. 
XCTAssertTrue(trie.hasChildren) @@ -78,13 +162,26 @@ final class TrieTests: XCTestCase { func testRemove() { let trie = Trie() - let keys = ["foobar", "food", "foo", "somethingelse", "duplicate", "duplicate"] + let keys = ["FoObAr", "FOOD", "foo", "Sométhingëlse", "duplicate", "duplicate"] + var insertedKeysMap = [String: [String]]() keys.forEach { - trie.insert(key: $0, value: $0) + insertedKeysMap[$0] = trie.insert(key: $0, value: $0, + options: [ + .includeNonPrefixedMatches, + .includeCaseInsensitiveMatches, + .includeDiacriticsInsensitiveMatches + ]) } - keys.forEach { - trie.remove(key: $0, value: $0) + XCTAssertEqual( + Set(insertedKeysMap.values.reduce([], +)), + Set(keys + ["foobar", "food", "Somethingelse", "somethingelse", "sométhingëlse"]) + ) + + insertedKeysMap.forEach { originalKey, insertedKeys in + insertedKeys.forEach { insertedKey in + trie.remove(key: insertedKey, value: originalKey) + } } let allResults = trie.find(key: "")