translate: implement string distance for close matches

Implement the Levenshtein string-distance algorithm to determine
whether a translation is too similar to the original content.

Closes: https://github.com/damus-io/damus/issues/1996

Lightning-address: kernelkind@getalby.com
Signed-off-by: kernelkind <kernelkind@gmail.com>
Reviewed-by: William Casarin <jb55@jb55.com>
Link: 20240214032018.57812-1-kernelkind@gmail.com
Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
kernelkind
2024-02-13 22:20:18 -05:00
committed by William Casarin
parent 90180202b6
commit 58326f679e
3 changed files with 119 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
//
// TranslationTests.swift
// damusTests
//
// Created by KernelKind on 2/13/24.
//
import XCTest
@testable import damus
/// Table-driven tests for the string-distance helpers used to compare a
/// translation against its original text.
final class TranslationTests: XCTestCase {
    /// Cases for `translationMeetsStringDistanceRequirements(original:translated:)`.
    ///
    /// Each tuple is (original text, translated text, expected return value).
    let translationStringDistanceCases: [(String, String, Bool)] = [
        ("test", "test ", false),
        ("wat", "what", false),
        ("wat's the wether like", "what's the weather like", true),
        ("GM GZY⚡\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4 ", "GM GZY⚡\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4", false),
        ("Fucking nostr forever typos lol 😂", "Fucking nostr forever typo's lol 😂", false),
        ("where's the library", "donde esta la libreria", true),
        ("In America", "En América", true)
    ]

    /// Checks every entry of `translationStringDistanceCases` against
    /// `translationMeetsStringDistanceRequirements(original:translated:)`.
    func testStringDistanceRequirements() {
        for (original, translated, expectedVal) in translationStringDistanceCases {
            let actual = translationMeetsStringDistanceRequirements(original: original, translated: translated)
            // All cases share this assertion line, so the message must identify
            // the failing tuple. `String(reflecting:)` escapes newlines and quotes.
            XCTAssertEqual(
                actual,
                expectedVal,
                "original: \(String(reflecting: original)), translated: \(String(reflecting: translated))"
            )
        }
    }

    /// Cases for `levenshteinDistanceIsGreaterThanOrEqualTo(from:to:threshold:)`.
    ///
    /// Each tuple is (original string, mutated string, number of changes from original to mutated).
    let levenshteinDistanceCases: [(String, String, Int)] = [
        ("hello", "hello", 0), // No change
        ("123", "1234", 1), // Addition at the end
        ("abcd", "abcde", 1), // Addition at the end
        ("abc", "a", 2), // Multiple deletions
        ("abcdef", "abc", 3), // Multiple deletions
        ("2024", "2025", 1), // Single substitution
        ("openai", "opnai", 1), // Single deletion
        ("swift", "swiift", 1), // Single addition
        ("language", "languag", 1), // Single deletion at the end
        ("example", "sxample", 1), // Single substitution at the beginning
        ("distance", "d1stanc3", 2), // Substitutions
        ("python", "pyth0n", 1), // Single substitution
        ("algorithm", "algor1thm", 1), // Single substitution in the middle
        ("implementation", "implemenation", 1), // Single deletion (typo)
        ("correction", "correctionn", 1), // Single addition at the end
        ("levenshtein", "levenshtien", 2), // Transposition (expected to count as two edits)
        ("threshold", "threshhold", 1), // Single addition (double letter)
        ("functionality", "fuctionality", 1), // Single deletion (common typo)
        ("assessment", "assesment", 1), // Single deletion (common typo)
        ("performance", "performence", 1), // Single substitution (common typo)
    ]

    /// Checks every entry of `levenshteinDistanceCases`.
    ///
    /// The helper only answers "is the distance at least `threshold`?", so each
    /// case is probed twice: it must hold at `numChanges` and fail at
    /// `numChanges + 1`. Together the two probes bracket the expected distance.
    func testLevenshteinDistance() {
        for (original, mutated, numChanges) in levenshteinDistanceCases {
            // Shared label so a failure names the offending pair.
            let pair = "\(String(reflecting: original)) -> \(String(reflecting: mutated))"

            XCTAssertTrue(
                levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges),
                "\(pair): expected distance >= \(numChanges)"
            )
            XCTAssertFalse(
                levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges + 1),
                "\(pair): expected distance < \(numChanges + 1)"
            )
        }
    }
}