From 92cdbe35bf2b38165ec171703f039c074cb8ebed Mon Sep 17 00:00:00 2001 From: Terry Yiu <963907+tyiu@users.noreply.github.com> Date: Sun, 9 Jun 2024 00:20:09 -0400 Subject: [PATCH] Add support for localized emoji keywords --- Sources/EmojiKit/Emoji.swift | 6 +- .../CLDRAnnotationsXMLHandler.swift | 13 +- Sources/EmojiSourceKit/EmojiDownloader.swift | 241 +++++++++++++++--- Sources/EmojiSourceKit/UnicodeParser.swift | 29 +-- 4 files changed, 225 insertions(+), 64 deletions(-) diff --git a/Sources/EmojiKit/Emoji.swift b/Sources/EmojiKit/Emoji.swift index 1d263e7..20c5303 100644 --- a/Sources/EmojiKit/Emoji.swift +++ b/Sources/EmojiKit/Emoji.swift @@ -9,10 +9,10 @@ import Foundation public struct Emoji: Codable, Hashable { public let value: String - public let keywords: [String] + public let localizedKeywords: [String: [String]] - public init(value: String, keywords: [String]) { + public init(value: String, localizedKeywords: [String: [String]]) { self.value = value - self.keywords = keywords + self.localizedKeywords = localizedKeywords } } diff --git a/Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift b/Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift index bce21e5..d89ccc5 100644 --- a/Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift +++ b/Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift @@ -9,15 +9,21 @@ import Foundation import EmojiKit class CLDRAnnotationsXMLHandler: NSObject, XMLParserDelegate { + let locale: String + var currentElement = "" var currentEmoji: Emoji? var emojis = [Emoji]() var currentEmojiValue = "" + init(locale: String) { + self.locale = locale + } + func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) { if elementName == "annotation" && attributeDict["type"] != "tts" { if let cp = attributeDict["cp"] { - currentEmoji = Emoji(value: cp, keywords: []) + currentEmoji = Emoji(value: cp, localizedKeywords: [:]) currentEmojiValue = "" } } @@ -33,8 +39,9 @@ class CLDRAnnotationsXMLHandler: NSObject, XMLParserDelegate { func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) { if elementName == "annotation" { if let emoji = currentEmoji { - let values = currentEmojiValue.split(separator: "|").map { $0.trim() } - emojis.append(Emoji(value: emoji.value, keywords: values)) + var localizedKeywords = [String: [String]]() + localizedKeywords[locale] = currentEmojiValue.split(separator: "|").map { $0.trim() } + emojis.append(Emoji(value: emoji.value, localizedKeywords: localizedKeywords)) } } currentElement = "" diff --git a/Sources/EmojiSourceKit/EmojiDownloader.swift b/Sources/EmojiSourceKit/EmojiDownloader.swift index c8b25b0..d418659 100644 --- a/Sources/EmojiSourceKit/EmojiDownloader.swift +++ b/Sources/EmojiSourceKit/EmojiDownloader.swift @@ -45,43 +45,54 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand { return } - guard let cldrAnnotationsURL = await getURLForCLDRAnnotations() else { - return - } - - guard let cldrAnnotationsDerivedURL = await getURLForCLDRAnnotationsDerived() else { - return - } - print("🎉", "Successfully retrieved temporary URLs for version \(version.rawValue).\n") print("⚙️", "Starting to parse content...\n") - print("Trying CLDR data at \(cldrAnnotationsURL)\n") + var allCLDRAnnotations = [String: Emoji]() + for locale in supportedLocales { + guard let cldrAnnotationsURL = await getURLForCLDRAnnotations(locale: locale) else { + return + } - let cldrAnnotationsHandle = try FileHandle(forReadingFrom: cldrAnnotationsURL) - guard let cldrAnnotationsData = try cldrAnnotationsHandle.readToEnd() else { - print("⚠️", "Could not read CLDR annotations data.\n") - return + guard let cldrAnnotationsDerivedURL = await getURLForCLDRAnnotationsDerived(locale: locale) else { + return + } + + print("Trying CLDR data at \(cldrAnnotationsURL)\n") + + let cldrAnnotationsHandle = try FileHandle(forReadingFrom: cldrAnnotationsURL) + guard let cldrAnnotationsData = try cldrAnnotationsHandle.readToEnd() else { + print("⚠️", "Could not read CLDR annotations data.\n") + return + } + + let cldrAnnotationsMap = emojisMap(data: cldrAnnotationsData, locale: locale) ?? [:] + + print("Trying CLDR data at \(cldrAnnotationsDerivedURL)\n") + + let cldrAnnotationsDerivedHandle = try FileHandle(forReadingFrom: cldrAnnotationsDerivedURL) + guard let cldrAnnotationsDerivedData = try cldrAnnotationsDerivedHandle.readToEnd() else { + print("⚠️", "Could not read CLDR annotations derived data.\n") + return + } + let cldrAnnotationsDerivedMap = emojisMap(data: cldrAnnotationsDerivedData, locale: locale) ?? [:] + + allCLDRAnnotations.merge(cldrAnnotationsMap) { (current, new) in + let combinedKeywords = current.localizedKeywords.merging(new.localizedKeywords) { (current, _) in current } + return Emoji(value: current.value, localizedKeywords: combinedKeywords) + } + + allCLDRAnnotations.merge(cldrAnnotationsDerivedMap) { (current, new) in + let combinedKeywords = current.localizedKeywords.merging(new.localizedKeywords) { (current, _) in current } + return Emoji(value: current.value, localizedKeywords: combinedKeywords) + } } - let cldrAnnotationsMap = emojisMap(data: cldrAnnotationsData) ?? [:] - - print("Trying CLDR data at \(cldrAnnotationsDerivedURL)\n") - - let cldrAnnotationsDerivedHandle = try FileHandle(forReadingFrom: cldrAnnotationsDerivedURL) - guard let cldrAnnotationsDerivedData = try cldrAnnotationsDerivedHandle.readToEnd() else { - print("⚠️", "Could not read CLDR annotations derived data.\n") - return - } - let cldrAnnotationsDerivedMap = emojisMap(data: cldrAnnotationsDerivedData) ?? [:] - - let allCLDRAnnotationsMap = cldrAnnotationsMap.merging(cldrAnnotationsDerivedMap) { (current, _) in current } - let parser = UnicodeParser() do { - let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL, emojisMap: allCLDRAnnotationsMap) + let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL, emojisMap: allCLDRAnnotations) let emojiCounts: [UnicodeEmojiCategory.Name: Int] = parser.parseCountHTML(for: emojiCountsURL) @@ -100,10 +111,9 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand { } } - func emojisMap(data: Data) -> [String: Emoji]? { - print("emojisMap\n") + func emojisMap(data: Data, locale: String) -> [String: Emoji]? { let parser = XMLParser(data: data) - let handler = CLDRAnnotationsXMLHandler() + let handler = CLDRAnnotationsXMLHandler(locale: locale) parser.delegate = handler if parser.parse() { @@ -114,12 +124,175 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand { } } - func getURLForCLDRAnnotations() async -> URL? { - return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotations/en.xml") + let supportedLocales = [ + "af", + "am", + "ar", + "ar_SA", + "as", + "ast", + "az", + "be", + "bew", + "bg", + "bgn", + "bn", + "br", + "bs", + "ca", + "ccp", + "ceb", + "chr", + "ckb", + "cs", + "cv", + "cy", + "da", + "de", + "de_CH", + "doi", + "dsb", + "el", + "en", + "en_001", + "en_AU", + "en_CA", + "en_GB", + "en_IN", + "es", + "es_419", + "es_MX", + "es_US", + "et", + "eu", + "fa", + "ff", + "ff_Adlm", + "fi", + "fil", + "fo", + "fr", + "fr_CA", + "ga", + "gd", + "gl", + "gu", + "ha", + "ha_NE", + "he", + "hi", + "hi_Latn", + "hr", + "hsb", + "hu", + "hy", + "ia", + "id", + "ig", + "is", + "it", + "ja", + "jv", + "ka", + "kab", + "kk", + "kl", + "km", + "kn", + "ko", + "kok", + "ku", + "ky", + "lb", + "lij", + "lo", + "lt", + "lv", + "mai", + "mi", + "mk", + "ml", + "mn", + "mni", + "mr", + "ms", + "mt", + "my", + "nb", + "ne", + "nl", + "nn", + "no", + "nso", + "oc", + "om", + "or", + "pa", + "pa_Arab", + "pcm", + "pl", + "ps", + "pt", + "pt_PT", + "qu", + "quc", + "rhg", + "rm", + "ro", + "root", + "ru", + "rw", + "sa", + "sat", + "sc", + "sd", + "si", + "sk", + "sl", + "so", + "sq", + "sr", + "sr_Cyrl", + "sr_Cyrl_BA", + "sr_Latn", + "sr_Latn_BA", + "su", + "sv", + "sw", + "sw_KE", + "ta", + "te", + "tg", + "th", + "ti", + "tk", + "tn", + "to", + "tr", + "tt", + "ug", + "uk", + "ur", + "uz", + "vi", + "wo", + "xh", + "yo", + "yo_BJ", + "yue", + "yue_Hans", + "zh", + "zh_Hant", + "zh_Hant_HK", + "zu" + ] + + func getURLForCLDRAnnotations(locale: String) async -> URL? { + return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/main/common/annotations/\(locale).xml") } - func getURLForCLDRAnnotationsDerived() async -> URL? { - return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotationsDerived/en.xml") + func getURLForCLDRAnnotationsDerived(locale: String) async -> URL? { + return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/main/common/annotationsDerived/\(locale).xml") } func getTemporaryURLForEmojiList(version: EmojiManager.Version) async -> URL? { diff --git a/Sources/EmojiSourceKit/UnicodeParser.swift b/Sources/EmojiSourceKit/UnicodeParser.swift index 2a47e53..43d7af6 100644 --- a/Sources/EmojiSourceKit/UnicodeParser.swift +++ b/Sources/EmojiSourceKit/UnicodeParser.swift @@ -19,25 +19,6 @@ class UnicodeParser { case minimallyQualified = "minimally-qualified" } - func parseEmojis(for fileUrl: URL) async throws { - URLSession.shared.dataTask(with: fileUrl) { data, response, error in - guard let data = data, error == nil else { - print("Failed to download data: \(error?.localizedDescription ?? "Unknown error")") - return - } - - // Parse the downloaded XML data - self.parseXML(data: data) - } - } - - func parseXML(data: Data) { - let parser = XMLParser(data: data) - let handler = CLDRAnnotationsXMLHandler() - parser.delegate = handler - parser.parse() - } - func parseEmojiList(for fileUrl: URL, emojisMap: [String: Emoji]) async throws -> [UnicodeEmojiCategory] { let handle = try FileHandle(forReadingFrom: fileUrl) var currentGroup: UnicodeEmojiCategory.Name = .activities @@ -89,21 +70,21 @@ class UnicodeParser { if multiHexEmoji.isEmpty == false { if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: multiHexEmoji)] { - if mapLookup.keywords.isEmpty == true { + if mapLookup.localizedKeywords.isEmpty == true { print("Could not find keywords in emojis map for multiHex: \(multiHexEmoji)\n") } - emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: mapLookup.keywords) + emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, localizedKeywords: mapLookup.localizedKeywords) } else { print("Could not find in emojis map at all for multiHex: \(multiHexEmoji)\n") - emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: []) + emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, localizedKeywords: [:]) } } } else { if let unicode = hexString.asEmoji(), unicode.isEmpty == false { if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: unicode)] { - emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: mapLookup.keywords) + emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, localizedKeywords: mapLookup.localizedKeywords) } else { - emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: []) + emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, localizedKeywords: [:]) } } }