Add support for localized emoji keywords

This commit is contained in:
2024-06-09 00:20:09 -04:00
parent 99744e7e8d
commit 92cdbe35bf
4 changed files with 225 additions and 64 deletions

View File

@@ -9,10 +9,10 @@ import Foundation
/// An emoji string together with the keywords associated with it.
public struct Emoji: Codable, Hashable {
/// The emoji itself, stored as a string.
public let value: String
public let keywords: [String]
/// Keyword lists grouped under a string key; the CLDR annotations handler files each list
/// under the locale identifier it was created with.
public let localizedKeywords: [String: [String]]
public init(value: String, keywords: [String]) {
/// Creates an emoji from its string value and its keyword lists.
public init(value: String, localizedKeywords: [String: [String]]) {
self.value = value
self.keywords = keywords
self.localizedKeywords = localizedKeywords
}
}

View File

@@ -9,15 +9,21 @@ import Foundation
import EmojiKit
class CLDRAnnotationsXMLHandler: NSObject, XMLParserDelegate {
let locale: String
var currentElement = ""
var currentEmoji: Emoji?
var emojis = [Emoji]()
var currentEmojiValue = ""
/// Creates a handler that remembers which locale the parsed document belongs to.
/// - Parameter locale: Identifier stored on the handler; parsed keyword lists are later filed
///   under this key in `Emoji.localizedKeywords`.
init(locale: String) {
self.locale = locale
}
func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) {
if elementName == "annotation" && attributeDict["type"] != "tts" {
if let cp = attributeDict["cp"] {
currentEmoji = Emoji(value: cp, keywords: [])
currentEmoji = Emoji(value: cp, localizedKeywords: [:])
currentEmojiValue = ""
}
}
@@ -33,8 +39,9 @@ class CLDRAnnotationsXMLHandler: NSObject, XMLParserDelegate {
func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
if elementName == "annotation" {
if let emoji = currentEmoji {
let values = currentEmojiValue.split(separator: "|").map { $0.trim() }
emojis.append(Emoji(value: emoji.value, keywords: values))
var localizedKeywords = [String: [String]]()
localizedKeywords[locale] = currentEmojiValue.split(separator: "|").map { $0.trim() }
emojis.append(Emoji(value: emoji.value, localizedKeywords: localizedKeywords))
}
}
currentElement = ""

View File

@@ -45,18 +45,20 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
return
}
guard let cldrAnnotationsURL = await getURLForCLDRAnnotations() else {
return
}
guard let cldrAnnotationsDerivedURL = await getURLForCLDRAnnotationsDerived() else {
return
}
print("🎉", "Successfully retrieved temporary URLs for version \(version.rawValue).\n")
print("⚙️", "Starting to parse content...\n")
var allCLDRAnnotations = [String: Emoji]()
for locale in supportedLocales {
guard let cldrAnnotationsURL = await getURLForCLDRAnnotations(locale: locale) else {
return
}
guard let cldrAnnotationsDerivedURL = await getURLForCLDRAnnotationsDerived(locale: locale) else {
return
}
print("Trying CLDR data at \(cldrAnnotationsURL)\n")
let cldrAnnotationsHandle = try FileHandle(forReadingFrom: cldrAnnotationsURL)
@@ -65,7 +67,7 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
return
}
let cldrAnnotationsMap = emojisMap(data: cldrAnnotationsData) ?? [:]
let cldrAnnotationsMap = emojisMap(data: cldrAnnotationsData, locale: locale) ?? [:]
print("Trying CLDR data at \(cldrAnnotationsDerivedURL)\n")
@@ -74,14 +76,23 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
print("⚠️", "Could not read CLDR annotations derived data.\n")
return
}
let cldrAnnotationsDerivedMap = emojisMap(data: cldrAnnotationsDerivedData) ?? [:]
let cldrAnnotationsDerivedMap = emojisMap(data: cldrAnnotationsDerivedData, locale: locale) ?? [:]
let allCLDRAnnotationsMap = cldrAnnotationsMap.merging(cldrAnnotationsDerivedMap) { (current, _) in current }
allCLDRAnnotations.merge(cldrAnnotationsMap) { (current, new) in
let combinedKeywords = current.localizedKeywords.merging(new.localizedKeywords) { (current, _) in current }
return Emoji(value: current.value, localizedKeywords: combinedKeywords)
}
allCLDRAnnotations.merge(cldrAnnotationsDerivedMap) { (current, new) in
let combinedKeywords = current.localizedKeywords.merging(new.localizedKeywords) { (current, _) in current }
return Emoji(value: current.value, localizedKeywords: combinedKeywords)
}
}
let parser = UnicodeParser()
do {
let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL, emojisMap: allCLDRAnnotationsMap)
let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL, emojisMap: allCLDRAnnotations)
let emojiCounts: [UnicodeEmojiCategory.Name: Int] = parser.parseCountHTML(for: emojiCountsURL)
@@ -100,10 +111,9 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
}
}
func emojisMap(data: Data) -> [String: Emoji]? {
print("emojisMap\n")
func emojisMap(data: Data, locale: String) -> [String: Emoji]? {
let parser = XMLParser(data: data)
let handler = CLDRAnnotationsXMLHandler()
let handler = CLDRAnnotationsXMLHandler(locale: locale)
parser.delegate = handler
if parser.parse() {
@@ -114,12 +124,175 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
}
}
func getURLForCLDRAnnotations() async -> URL? {
return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotations/en.xml")
/// Locale identifiers whose CLDR annotation files are fetched and parsed, one pass per entry.
/// Each entry is interpolated verbatim into the `<locale>.xml` file name of the
/// `common/annotations` and `common/annotationsDerived` URLs.
/// NOTE(review): the download loop returns early when either URL cannot be retrieved, so every
/// entry presumably has to exist in both upstream directories — confirm against the CLDR repo.
let supportedLocales = [
"af",
"am",
"ar",
"ar_SA",
"as",
"ast",
"az",
"be",
"bew",
"bg",
"bgn",
"bn",
"br",
"bs",
"ca",
"ccp",
"ceb",
"chr",
"ckb",
"cs",
"cv",
"cy",
"da",
"de",
"de_CH",
"doi",
"dsb",
"el",
"en",
"en_001",
"en_AU",
"en_CA",
"en_GB",
"en_IN",
"es",
"es_419",
"es_MX",
"es_US",
"et",
"eu",
"fa",
"ff",
"ff_Adlm",
"fi",
"fil",
"fo",
"fr",
"fr_CA",
"ga",
"gd",
"gl",
"gu",
"ha",
"ha_NE",
"he",
"hi",
"hi_Latn",
"hr",
"hsb",
"hu",
"hy",
"ia",
"id",
"ig",
"is",
"it",
"ja",
"jv",
"ka",
"kab",
"kk",
"kl",
"km",
"kn",
"ko",
"kok",
"ku",
"ky",
"lb",
"lij",
"lo",
"lt",
"lv",
"mai",
"mi",
"mk",
"ml",
"mn",
"mni",
"mr",
"ms",
"mt",
"my",
"nb",
"ne",
"nl",
"nn",
"no",
"nso",
"oc",
"om",
"or",
"pa",
"pa_Arab",
"pcm",
"pl",
"ps",
"pt",
"pt_PT",
"qu",
"quc",
"rhg",
"rm",
"ro",
"root",
"ru",
"rw",
"sa",
"sat",
"sc",
"sd",
"si",
"sk",
"sl",
"so",
"sq",
"sr",
"sr_Cyrl",
"sr_Cyrl_BA",
"sr_Latn",
"sr_Latn_BA",
"su",
"sv",
"sw",
"sw_KE",
"ta",
"te",
"tg",
"th",
"ti",
"tk",
"tn",
"to",
"tr",
"tt",
"ug",
"uk",
"ur",
"uz",
"vi",
"wo",
"xh",
"yo",
"yo_BJ",
"yue",
"yue_Hans",
"zh",
"zh_Hant",
"zh_Hant_HK",
"zu"
]
/// Resolves the CLDR `common/annotations` XML file for one locale.
///
/// - Parameters:
///   - locale: Locale identifier used verbatim as the file name, e.g. `"en"` or `"zh_Hant"`.
///   - revision: Branch, tag or commit hash of the `unicode-org/cldr` repository to read from.
///     Defaults to `"main"`, which moves over time; pass a commit hash to get the same file
///     contents on every run.
/// - Returns: The value produced by `load(urlString:)` for the resolved address
///   (`nil` presumably signals that the file could not be retrieved — confirm in `load`).
func getURLForCLDRAnnotations(locale: String, revision: String = "main") async -> URL? {
    let address = "https://raw.githubusercontent.com/unicode-org/cldr/\(revision)/common/annotations/\(locale).xml"
    return await load(urlString: address)
}
func getURLForCLDRAnnotationsDerived() async -> URL? {
return await load(urlString: "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotationsDerived/en.xml")
/// Resolves the CLDR `common/annotationsDerived` XML file for one locale.
///
/// - Parameters:
///   - locale: Locale identifier used verbatim as the file name, e.g. `"en"` or `"zh_Hant"`.
///   - revision: Branch, tag or commit hash of the `unicode-org/cldr` repository to read from.
///     Defaults to `"main"`, which moves over time; pass a commit hash to get the same file
///     contents on every run.
/// - Returns: The value produced by `load(urlString:)` for the resolved address
///   (`nil` presumably signals that the file could not be retrieved — confirm in `load`).
func getURLForCLDRAnnotationsDerived(locale: String, revision: String = "main") async -> URL? {
    let address = "https://raw.githubusercontent.com/unicode-org/cldr/\(revision)/common/annotationsDerived/\(locale).xml"
    return await load(urlString: address)
}
func getTemporaryURLForEmojiList(version: EmojiManager.Version) async -> URL? {

View File

@@ -19,25 +19,6 @@ class UnicodeParser {
case minimallyQualified = "minimally-qualified"
}
/// Downloads the XML document at `fileUrl` and hands the received bytes to `parseXML(data:)`.
///
/// The function suspends until the download's completion handler has run, so parsing has
/// finished (or the failure has been logged) by the time it returns.
///
/// - Parameter fileUrl: Location of the XML document to download.
/// - Note: A failed download is logged with `print` and not thrown; the `throws` in the
///   signature is kept only so existing `try await` call sites remain valid.
func parseEmojis(for fileUrl: URL) async throws {
    await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
        let task = URLSession.shared.dataTask(with: fileUrl) { data, _, error in
            // The continuation must be resumed exactly once, whichever path the handler takes.
            defer { continuation.resume() }
            guard let data = data, error == nil else {
                print("Failed to download data: \(error?.localizedDescription ?? "Unknown error")")
                return
            }
            // Parse the downloaded XML data
            self.parseXML(data: data)
        }
        // Data tasks are created suspended; without this call the request is never sent.
        task.resume()
    }
}
/// Runs an `XMLParser` over `data` with a `CLDRAnnotationsXMLHandler` as its delegate.
///
/// - Parameters:
///   - data: Raw bytes of the XML document.
///   - locale: Locale identifier passed to the handler, whose initializer requires one.
///     Defaults to `"en"` so existing `parseXML(data:)` call sites keep compiling.
/// - Note: NOTE(review): the emojis collected by the handler are not returned or stored here,
///   so this method only validates that the document parses — confirm that this is intended.
func parseXML(data: Data, locale: String = "en") {
    let handler = CLDRAnnotationsXMLHandler(locale: locale)
    let parser = XMLParser(data: data)
    parser.delegate = handler
    // `parse()` reports success as a Bool; surface a failure instead of dropping it silently.
    if !parser.parse() {
        print("Failed to parse XML: \(parser.parserError?.localizedDescription ?? "Unknown error")")
    }
}
func parseEmojiList(for fileUrl: URL, emojisMap: [String: Emoji]) async throws -> [UnicodeEmojiCategory] {
let handle = try FileHandle(forReadingFrom: fileUrl)
var currentGroup: UnicodeEmojiCategory.Name = .activities
@@ -89,21 +70,21 @@ class UnicodeParser {
if multiHexEmoji.isEmpty == false {
if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: multiHexEmoji)] {
if mapLookup.keywords.isEmpty == true {
if mapLookup.localizedKeywords.isEmpty == true {
print("Could not find keywords in emojis map for multiHex: \(multiHexEmoji)\n")
}
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: mapLookup.keywords)
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, localizedKeywords: mapLookup.localizedKeywords)
} else {
print("Could not find in emojis map at all for multiHex: \(multiHexEmoji)\n")
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: [])
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, localizedKeywords: [:])
}
}
} else {
if let unicode = hexString.asEmoji(), unicode.isEmpty == false {
if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: unicode)] {
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: mapLookup.keywords)
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, localizedKeywords: mapLookup.localizedKeywords)
} else {
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: [])
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, localizedKeywords: [:])
}
}
}