Add support for emoji keywords using CLDR annotation data
This commit is contained in:
56
Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift
Normal file
56
Sources/EmojiSourceKit/CLDRAnnotationsXMLHandler.swift
Normal file
@@ -0,0 +1,56 @@
|
||||
//
|
||||
// CLDRAnnotationsXMLHandler.swift
|
||||
//
|
||||
//
|
||||
// Created by Terry Yiu on 6/2/24.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
import EmojiKit
|
||||
|
||||
/// `XMLParserDelegate` that collects emoji keyword annotations from an XML document.
///
/// Every `<annotation cp="…">` element whose `type` attribute is not `"tts"` yields one
/// `Emoji` in `emojis`: the `cp` attribute becomes the emoji value and the element's
/// `|`-separated text becomes its keywords.
class CLDRAnnotationsXMLHandler: NSObject, XMLParserDelegate {
    /// Name of the element that is currently open, or `""` once an element has ended.
    var currentElement = ""
    /// Emoji for the keyword annotation being read; `nil` outside of such an element.
    var currentEmoji: Emoji?
    /// Every emoji collected so far, in document order.
    var emojis = [Emoji]()
    /// Raw, untrimmed text accumulated for the current `annotation` element.
    var currentEmojiValue = ""

    /// Starts a new emoji when a non-`tts` `annotation` element carrying a `cp` attribute opens.
    func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) {
        if elementName == "annotation", attributeDict["type"] != "tts", let cp = attributeDict["cp"] {
            currentEmoji = Emoji(value: cp, keywords: [])
            currentEmojiValue = ""
        }
        currentElement = elementName
    }

    /// Accumulates the text content of the current `annotation` element.
    func parser(_ parser: XMLParser, foundCharacters string: String) {
        guard currentElement == "annotation" else { return }
        // The parser may report one element's text in several pieces. Trimming each
        // piece would delete whitespace that happens to sit at a piece boundary and
        // glue neighbouring words together, so the text is stored untouched here and
        // trimmed per keyword once the element ends.
        currentEmojiValue += string
    }

    /// Finishes the current emoji (if any) and resets the per-element state.
    func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        if elementName == "annotation", let emoji = currentEmoji {
            // Keywords are separated by "|"; each one is trimmed with the project's `trim()` helper.
            let keywords = currentEmojiValue.split(separator: "|").map { $0.trim() }
            emojis.append(Emoji(value: emoji.value, keywords: keywords))
        }
        currentElement = ""
        currentEmojiValue = ""
        currentEmoji = nil
    }

    /// Logs parse errors reported by the parser.
    func parser(_ parser: XMLParser, parseErrorOccurred parseError: Error) {
        print("Parse error: \(parseError.localizedDescription)\n")
    }

    /// The collected emojis keyed by their value; a later entry replaces an earlier one with the same value.
    var emojisMap: [String: Emoji] {
        emojis.reduce(into: [String: Emoji]()) { $0[$1.value] = $1 }
    }
}
|
||||
|
||||
extension Character {
    /// `true` when at least one Unicode scalar of this character has the Unicode `Emoji` property.
    ///
    /// Note that the property is also set for scalars such as the ASCII digits, so this is a
    /// broad check rather than a test for "renders as an emoji".
    var isEmoji: Bool {
        for scalar in unicodeScalars where scalar.properties.isEmoji {
            return true
        }
        return false
    }
}
|
||||
@@ -24,7 +24,7 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
|
||||
#if DEBUG
|
||||
var url = URL(filePath: #file)
|
||||
url = url.deletingLastPathComponent().deletingLastPathComponent()
|
||||
url.append(path: "EmojiKitLibrary/Resources")
|
||||
url.append(path: "EmojiKit/Resources")
|
||||
|
||||
return url.absoluteString
|
||||
#else
|
||||
@@ -35,8 +35,21 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
|
||||
func run() async throws {
|
||||
print("⚙️", "Starting to download all emojis for version \(version.rawValue) from unicode.org...\n")
|
||||
|
||||
guard let emojiListURL = await getTemporaryURLForEmojiList(version: version), let emojiCountsURL = await getTemporaryURLForEmojiCounts(version: version) else {
|
||||
print("⚠️", "Could not get content from unicode.org. Either the emoji list or the emoji count file is not available.\n")
|
||||
guard let emojiListURL = await getTemporaryURLForEmojiList(version: version) else {
|
||||
print("⚠️", "Could not get content from unicode.org. The emoji list is not available.\n")
|
||||
return
|
||||
}
|
||||
|
||||
guard let emojiCountsURL = await getTemporaryURLForEmojiCounts(version: version) else {
|
||||
print("⚠️", "Could not get content from unicode.org. The emoji count file is not available.\n")
|
||||
return
|
||||
}
|
||||
|
||||
guard let cldrAnnotationsURL = await getURLForCLDRAnnotations() else {
|
||||
return
|
||||
}
|
||||
|
||||
guard let cldrAnnotationsDerivedURL = await getURLForCLDRAnnotationsDerived() else {
|
||||
return
|
||||
}
|
||||
|
||||
@@ -44,15 +57,36 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
|
||||
|
||||
print("⚙️", "Starting to parse content...\n")
|
||||
|
||||
print("Trying CLDR data at \(cldrAnnotationsURL)\n")
|
||||
|
||||
let cldrAnnotationsHandle = try FileHandle(forReadingFrom: cldrAnnotationsURL)
|
||||
guard let cldrAnnotationsData = try cldrAnnotationsHandle.readToEnd() else {
|
||||
print("⚠️", "Could not read CLDR annotations data.\n")
|
||||
return
|
||||
}
|
||||
|
||||
let cldrAnnotationsMap = emojisMap(data: cldrAnnotationsData) ?? [:]
|
||||
|
||||
print("Trying CLDR data at \(cldrAnnotationsDerivedURL)\n")
|
||||
|
||||
let cldrAnnotationsDerivedHandle = try FileHandle(forReadingFrom: cldrAnnotationsDerivedURL)
|
||||
guard let cldrAnnotationsDerivedData = try cldrAnnotationsDerivedHandle.readToEnd() else {
|
||||
print("⚠️", "Could not read CLDR annotations derived data.\n")
|
||||
return
|
||||
}
|
||||
let cldrAnnotationsDerivedMap = emojisMap(data: cldrAnnotationsDerivedData) ?? [:]
|
||||
|
||||
let allCLDRAnnotationsMap = cldrAnnotationsMap.merging(cldrAnnotationsDerivedMap) { (current, _) in current }
|
||||
|
||||
let parser = UnicodeParser()
|
||||
|
||||
do {
|
||||
let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL)
|
||||
let emojisByCategory: [UnicodeEmojiCategory] = try await parser.parseEmojiList(for: emojiListURL, emojisMap: allCLDRAnnotationsMap)
|
||||
|
||||
let emojiCounts: [UnicodeEmojiCategory.Name: Int] = parser.parseCountHTML(for: emojiCountsURL)
|
||||
|
||||
for category in emojisByCategory {
|
||||
assert(emojiCounts[category.name] == category.values.count)
|
||||
assert(emojiCounts[category.name] == category.emojis.count)
|
||||
}
|
||||
|
||||
print("🎉", "Successfully parsed emojis and matched counts to the count file.\n")
|
||||
@@ -66,6 +100,28 @@ struct EmojiDownloader: ParsableCommand, AsyncParsableCommand {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses annotations XML and returns the collected emojis keyed by their value.
///
/// - Parameter data: Contents of the XML document to parse.
/// - Returns: The handler's `emojisMap`, or `nil` when `XMLParser.parse()` reports failure.
func emojisMap(data: Data) -> [String: Emoji]? {
    print("emojisMap\n")

    // The parser does not own its delegate, so the handler is kept in a local for the whole parse.
    let annotationsHandler = CLDRAnnotationsXMLHandler()
    let xmlParser = XMLParser(data: data)
    xmlParser.delegate = annotationsHandler

    guard xmlParser.parse() else {
        print("Failed to parse XML\n")
        return nil
    }
    return annotationsHandler.emojisMap
}
|
||||
|
||||
/// Requests the English annotations file (`common/annotations/en.xml`) from a fixed commit
/// of the `unicode-org/cldr` repository through `load(urlString:)`.
///
/// - Returns: The URL produced by `load(urlString:)`; presumably `nil` when loading fails — confirm against `load`.
func getURLForCLDRAnnotations() async -> URL? {
    let annotationsAddress = "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotations/en.xml"
    return await load(urlString: annotationsAddress)
}
|
||||
|
||||
/// Requests the English derived annotations file (`common/annotationsDerived/en.xml`) from a
/// fixed commit of the `unicode-org/cldr` repository through `load(urlString:)`.
///
/// - Returns: The URL produced by `load(urlString:)`; presumably `nil` when loading fails — confirm against `load`.
func getURLForCLDRAnnotationsDerived() async -> URL? {
    let derivedAnnotationsAddress = "https://raw.githubusercontent.com/unicode-org/cldr/c1dc8c7ef6584668345cf741e51b1722d8114bc8/common/annotationsDerived/en.xml"
    return await load(urlString: derivedAnnotationsAddress)
}
|
||||
|
||||
/// Requests `emoji-test.txt` for the given emoji version from unicode.org through `load(urlString:)`.
///
/// - Parameter version: Emoji version whose `versionIdentifier` selects the directory on unicode.org.
/// - Returns: The URL produced by `load(urlString:)`; presumably `nil` when loading fails — confirm against `load`.
func getTemporaryURLForEmojiList(version: EmojiManager.Version) async -> URL? {
    let emojiListAddress = "https://unicode.org/Public/emoji/\(version.versionIdentifier)/emoji-test.txt"
    return await load(urlString: emojiListAddress)
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
import Foundation
|
||||
import SwiftSoup
|
||||
import EmojiKit
|
||||
import OrderedCollections
|
||||
|
||||
class UnicodeParser {
|
||||
|
||||
@@ -18,10 +19,29 @@ class UnicodeParser {
|
||||
case minimallyQualified = "minimally-qualified"
|
||||
}
|
||||
|
||||
func parseEmojiList(for fileUrl: URL) async throws -> [UnicodeEmojiCategory] {
|
||||
/// Downloads the document at `fileUrl` and hands its contents to `parseXML(data:)`.
///
/// - Parameter fileUrl: Location of the XML document to download.
/// - Throws: The error reported by the data task, or `URLError(.badServerResponse)` when the
///   task finished with neither data nor an error.
func parseEmojis(for fileUrl: URL) async throws {
    let data: Data = try await withCheckedThrowingContinuation { continuation in
        let task = URLSession.shared.dataTask(with: fileUrl) { data, response, error in
            // The continuation is resumed exactly once: either with the payload or with the failure.
            guard let data = data, error == nil else {
                print("Failed to download data: \(error?.localizedDescription ?? "Unknown error")")
                continuation.resume(throwing: error ?? URLError(.badServerResponse))
                return
            }
            continuation.resume(returning: data)
        }
        // A data task is created suspended; without this call the request is never sent.
        task.resume()
    }

    // Parse the downloaded XML data
    parseXML(data: data)
}
|
||||
|
||||
/// Runs an `XMLParser` over `data` with a fresh `CLDRAnnotationsXMLHandler` as its delegate.
///
/// Both the parse result and the emojis collected by the handler are discarded when this
/// method returns.
///
/// - Parameter data: Contents of the XML document to parse.
func parseXML(data: Data) {
    let annotationsHandler = CLDRAnnotationsXMLHandler()
    let xmlParser = XMLParser(data: data)
    xmlParser.delegate = annotationsHandler
    _ = xmlParser.parse()
}
|
||||
|
||||
func parseEmojiList(for fileUrl: URL, emojisMap: [String: Emoji]) async throws -> [UnicodeEmojiCategory] {
|
||||
let handle = try FileHandle(forReadingFrom: fileUrl)
|
||||
var currentGroup: UnicodeEmojiCategory.Name = .activities
|
||||
var emojisByGroup: [UnicodeEmojiCategory.Name: [String]] = [:]
|
||||
var emojisByGroup: [UnicodeEmojiCategory.Name: OrderedDictionary<String, Emoji>] = [:]
|
||||
|
||||
for try await line in handle.bytes.lines {
|
||||
|
||||
@@ -38,7 +58,7 @@ class UnicodeParser {
|
||||
continue
|
||||
}
|
||||
currentGroup = category
|
||||
emojisByGroup[category] = []
|
||||
emojisByGroup[category] = [:]
|
||||
}
|
||||
|
||||
/// Split line into list of entries
|
||||
@@ -67,13 +87,24 @@ class UnicodeParser {
|
||||
if hexComponents.count > 1 {
|
||||
let multiHexEmoji = hexComponents.compactMap({ $0.asEmoji() }).joined()
|
||||
|
||||
|
||||
if multiHexEmoji.isEmpty == false {
|
||||
emojisByGroup[currentGroup]?.append(multiHexEmoji)
|
||||
if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: multiHexEmoji)] {
|
||||
if mapLookup.keywords.isEmpty == true {
|
||||
print("Could not find keywords in emojis map for multiHex: \(multiHexEmoji)\n")
|
||||
}
|
||||
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: mapLookup.keywords)
|
||||
} else {
|
||||
print("Could not find in emojis map at all for multiHex: \(multiHexEmoji)\n")
|
||||
emojisByGroup[currentGroup]?[multiHexEmoji] = Emoji(value: multiHexEmoji, keywords: [])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if let unicode = hexString.asEmoji(), unicode.isEmpty == false {
|
||||
emojisByGroup[currentGroup]?.append(unicode)
|
||||
if let mapLookup = emojisMap[makeEmojiUnqualified(emoji: unicode)] {
|
||||
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: mapLookup.keywords)
|
||||
} else {
|
||||
emojisByGroup[currentGroup]?[unicode] = Emoji(value: unicode, keywords: [])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -82,11 +113,25 @@ class UnicodeParser {
|
||||
var result: [UnicodeEmojiCategory] = []
|
||||
|
||||
for category in UnicodeEmojiCategory.Name.allCases {
|
||||
result.append(UnicodeEmojiCategory(name: category, values: emojisByGroup[category] ?? []))
|
||||
result.append(UnicodeEmojiCategory(name: category, emojis: emojisByGroup[category] ?? OrderedDictionary<String, Emoji>()))
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
/// Returns `emoji` with every variation selector U+FE0F removed.
///
/// All other Unicode scalars are kept in their original order.
///
/// - Parameter emoji: The emoji string to strip.
/// - Returns: The same scalar sequence without any U+FE0F.
func makeEmojiUnqualified(emoji: String) -> String {
    let variationSelector16: Unicode.Scalar = "\u{FE0F}"
    let keptScalars: String.UnicodeScalarView = emoji.unicodeScalars.filter { $0 != variationSelector16 }
    return String(keptScalars)
}
|
||||
|
||||
func parseCountHTML(for url: URL) -> [UnicodeEmojiCategory.Name: Int] {
|
||||
do {
|
||||
let html = try String(contentsOf: url)
|
||||
|
||||
Reference in New Issue
Block a user