Files
EmojiKit/Sources/EmojiSourceKit/UnicodeParser.swift
2023-06-16 17:04:40 +02:00

148 lines
4.7 KiB
Swift

//
// UnicodeParser.swift
//
//
// Created by Niklas Amslgruber on 12.06.23.
//
import Foundation
import SwiftSoup
import EmojiKit
/// Parses Unicode emoji source files: a `;`-separated emoji list (grouped by
/// `# group:` header lines) and an HTML table with per-category emoji counts.
class UnicodeParser {

    /// Prefixes that `isLine(_:ofType:)` matches against the start of a line or field.
    enum Tags: String {
        /// Any comment line.
        case comment = "#"
        /// A comment line that starts a new emoji group.
        case group = "# group:"
        /// Status field prefix `unqualified` (raw value equals the case name).
        case unqualified
        /// Status field prefix `minimally-qualified`.
        case minimallyQualified = "minimally-qualified"
    }

    /// Reads the emoji list at `fileUrl` line by line and collects the emojis of every group.
    ///
    /// Lines are expected to look like `<hex code points> ; <status> ...`, interleaved with
    /// `# group: <name>` headers (presumably Unicode's `emoji-test.txt` layout — confirm against
    /// the file that is actually fed in). Entries whose status starts with `unqualified` or
    /// `minimally-qualified` are dropped.
    ///
    /// - Parameter fileUrl: Location of the emoji list file.
    /// - Returns: One `EmojiCategory` per `EmojiCategory.Name` case, in `allCases` order;
    ///   categories without any parsed emoji carry an empty list.
    /// - Throws: Errors from opening the file or from reading its lines.
    func parseEmojiList(for fileUrl: URL) async throws -> [EmojiCategory] {
        let handle = try FileHandle(forReadingFrom: fileUrl)
        // Close on every exit path, including when reading a line throws.
        defer { try? handle.close() }

        // `nil` before the first recognised group header and after an unrecognised one,
        // so entries are never attributed to the wrong (previous) group.
        var currentGroup: EmojiCategory.Name?
        var emojisByGroup: [EmojiCategory.Name: [String]] = [:]

        for try await line in handle.bytes.lines {
            /// Group headers switch the current group and (re)start its emoji list
            if isLine(line, ofType: .group) {
                let categoryName = line.split(separator: ":").last?.trim() ?? ""
                currentGroup = EmojiCategory.Name(rawValue: categoryName)
                if let currentGroup {
                    emojisByGroup[currentGroup] = []
                }
                continue
            }

            /// Skip all other comments
            if isLine(line, ofType: .comment) {
                continue
            }

            /// Split line into hex string and status; both are required
            let lineComponents = line.split(separator: ";")
            guard lineComponents.count > 1, let group = currentGroup else {
                continue
            }
            let hexString = lineComponents[0].trim()
            let status = lineComponents[1].trim()

            /// Remove `unqualified` or `minimally-qualified` entries
            if isLine(status, ofType: .unqualified) || isLine(status, ofType: .minimallyQualified) {
                continue
            }

            /// Multi-hex emojis are converted per code point and joined
            let hexComponents = hexString.split(separator: " ")
            let emoji: String?
            if hexComponents.count > 1 {
                emoji = hexComponents.compactMap({ $0.asEmoji() }).joined()
            } else {
                emoji = hexString.asEmoji()
            }

            if let emoji, emoji.isEmpty == false {
                emojisByGroup[group]?.append(emoji)
            }
        }

        return EmojiCategory.Name.allCases.map { name in
            EmojiCategory(name: name, values: emojisByGroup[name] ?? [])
        }
    }

    /// Extracts the emoji count per category from the first `<table>` of the HTML file at `url`.
    ///
    /// The `th` cells of the first `tbody tr` row are read as category names and those of the
    /// last row as counts. The first and last cell of each row are ignored (presumably the row
    /// label and the grand total — confirm against the actual page). Cells are paired by
    /// column; a column is skipped when its name is not an `EmojiCategory.Name` raw value or
    /// its count is not an integer, so one bad cell can neither shift the remaining counts
    /// nor cause an out-of-range access.
    ///
    /// - Parameter url: Location of the HTML file.
    /// - Returns: Count per recognised category; empty when the file cannot be read or parsed,
    ///   or when it contains no table.
    func parseCountHTML(for url: URL) -> [EmojiCategory.Name: Int] {
        do {
            let html = try String(contentsOf: url)
            let doc: Document = try SwiftSoup.parse(html)

            guard let table = try doc.select("table").first() else {
                return [:]
            }

            let rows: Elements = try table.select("tbody tr")
            let categories = rows.first
            let totals = rows.last

            guard let categories, let totals, let categoryEntries = try? categories.select("th"), let countEntries = try? totals.select("th") else {
                return [:]
            }

            // Drop the leading and trailing cell of each row, then walk both rows in lockstep.
            let categoryCells = Array(categoryEntries).dropFirst().dropLast()
            let countCells = Array(countEntries).dropFirst().dropLast()

            var result: [EmojiCategory.Name: Int] = [:]
            for (categoryCell, countCell) in zip(categoryCells, countCells) {
                guard let name = try? categoryCell.text(),
                      let category = EmojiCategory.Name(rawValue: name),
                      let countText = try? countCell.text(),
                      let count = Int(countText) else {
                    continue
                }
                result[category] = count
            }
            return result
        } catch {
            print("Error parsing HTML: \(error)")
            return [:]
        }
    }

    /// Returns whether `line` begins with the raw value of `type`.
    private func isLine(_ line: String, ofType type: Tags) -> Bool {
        return line.starts(with: type.rawValue)
    }
}