Rewrite note parsing in C

This eliminates any parsing choppiness

Fixes: #32
Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
William Casarin
2022-10-17 15:20:38 -07:00
parent 277ead6efc
commit eb99e6c323
10 changed files with 657 additions and 7 deletions

View File

@@ -0,0 +1,5 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#include "damus.h"

257
damus-c/damus.c Normal file
View File

@@ -0,0 +1,257 @@
//
// damus.c
// damus
//
// Created by William Casarin on 2022-10-17.
//
#include "damus.h"
#include <stdlib.h>
#include <string.h>
/* Raw byte type used for all cursor positions in this file. */
typedef unsigned char u8;
/* Read position inside a byte buffer; filled in by make_cursor(). */
struct cursor {
/* current read position, advanced by the parse_* and consume_* helpers */
const u8 *p;
/* first byte of the buffer */
const u8 *start;
/* one past the last byte of the buffer */
const u8 *end;
};
/* Returns 1 when c is a space, \t, \n, \v, \f or \r; 0 for anything else. */
static inline int is_whitespace(char c) {
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
	case '\v':
	case '\f':
	case '\r':
		return 1;
	default:
		return 0;
	}
}
/*
 * Point cursor c at the len-byte buffer `content`: the read position and
 * start both refer to its first byte, end to one past its last byte.
 */
static void make_cursor(struct cursor *c, const u8 *content, size_t len)
{
	c->p = content;
	c->start = content;
	c->end = content + len;
}
/*
 * Advance the cursor up to (but not past) the next whitespace byte.
 *
 * Returns 1 when a whitespace byte was found. If the end of the buffer is
 * reached first, the cursor is left at the end and `or_end` is returned.
 */
static int consume_until_whitespace(struct cursor *cur, int or_end) {
	for (; cur->p < cur->end; cur->p++) {
		if (is_whitespace(*cur->p))
			return 1;
	}
	return or_end;
}
/*
 * Consume one byte if it equals c.
 * Returns 1 and advances the cursor on a match; returns 0 and leaves the
 * cursor untouched otherwise (including when already at the end).
 */
static int parse_char(struct cursor *cur, char c) {
	if (cur->p < cur->end && *cur->p == c) {
		cur->p++;
		return 1;
	}
	return 0;
}
/*
 * Look at the byte `ind` positions away from the current position without
 * moving the cursor. `ind` may be negative to look backwards.
 *
 * Returns the byte value (0..255), or -1 when the requested position lies
 * outside [start, end).
 */
static inline int peek_char(struct cursor *cur, int ind) {
	/*
	 * Compare distances rather than computing cur->p + ind first: forming a
	 * pointer outside the buffer (e.g. p - 1 when p == start) is undefined
	 * behaviour even if it is never dereferenced.
	 */
	if ((cur->start - cur->p > ind) || (cur->end - cur->p <= ind))
		return -1;
	return cur->p[ind];
}
/*
 * Consume one ASCII decimal digit.
 * On success stores its value (0..9) in *digit, advances the cursor and
 * returns 1. Otherwise returns 0 and leaves both *digit and the cursor alone.
 */
static int parse_digit(struct cursor *cur, int *digit) {
	int ch = peek_char(cur, 0);

	/* peek_char's -1 end marker is below '0', so this also covers "at end" */
	if (ch < '0' || ch > '9')
		return 0;

	*digit = ch - '0';
	cur->p++;
	return 1;
}
/*
 * Consume the literal string `str` at the current position.
 *
 * Returns 1 and advances the cursor past the literal when the next
 * strlen(str) bytes match it; returns 0 and leaves the cursor untouched
 * otherwise. A literal that ends exactly at the end of the buffer matches.
 */
static int parse_str(struct cursor *cur, const char *str) {
	size_t len = strlen(str);
	size_t remaining = (size_t)(cur->end - cur->p);

	/* compare lengths, not pointers: avoids forming p + len past the buffer */
	if (len > remaining)
		return 0;

	if (memcmp(cur->p, str, len) != 0)
		return 0;

	cur->p += len;
	return 1;
}
/*
 * Parse a mention of the form "#[N]" where N is one to three decimal digits.
 *
 * On success fills `block` with type BLOCK_MENTION and the numeric index,
 * leaves the cursor just after the ']' and returns 1. On failure the cursor
 * is restored to where it was on entry and 0 is returned.
 */
static int parse_mention(struct cursor *cur, struct block *block) {
	const u8 *rewind = cur->p;
	int index, digit, ndigits;

	if (!parse_str(cur, "#["))
		return 0;

	/* at least one digit is required */
	if (!parse_digit(cur, &index))
		goto no_match;

	/* up to two more digits, most significant first */
	for (ndigits = 1; ndigits < 3 && parse_digit(cur, &digit); ndigits++)
		index = (index * 10) + digit;

	if (!parse_char(cur, ']'))
		goto no_match;

	block->type = BLOCK_MENTION;
	block->block.mention = index;
	return 1;

no_match:
	cur->p = rewind;
	return 0;
}
/*
 * Parse a hashtag: a '#' followed by at least one byte that is neither
 * whitespace nor another '#', running up to the next whitespace or the end.
 *
 * On success fills `block` with type BLOCK_HASHTAG and the byte range of the
 * tag text (the leading '#' is excluded) and returns 1. On failure the cursor
 * is restored and 0 is returned.
 */
static int parse_hashtag(struct cursor *cur, struct block *block) {
	const u8 *hash = cur->p;
	int next;

	if (!parse_char(cur, '#'))
		return 0;

	next = peek_char(cur, 0);
	if (next == -1 || next == '#' || is_whitespace(next)) {
		cur->p = hash;
		return 0;
	}

	consume_until_whitespace(cur, 1);

	block->type = BLOCK_HASHTAG;
	block->block.str.start = (const char *)(hash + 1);
	block->block.str.end = (const char *)cur->p;
	return 1;
}
/*
 * Append `block` to `blocks`.
 *
 * Returns 1 on success. Returns 0 without modifying anything when the array
 * was never allocated (blocks_init() does not check its malloc) or when all
 * MAX_BLOCKS slots are already in use.
 */
static int add_block(struct blocks *blocks, struct block block)
{
	if (blocks->blocks == NULL)
		return 0;

	/* slots 0..MAX_BLOCKS-1 are all usable */
	if (blocks->num_blocks >= MAX_BLOCKS)
		return 0;

	blocks->blocks[blocks->num_blocks++] = block;
	return 1;
}
/*
 * Append a BLOCK_TEXT block covering the bytes [start, end).
 *
 * An empty range is skipped and reported as success, so that a match which
 * begins exactly where the previous block ended does not produce a
 * zero-length text block or use up a slot.
 *
 * Returns 1 on success (or when skipped), 0 if the block could not be added.
 */
static int add_text_block(struct blocks *blocks, const u8 *start, const u8 *end)
{
	struct block b;

	if (start == end)
		return 1;

	b.type = BLOCK_TEXT;
	b.block.str.start = (const char *)start;
	b.block.str.end = (const char *)end;
	return add_block(blocks, b);
}
/*
 * Parse a URL: "http://" or "https://" followed by everything up to the next
 * whitespace byte or the end of the buffer.
 *
 * On success fills `block` with type BLOCK_URL and the byte range of the
 * whole URL (scheme included) and returns 1. On failure the cursor is
 * restored and 0 is returned.
 */
static int parse_url(struct cursor *cur, struct block *block) {
	const u8 *begin = cur->p;

	if (!parse_str(cur, "http"))
		return 0;

	/* the 's' is optional; either way "://" must come next */
	(void)parse_char(cur, 's');

	if (!parse_str(cur, "://") || !consume_until_whitespace(cur, 1)) {
		cur->p = begin;
		return 0;
	}

	block->type = BLOCK_URL;
	block->block.str.start = (const char *)begin;
	block->block.str.end = (const char *)cur->p;
	return 1;
}
int damus_parse_content(struct blocks *blocks, const char *content) {
int cp, c;
struct cursor cur;
struct block block;
const u8 *start, *pre_mention;
blocks->num_blocks = 0;
make_cursor(&cur, (const u8*)content, strlen(content));
start = cur.p;
while (cur.p < cur.end && blocks->num_blocks < MAX_BLOCKS) {
cp = peek_char(&cur, -1);
c = peek_char(&cur, 0);
pre_mention = cur.p;
if (cp == -1 || is_whitespace(cp)) {
if (c == '#' && (parse_mention(&cur, &block) || parse_hashtag(&cur, &block))) {
if (!add_text_block(blocks, start, pre_mention))
return 0;
start = cur.p;
if (!add_block(blocks, block))
return 0;
continue;
} else if (c == 'h' && parse_url(&cur, &block)) {
if (!add_text_block(blocks, start, pre_mention))
return 0;
start = cur.p;
if (!add_block(blocks, block))
return 0;
continue;
}
}
cur.p++;
}
if (cur.p - start > 0) {
if (!add_text_block(blocks, start, cur.p))
return 0;
}
return 1;
}
/*
 * Prepare `blocks` for use: allocate room for MAX_BLOCKS entries and set the
 * count to zero. The malloc result is stored without being checked, so
 * blocks->blocks is NULL if the allocation failed.
 */
void blocks_init(struct blocks *blocks) {
	blocks->num_blocks = 0;
	blocks->blocks = malloc(MAX_BLOCKS * sizeof(struct block));
}
/*
 * Release the array allocated by blocks_init() and reset `blocks` to an
 * empty state. The pointer is cleared afterwards, so calling blocks_free()
 * again on the same struct is harmless instead of a double free.
 */
void blocks_free(struct blocks *blocks) {
	/* free(NULL) is a no-op, so no guard is needed */
	free(blocks->blocks);
	blocks->blocks = NULL;
	blocks->num_blocks = 0;
}

44
damus-c/damus.h Normal file
View File

@@ -0,0 +1,44 @@
//
// damus.h
// damus
//
// Created by William Casarin on 2022-10-17.
//
#ifndef damus_h
#define damus_h
#include <stdio.h>
/* Number of struct block entries that blocks_init() allocates room for. */
#define MAX_BLOCKS 1024
enum block_type {
BLOCK_HASHTAG = 1,
BLOCK_TEXT = 2,
BLOCK_MENTION = 3,
BLOCK_URL = 4,
};
/*
 * A byte range inside the content string handed to damus_parse_content().
 * The bytes are not copied and the range is not NUL-terminated.
 */
typedef struct str_block {
/* first byte of the range */
const char *start;
/* one past the last byte of the range */
const char *end;
} str_block_t;
/* One parsed piece of note content. */
typedef struct block {
/* which kind of block this is; determines the valid union member below */
enum block_type type;
union {
/* byte range, used by BLOCK_TEXT, BLOCK_HASHTAG and BLOCK_URL */
struct str_block str;
/* number parsed from "#[N]", used by BLOCK_MENTION */
int mention;
} block;
} block_t;
/* Result buffer for damus_parse_content(). */
typedef struct blocks {
/* number of entries of `blocks` currently filled in */
int num_blocks;
/* array allocated by blocks_init() and released by blocks_free() */
struct block *blocks;
} blocks_t;
/* Allocate the block array (MAX_BLOCKS entries) and set num_blocks to 0. */
void blocks_init(struct blocks *blocks);
/* Release the block array allocated by blocks_init(). */
void blocks_free(struct blocks *blocks);
/*
 * Parse the NUL-terminated string `content` into text, mention, hashtag and
 * URL blocks stored in `blocks`; num_blocks is reset to 0 first. String
 * ranges in the resulting blocks point into `content`.
 * Returns 1 on success, 0 if a block could not be added.
 */
int damus_parse_content(struct blocks *blocks, const char *content);
#endif /* damus_h */

180
damus-c/utf8.c Normal file
View File

@@ -0,0 +1,180 @@
/* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */
#include "utf8.h"
#include <errno.h>
#include <stdlib.h>
/* I loved this table, so I stole it: */
/*
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
* <https://github.com/chansen/c-utf8-valid>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* UTF-8 Encoding Form
*
* U+0000..U+007F 0xxxxxxx <= 7 bits
* U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits
* U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits
* U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
*
*
* U+0000..U+007F 00..7F
* N C0..C1 80..BF 1100000x 10xxxxxx
* U+0080..U+07FF C2..DF 80..BF
* N E0 80..9F 80..BF 11100000 100xxxxx
* U+0800..U+0FFF E0 A0..BF 80..BF
* U+1000..U+CFFF E1..EC 80..BF 80..BF
* U+D000..U+D7FF ED 80..9F 80..BF
* S ED A0..BF 80..BF 11101101 101xxxxx
* U+E000..U+FFFF EE..EF 80..BF 80..BF
* N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
*
* Legend:
* N = Non-shortest form
* S = Surrogates
*/
/*
 * Feed one byte `c` into the incremental UTF-8 decoder state.
 *
 * Returns false while more bytes of the current sequence are needed.
 * Returns true once a sequence has ended, with errno set to 0 on success,
 * EINVAL for a malformed byte, EFBIG for a non-minimal encoding, or ERANGE
 * for an invalid code point. The decoded value is left in utf8_state->c.
 */
bool utf8_decode(struct utf8_state *utf8_state, char c)
{
/* used_len == total_len means no sequence is in progress (also true for a zeroed state) */
if (utf8_state->used_len == utf8_state->total_len) {
utf8_state->used_len = 1;
/* First character in sequence. */
if (((unsigned char)c & 0x80) == 0) {
/* ASCII, easy. */
if (c == 0)
goto bad_encoding;
utf8_state->total_len = 1;
utf8_state->c = c;
goto finished_decoding;
} else if (((unsigned char)c & 0xE0) == 0xC0) {
/* 110xxxxx: lead byte of a 2-byte sequence, keep the low 5 bits */
utf8_state->total_len = 2;
utf8_state->c = ((unsigned char)c & 0x1F);
return false;
} else if (((unsigned char)c & 0xF0) == 0xE0) {
/* 1110xxxx: lead byte of a 3-byte sequence, keep the low 4 bits */
utf8_state->total_len = 3;
utf8_state->c = ((unsigned char)c & 0x0F);
return false;
} else if (((unsigned char)c & 0xF8) == 0xF0) {
/* 11110xxx: lead byte of a 4-byte sequence, keep the low 3 bits */
utf8_state->total_len = 4;
utf8_state->c = ((unsigned char)c & 0x07);
return false;
}
/* anything else (a stray continuation byte or 11111xxx) cannot start a sequence */
goto bad_encoding;
}
/* Inside a sequence: the byte must be a continuation byte, 10xxxxxx */
if (((unsigned char)c & 0xC0) != 0x80)
goto bad_encoding;
/* append its 6 payload bits to the value built so far */
utf8_state->c <<= 6;
utf8_state->c |= ((unsigned char)c & 0x3F);
utf8_state->used_len++;
if (utf8_state->used_len == utf8_state->total_len)
goto finished_decoding;
return false;
finished_decoding:
/* A full sequence was read; classify the resulting value via errno. */
if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
errno = ERANGE;
/* The UTF-16 "surrogate range": illegal in UTF-8 */
else if (utf8_state->total_len == 3
&& (utf8_state->c & 0xFFFFF800) == 0x0000D800)
errno = ERANGE;
else {
/* min_bits: a value with no bits set at or above this position would have fit in a shorter sequence */
int min_bits;
switch (utf8_state->total_len) {
case 1:
min_bits = 0;
break;
case 2:
min_bits = 7;
break;
case 3:
min_bits = 11;
break;
case 4:
min_bits = 16;
break;
default:
abort();
}
if ((utf8_state->c >> min_bits) == 0)
errno = EFBIG;
else
errno = 0;
}
return true;
bad_encoding:
/* make used_len == total_len so the next call starts a fresh sequence */
utf8_state->total_len = utf8_state->used_len;
errno = EINVAL;
return true;
}
/*
 * Write the UTF-8 encoding of code point `point` into `dest`.
 *
 * Returns the number of bytes written (1 to 4). For U+0000, a surrogate
 * (U+D800..U+DFFF) or a value above U+10FFFF, nothing is written, errno is
 * set to ERANGE and 0 is returned. errno is left untouched on success.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
{
	int is_surrogate = (point >= 0xD800 && point <= 0xDFFF);

	if (point == 0 || is_surrogate || point > 0x10FFFF) {
		errno = ERANGE;
		return 0;
	}

	if (point < 0x80) {
		/* 0xxxxxxx */
		dest[0] = point;
		return 1;
	}

	if (point < 0x800) {
		/* 110xxxxx 10xxxxxx */
		dest[0] = 0xC0 | (point >> 6);
		dest[1] = 0x80 | (point & 0x3F);
		return 2;
	}

	if (point < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		dest[0] = 0xE0 | (point >> 12);
		dest[1] = 0x80 | ((point >> 6) & 0x3F);
		dest[2] = 0x80 | (point & 0x3F);
		return 3;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	dest[0] = 0xF0 | (point >> 18);
	dest[1] = 0x80 | ((point >> 12) & 0x3F);
	dest[2] = 0x80 | ((point >> 6) & 0x3F);
	dest[3] = 0x80 | (point & 0x3F);
	return 4;
}

54
damus-c/utf8.h Normal file
View File

@@ -0,0 +1,54 @@
/* MIT (BSD) license - see LICENSE file for details */
#ifndef CCAN_UTF8_H
#define CCAN_UTF8_H
#include <inttypes.h>
#include <stdbool.h>
#include <string.h>
/* Unicode is limited to 21 bits. */
#define UTF8_MAX_LEN 4
/* Incremental decoder state carried between utf8_decode() calls. */
struct utf8_state {
/* How many bytes we are expecting as part of this Unicode point */
uint16_t total_len;
/* How many bytes of it we've already seen. */
uint16_t used_len;
/* Compound character, aka Unicode point. */
uint32_t c;
};
/* Static initializer giving the same all-zero state as utf8_state_init(). */
#define UTF8_STATE_INIT { 0, 0, 0 }
/*
 * Zero every byte of *utf8_state. This leaves used_len == total_len, which
 * utf8_decode() treats as "no sequence in progress".
 */
static inline void utf8_state_init(struct utf8_state *utf8_state)
{
	memset(utf8_state, 0, sizeof *utf8_state);
}
/**
 * utf8_decode - continue UTF-8 decoding with this byte.
 * @utf8_state - initialized UTF-8 state.
 * @c - the next input byte.
 *
 * Returns false if it needs another byte to give results.
 * Otherwise returns true, @utf8_state can be reused without initialization,
 * and sets errno:
 * 0: success
 * EINVAL: bad encoding (including a NUL character).
 * EFBIG: not a minimal encoding.
 * ERANGE: encoding of invalid character.
 *
 * You can extract the character from @utf8_state->c; @utf8_state->used_len
 * indicates how many bytes have been consumed.
 */
bool utf8_decode(struct utf8_state *utf8_state, char c);
/**
 * utf8_encode - encode a point into UTF-8.
 * @point - Unicode point to encode.
 * @dest - buffer to fill; at most UTF8_MAX_LEN bytes are written and no
 *         NUL terminator is added.
 *
 * Returns 0 if point was invalid (zero, a UTF-16 surrogate, or above
 * 0x10FFFF), otherwise bytes of dest used.
 * Sets errno to ERANGE if point was invalid; errno is not modified on success.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);
#endif /* CCAN_UTF8_H */