Rewrite note parsing in C

This eliminates any parsing choppiness. Fixes: #32 Signed-off-by: William Casarin <jb55@jb55.com>
2022-10-17 15:22:28 -07:00
parent 277ead6efc
commit eb99e6c323
10 changed files with 657 additions and 7 deletions
@@ -0,0 +1,5 @@
+//
+//  Use this file to import your target's public headers that you would like to expose to Swift.
+//
+
+#include "damus.h"
@@ -0,0 +1,257 @@
+//
+//  damus.c
+//  damus
+//
+//  Created by William Casarin on 2022-10-17.
+//
+
+#include "damus.h"
+#include <stdlib.h>
+#include <string.h>
+
+typedef unsigned char u8;
+
+// Read position over a byte buffer. make_cursor() points `start` and `p` at
+// the first byte and `end` one past the last byte.
+struct cursor {
+    const u8 *p;      // current read position
+    const u8 *start;  // first byte of the buffer
+    const u8 *end;    // one past the last byte of the buffer
+};
+
+// Returns 1 when c is space, tab, newline, vertical tab, form feed or
+// carriage return; otherwise 0.
+static inline int is_whitespace(char c) {
+    switch (c) {
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\v':
+    case '\f':
+    case '\r':
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+// Point cursor `c` at the beginning of the `len`-byte buffer `content`.
+static void make_cursor(struct cursor *c, const u8 *content, size_t len)
+{
+    const u8 *one_past_last = content + len;
+
+    c->p = content;
+    c->start = content;
+    c->end = one_past_last;
+}
+
+// Advance cur->p up to, but not past, the next whitespace byte.
+// Returns 1 when whitespace was found; when the buffer ends first the
+// cursor is left at the end and `or_end` is returned.
+static int consume_until_whitespace(struct cursor *cur, int or_end) {
+    for (; cur->p < cur->end; cur->p++) {
+        char ch = *cur->p;
+
+        if (is_whitespace(ch))
+            return 1;
+    }
+
+    return or_end;
+}
+
+// Consume one byte if it equals `c`.
+// Returns 1 and advances the cursor on a match; returns 0 and leaves the
+// cursor untouched on a mismatch or at the end of the buffer.
+static int parse_char(struct cursor *cur, char c) {
+    int matched = cur->p < cur->end && *cur->p == c;
+
+    if (matched)
+        cur->p++;
+
+    return matched;
+}
+
+// Return the byte at offset `ind` from the current position without moving
+// the cursor, or -1 when that offset falls outside [start, end).
+static inline int peek_char(struct cursor *cur, int ind) {
+    // Compare offsets instead of forming cur->p + ind first: computing a
+    // pointer before `start` (e.g. ind == -1 at the very beginning) is
+    // undefined behavior even if it is never dereferenced.
+    if (ind < cur->start - cur->p || ind >= cur->end - cur->p)
+        return -1;
+
+    return cur->p[ind];
+}
+
+// Consume a single ASCII decimal digit.
+// On success stores its value (0-9) in *digit, advances the cursor and
+// returns 1; otherwise returns 0 and leaves both untouched.
+static int parse_digit(struct cursor *cur, int *digit) {
+    int ch = peek_char(cur, 0);
+
+    if (ch == -1)
+        return 0;
+
+    if (ch < '0' || ch > '9')
+        return 0;
+
+    *digit = ch - '0';
+    cur->p++;
+    return 1;
+}
+
+// Consume the literal `str` if the buffer continues with it.
+// Returns 1 and advances past the literal on a match; returns 0 and leaves
+// the cursor untouched otherwise.
+static int parse_str(struct cursor *cur, const char *str) {
+    size_t len = strlen(str);
+
+    // Only reject when fewer than `len` bytes remain. The previous `>=`
+    // comparison also rejected a literal ending exactly at the end of the
+    // buffer. Comparing lengths also avoids forming a pointer past `end`.
+    if (cur->p > cur->end || (size_t)(cur->end - cur->p) < len)
+        return 0;
+
+    if (memcmp(cur->p, str, len) != 0)
+        return 0;
+
+    cur->p += len;
+    return 1;
+}
+
+// Parse a mention of the form "#[N]" where N is one to three decimal digits.
+// On success fills `block` as BLOCK_MENTION with the numeric index and
+// returns 1. On failure the cursor is restored and 0 is returned.
+static int parse_mention(struct cursor *cur, struct block *block) {
+    const u8 *start = cur->p;
+    int digit, ndigits = 0, ind = 0;
+
+    if (!parse_str(cur, "#["))
+        return 0;
+
+    // Accumulate at most three digits, most significant first.
+    while (ndigits < 3 && parse_digit(cur, &digit)) {
+        ind = (ind * 10) + digit;
+        ndigits++;
+    }
+
+    // At least one digit and the closing bracket are required.
+    if (ndigits == 0 || !parse_char(cur, ']')) {
+        cur->p = start;
+        return 0;
+    }
+
+    block->type = BLOCK_MENTION;
+    block->block.mention = ind;
+
+    return 1;
+}
+
+// Parse a hashtag: '#' followed by at least one byte that is neither
+// whitespace nor another '#', extending up to the next whitespace or the end
+// of the buffer. On success fills `block` as BLOCK_HASHTAG with the range
+// after the '#' and returns 1; otherwise restores the cursor and returns 0.
+static int parse_hashtag(struct cursor *cur, struct block *block) {
+    const u8 *hash = cur->p;
+    int next;
+
+    if (!parse_char(cur, '#'))
+        return 0;
+
+    next = peek_char(cur, 0);
+    if (next == -1 || next == '#' || is_whitespace(next)) {
+        cur->p = hash;
+        return 0;
+    }
+
+    // Reaching the end of the buffer also terminates the tag.
+    consume_until_whitespace(cur, 1);
+
+    block->type = BLOCK_HASHTAG;
+    block->block.str.start = (const char*)(hash + 1);
+    block->block.str.end = (const char*)cur->p;
+
+    return 1;
+}
+
+// Append `block` to `blocks`.
+// Returns 1 on success, or 0 when there is no storage or all MAX_BLOCKS
+// slots are already in use.
+static int add_block(struct blocks *blocks, struct block block)
+{
+    // blocks_init() does not check malloc, so the array may be NULL.
+    if (blocks->blocks == NULL)
+        return 0;
+
+    // The array holds MAX_BLOCKS entries; the previous `+ 1 >=` test left
+    // the last slot permanently unused.
+    if (blocks->num_blocks >= MAX_BLOCKS)
+        return 0;
+
+    blocks->blocks[blocks->num_blocks++] = block;
+    return 1;
+}
+
+// Append a BLOCK_TEXT block covering the byte range [start, end).
+// Returns whatever add_block() returns.
+static int add_text_block(struct blocks *blocks, const u8 *start, const u8 *end)
+{
+    struct block text = {
+        .type = BLOCK_TEXT,
+        .block.str = {
+            .start = (const char*)start,
+            .end = (const char*)end,
+        },
+    };
+
+    return add_block(blocks, text);
+}
+
+// Parse a URL starting with "http://" or "https://" and running up to the
+// next whitespace or the end of the buffer. On success fills `block` as
+// BLOCK_URL covering the whole URL and returns 1; otherwise restores the
+// cursor and returns 0.
+static int parse_url(struct cursor *cur, struct block *block) {
+    const u8 *url_start = cur->p;
+
+    if (!parse_str(cur, "http"))
+        return 0;
+
+    // The 's' is optional; either way "://" must follow.
+    (void)parse_char(cur, 's');
+
+    if (!parse_str(cur, "://") || !consume_until_whitespace(cur, 1)) {
+        cur->p = url_start;
+        return 0;
+    }
+
+    block->type = BLOCK_URL;
+    block->block.str.start = (const char *)url_start;
+    block->block.str.end = (const char *)cur->p;
+
+    return 1;
+}
+
+// Split NUL-terminated `content` into text, mention, hashtag and URL blocks.
+//
+// Any previous contents of `blocks` are discarded. Mentions, hashtags and
+// URLs are only recognized at the start of the content or directly after
+// whitespace. Each recognized block is preceded by a text block holding the
+// bytes since the previous block (this text block may be empty). String
+// blocks point into `content`; nothing is copied.
+//
+// Returns 1 on success, 0 when a block could not be added.
+int damus_parse_content(struct blocks *blocks, const char *content) {
+    struct cursor cur;
+    struct block block;
+    const u8 *text_start, *token_start;
+    int prev, here, found;
+
+    blocks->num_blocks = 0;
+    make_cursor(&cur, (const u8*)content, strlen(content));
+
+    text_start = cur.p;
+    while (cur.p < cur.end && blocks->num_blocks < MAX_BLOCKS) {
+        prev = peek_char(&cur, -1);
+        here = peek_char(&cur, 0);
+
+        token_start = cur.p;
+        found = 0;
+
+        if (prev == -1 || is_whitespace(prev)) {
+            if (here == '#')
+                found = parse_mention(&cur, &block) || parse_hashtag(&cur, &block);
+            else if (here == 'h')
+                found = parse_url(&cur, &block);
+        }
+
+        if (!found) {
+            // Ordinary byte: it stays part of the pending text run.
+            cur.p++;
+            continue;
+        }
+
+        // Flush the text before the token, then the token itself.
+        if (!add_text_block(blocks, text_start, token_start))
+            return 0;
+
+        text_start = cur.p;
+
+        if (!add_block(blocks, block))
+            return 0;
+    }
+
+    // Trailing text after the last recognized block, if any.
+    if (cur.p - text_start > 0 && !add_text_block(blocks, text_start, cur.p))
+        return 0;
+
+    return 1;
+}
+
+// Allocate room for MAX_BLOCKS blocks and reset the block count.
+// The malloc result is not checked, so blocks->blocks is NULL if the
+// allocation fails.
+void blocks_init(struct blocks *blocks) {
+    blocks->num_blocks = 0;
+    blocks->blocks = malloc(MAX_BLOCKS * sizeof(struct block));
+}
+
+// Release the block array and leave `blocks` empty.
+// Safe to call more than once and on a structure whose allocation failed.
+void blocks_free(struct blocks *blocks) {
+    // free(NULL) is a no-op, so no guard is needed.
+    free(blocks->blocks);
+    // Clear the pointer so a repeated call cannot double-free.
+    blocks->blocks = NULL;
+    blocks->num_blocks = 0;
+}
@@ -0,0 +1,44 @@
+//
+//  damus.h
+//  damus
+//
+//  Created by William Casarin on 2022-10-17.
+//
+
+#ifndef damus_h
+#define damus_h
+
+#include <stdio.h>
+
+#define MAX_BLOCKS 1024
+
+// Discriminator for struct block; selects which member of its union is used.
+enum block_type {
+    BLOCK_HASHTAG = 1,  // str: hashtag text, without the leading '#'
+    BLOCK_TEXT = 2,     // str: plain text run
+    BLOCK_MENTION = 3,  // mention: numeric index parsed from "#[N]"
+    BLOCK_URL = 4,      // str: URL beginning with "http://" or "https://"
+};
+
+// A byte range [start, end) inside the parsed content; not NUL-terminated.
+typedef struct str_block {
+    const char *start;  // first byte of the range
+    const char *end;    // one past the last byte of the range
+} str_block_t;
+
+// One parsed piece of content, tagged by `type`.
+typedef struct block {
+    enum block_type type;
+    union {
+        struct str_block str;  // used by BLOCK_TEXT, BLOCK_HASHTAG and BLOCK_URL
+        int mention;           // used by BLOCK_MENTION
+    } block;
+} block_t;
+
+// Result of damus_parse_content(): the first num_blocks entries of `blocks`
+// are filled in.
+typedef struct blocks {
+    int num_blocks;        // number of entries currently stored
+    struct block *blocks;  // array allocated by blocks_init()
+} blocks_t;
+
+// Allocate storage for MAX_BLOCKS blocks and reset the block count to zero.
+void blocks_init(struct blocks *blocks);
+// Release the storage allocated by blocks_init().
+void blocks_free(struct blocks *blocks);
+// Split NUL-terminated `content` into text, mention, hashtag and URL blocks,
+// replacing any previous contents of `blocks`. String blocks hold pointers
+// into `content` rather than copies. Returns 1 on success, 0 if a block
+// could not be added.
+int damus_parse_content(struct blocks *blocks, const char *content);
+
+#endif /* damus_h */
@@ -0,0 +1,180 @@
+/* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */
+
+#include "utf8.h"
+#include <errno.h>
+#include <stdlib.h>
+
+/* I loved this table, so I stole it: */
+/*
+ * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
+ * <https://github.com/chansen/c-utf8-valid>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ *    UTF-8 Encoding Form
+ *
+ *    U+0000..U+007F       0xxxxxxx                <= 7 bits
+ *    U+0080..U+07FF       110xxxxx 10xxxxxx            <= 11 bits
+ *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx        <= 16 bits
+ *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    <= 21 bits
+ *
+ *
+ *    U+0000..U+007F       00..7F
+ *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
+ *    U+0080..U+07FF       C2..DF  80..BF
+ *                      N  E0      80..9F  80..BF           11100000 100xxxxx
+ *    U+0800..U+0FFF       E0      A0..BF  80..BF
+ *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
+ *    U+D000..U+D7FF       ED      80..9F  80..BF
+ *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
+ *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
+ *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
+ *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
+ *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
+ *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
+ *
+ *  Legend:
+ *    N = Non-shortest form
+ *    S = Surrogates
+ */
+/* Feed one byte of a UTF-8 stream into the decoder state.
+ *
+ * Returns false while more continuation bytes are needed.  Returns true once
+ * a sequence has been completed or rejected, with errno set to 0 (valid),
+ * EINVAL (bad lead/continuation byte or NUL), EFBIG (non-minimal encoding)
+ * or ERANGE (zero, surrogate, or above U+10FFFF).  A completed code point is
+ * left in utf8_state->c.
+ */
+bool utf8_decode(struct utf8_state *utf8_state, char c)
+{
+    /* used_len == total_len holds for a zeroed state and after every
+     * completed or rejected sequence, so this byte starts a new one. */
+    if (utf8_state->used_len == utf8_state->total_len) {
+        utf8_state->used_len = 1;
+        /* First character in sequence. */
+        if (((unsigned char)c & 0x80) == 0) {
+            /* ASCII, easy. */
+            if (c == 0)
+                goto bad_encoding;
+            utf8_state->total_len = 1;
+            utf8_state->c = c;
+            goto finished_decoding;
+        } else if (((unsigned char)c & 0xE0) == 0xC0) {
+            /* 110xxxxx: lead byte of a 2-byte sequence. */
+            utf8_state->total_len = 2;
+            utf8_state->c = ((unsigned char)c & 0x1F);
+            return false;
+        } else if (((unsigned char)c & 0xF0) == 0xE0) {
+            /* 1110xxxx: lead byte of a 3-byte sequence. */
+            utf8_state->total_len = 3;
+            utf8_state->c = ((unsigned char)c & 0x0F);
+            return false;
+        } else if (((unsigned char)c & 0xF8) == 0xF0) {
+            /* 11110xxx: lead byte of a 4-byte sequence. */
+            utf8_state->total_len = 4;
+            utf8_state->c = ((unsigned char)c & 0x07);
+            return false;
+        }
+        goto bad_encoding;
+    }
+
+    /* Inside a sequence: every further byte must look like 10xxxxxx. */
+    if (((unsigned char)c & 0xC0) != 0x80)
+        goto bad_encoding;
+
+    /* Append the six payload bits of this continuation byte. */
+    utf8_state->c <<= 6;
+    utf8_state->c |= ((unsigned char)c & 0x3F);
+    
+    utf8_state->used_len++;
+    if (utf8_state->used_len == utf8_state->total_len)
+        goto finished_decoding;
+    return false;
+
+finished_decoding:
+    /* ASCII NUL was rejected above, so zero here means a multi-byte
+     * encoding of U+0000. */
+    if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
+        errno = ERANGE;
+    /* The UTF-16 "surrogate range": illegal in UTF-8 */
+    else if (utf8_state->total_len == 3
+         && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
+        errno = ERANGE;
+    else {
+        /* A value that fits in min_bits bits would have fit in a shorter
+         * sequence, i.e. this encoding is not minimal. */
+        int min_bits;
+        switch (utf8_state->total_len) {
+        case 1:
+            min_bits = 0;
+            break;
+        case 2:
+            min_bits = 7;
+            break;
+        case 3:
+            min_bits = 11;
+            break;
+        case 4:
+            min_bits = 16;
+            break;
+        default:
+            abort();
+        }
+        if ((utf8_state->c >> min_bits) == 0)
+            errno = EFBIG;
+        else
+            errno = 0;
+    }
+    return true;
+
+bad_encoding:
+    /* Mark the state as complete so the next byte starts a fresh sequence. */
+    utf8_state->total_len = utf8_state->used_len;
+    errno = EINVAL;
+    return true;
+}
+
+/* Encode Unicode code point `point` as UTF-8 into `dest`.
+ *
+ * Returns the number of bytes written (1 to 4).  Returns 0 and sets errno to
+ * ERANGE for zero, the surrogate range U+D800..U+DFFF and values above
+ * U+10FFFF; `dest` is not written in that case.
+ */
+size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
+{
+    const int is_surrogate = point >= 0xD800 && point <= 0xDFFF;
+
+    if (point == 0 || is_surrogate || point > 0x10FFFF) {
+        errno = ERANGE;
+        return 0;
+    }
+
+    if (point < 0x80) {
+        /* 0xxxxxxx */
+        dest[0] = point;
+        return 1;
+    }
+
+    if (point < 0x800) {
+        /* 110xxxxx 10xxxxxx */
+        dest[0] = 0xC0 | (point >> 6);
+        dest[1] = 0x80 | (point & 0x3F);
+        return 2;
+    }
+
+    if (point < 0x10000) {
+        /* 1110xxxx 10xxxxxx 10xxxxxx */
+        dest[0] = 0xE0 | (point >> 12);
+        dest[1] = 0x80 | ((point >> 6) & 0x3F);
+        dest[2] = 0x80 | (point & 0x3F);
+        return 3;
+    }
+
+    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    dest[0] = 0xF0 | (point >> 18);
+    dest[1] = 0x80 | ((point >> 12) & 0x3F);
+    dest[2] = 0x80 | ((point >> 6) & 0x3F);
+    dest[3] = 0x80 | (point & 0x3F);
+    return 4;
+}
+
@@ -0,0 +1,54 @@
+/* MIT (BSD) license - see LICENSE file for details */
+#ifndef CCAN_UTF8_H
+#define CCAN_UTF8_H
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+/* Unicode is limited to 21 bits. */
+#define UTF8_MAX_LEN    4
+
+/* Incremental decoder state for utf8_decode(); start from all zeroes
+ * (UTF8_STATE_INIT or utf8_state_init()). */
+struct utf8_state {
+    /* How many bytes we are expecting as part of this Unicode point */
+    uint16_t total_len;
+    /* How many bytes of it we've already seen. */
+    uint16_t used_len;
+    /* Compound character, aka Unicode point. */
+    uint32_t c;
+};
+
+#define UTF8_STATE_INIT { 0, 0, 0 }
+
+/* Zero the decoder state so the next byte is treated as the start of a
+ * sequence. */
+static inline void utf8_state_init(struct utf8_state *utf8_state)
+{
+    utf8_state->total_len = 0;
+    utf8_state->used_len = 0;
+    utf8_state->c = 0;
+}
+
+/**
+ * utf8_decode - continue UTF8 decoding with this character.
+ * @utf8_state - initialized UTF8 state.
+ * @c - the character.
+ *
+ * Returns false if it needs another character to give results.
+ * Otherwise returns true, @utf8_state can be reused without initialization,
+ * and sets errno:
+ * 0: success
+ * EINVAL: bad encoding (including a NUL character).
+ * EFBIG: not a minimal encoding.
+ * ERANGE: encoding of invalid character.
+ *
+ * You can extract the character from @utf8_state->c; @utf8_state->used_len
+ * indicates how many characters have been consumed.
+ */
+bool utf8_decode(struct utf8_state *utf8_state, char c);
+
+/**
+ * utf8_encode - encode a point into UTF8.
+ * @point - Unicode point to include.
+ * @dest - buffer to fill.
+ *
+ * Returns 0 if point was invalid, otherwise bytes of dest used.
+ * Sets errno to ERANGE if point was invalid.
+ */
+size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);
+#endif /* CCAN_UTF8_H */