Fix issue with emojis next to hashtags and urls

Treat UTF-8 bytes next to hashtags and URLs as boundary characters

Changelog-Fixed: Fix issue with emojis next to hashtags and urls
This commit is contained in:
William Casarin
2023-07-16 11:00:48 -07:00
parent 8d14fdffb5
commit 77331644cb
2 changed files with 11 additions and 3 deletions

View File

@@ -431,10 +431,18 @@ static inline int is_whitespace(char c) {
return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
}
static inline int is_boundary(char c) {
/*
 * Tests bit 0x80 of c.
 *
 * Returns the masked value (non-zero when the bit is set, 0 otherwise).
 * NOTE(review): presumably u8 is an unsigned 8-bit type, making this a
 * "not 7-bit ASCII" check -- confirm against the typedef.
 */
static inline int is_utf8_byte(u8 c) {
	const int high_bit = c & 0x80;

	return high_bit;
}
/*
 * Returns non-zero when c is a whitespace character (per is_whitespace)
 * or punctuation as classified by ispunct().
 *
 * c is converted to unsigned char before being handed to ispunct():
 * passing a negative value other than EOF to a <ctype.h> function is
 * undefined behavior, and a plain char holding a byte >= 0x80 (e.g. part
 * of a UTF-8 sequence) is negative wherever char is signed.
 */
static inline int is_right_boundary(char c) {
	return is_whitespace(c) || ispunct((unsigned char)c);
}
/*
 * Returns 1 when c satisfies is_right_boundary() or is_utf8_byte(),
 * and 0 otherwise.
 */
static inline int is_left_boundary(char c) {
	if (is_right_boundary(c))
		return 1;

	return is_utf8_byte(c) ? 1 : 0;
}
/*
 * Returns 1 when c is one of the characters '!', '?', ')', '.', ',' or ';',
 * and 0 for every other character.
 */
static inline int is_invalid_url_ending(char c) {
	switch (c) {
	case '!':
	case '?':
	case ')':
	case '.':
	case ',':
	case ';':
		return 1;
	default:
		return 0;
	}
}
@@ -449,7 +457,7 @@ static inline int consume_until_boundary(struct cursor *cur) {
while (cur->p < cur->end) {
c = *cur->p;
if (is_boundary(c))
if (is_right_boundary(c))
return 1;
cur->p++;