nostrdb/parser: handle period at end of url
Fix URL parsing when a period appears at the end of a URL by disallowing it as the final character of the URL. Some characters are not allowed to be the last character of a URL. Presently, the disallowed characters are the period and the comma (see char_disallowed_at_end_url()). A character is the last character in the URL if the character that follows it satisfies is_whitespace() or if it is the last character in the string. Signed-off-by: kernelkind <kernelkind@gmail.com> Tested-by: William Casarin <jb55@jb55.com> Signed-off-by: William Casarin <jb55@jb5.com> Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
committed by
Daniel D’Aquino
parent
d73422db38
commit
6f9bd6c4f4
@@ -363,6 +363,53 @@ fail:
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Report whether the byte following `cur` ends the current token.
//
// cur: position of the byte being examined.
// end: one past the last byte of the input.
//
// Returns 0 when `cur` is at or beyond `end`, 1 when `cur` is the last byte
// of the input (running out of input is treated like whitespace), and
// otherwise the result of is_whitespace() on the following byte.
static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end)
{
	// Check the bound before touching cur + 1, so no pointer beyond
	// one-past-the-end is ever formed.
	if (cur >= end)
		return 0;

	// `cur` is the final byte: there is no next byte to inspect.
	if (end - cur == 1)
		return 1;

	return is_whitespace(cur[1]);
}
|
||||||
|
|
||||||
|
// Return 1 if `c` may not be the last character of a URL, 0 otherwise.
// The disallowed trailing characters are the period and the comma.
static inline int char_disallowed_at_end_url(char c)
{
	return c == '.' || c == ',';
}
|
||||||
|
|
||||||
|
// Decide whether the byte at `cur` terminates the URL being scanned, i.e.
// whether it should NOT be consumed as part of the URL.
//
// cur: position of the byte being examined; it is dereferenced, so it must
//      be below `end` (the visible callers only call this inside
//      `cur->p < cur->end` loops).
// end: one past the last byte of the input.
//
// Returns non-zero when `*cur` is whitespace, or when `*cur` is a character
// that may not end a URL (see char_disallowed_at_end_url) and is directly
// followed by whitespace or the end of the input. Returns 0 otherwise.
//
// Only the single following byte is inspected, so in a run of trailing
// punctuation such as ".." only the last character is treated as final.
static int is_final_url_char(unsigned char *cur, unsigned char *end)
{
	// whitespace always ends the url
	if (is_whitespace(*cur))
		return 1;

	// any other char can only end the url if it is trailing punctuation,
	// so skip the lookahead for everything else
	if (!char_disallowed_at_end_url(*cur))
		return 0;

	// disallowed char: it is final only if nothing but whitespace or the
	// end of the input follows it
	return next_char_is_whitespace(cur, end) != 0;
}
|
||||||
|
|
||||||
|
static int consume_until_end_url(struct cursor *cur, int or_end) {
|
||||||
|
unsigned char *start = cur->p;
|
||||||
|
|
||||||
|
while (cur->p < cur->end) {
|
||||||
|
if (is_final_url_char(cur->p, cur->end))
|
||||||
|
return cur->p != start;
|
||||||
|
|
||||||
|
cur->p++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return or_end;
|
||||||
|
}
|
||||||
|
|
||||||
static int consume_url_fragment(struct cursor *cur)
|
static int consume_url_fragment(struct cursor *cur)
|
||||||
{
|
{
|
||||||
int c;
|
int c;
|
||||||
@@ -376,7 +423,7 @@ static int consume_url_fragment(struct cursor *cur)
|
|||||||
|
|
||||||
cur->p++;
|
cur->p++;
|
||||||
|
|
||||||
return consume_until_whitespace(cur, 1);
|
return consume_until_end_url(cur, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int consume_url_path(struct cursor *cur)
|
static int consume_url_path(struct cursor *cur)
|
||||||
@@ -393,7 +440,7 @@ static int consume_url_path(struct cursor *cur)
|
|||||||
while (cur->p < cur->end) {
|
while (cur->p < cur->end) {
|
||||||
c = *cur->p;
|
c = *cur->p;
|
||||||
|
|
||||||
if (c == '?' || c == '#' || is_whitespace(c)) {
|
if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -411,7 +458,7 @@ static int consume_url_host(struct cursor *cur)
|
|||||||
while (cur->p < cur->end) {
|
while (cur->p < cur->end) {
|
||||||
c = *cur->p;
|
c = *cur->p;
|
||||||
// TODO: handle IDNs
|
// TODO: handle IDNs
|
||||||
if (is_alphanumeric(c) || c == '.' || c == '-')
|
if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
|
||||||
{
|
{
|
||||||
count++;
|
count++;
|
||||||
cur->p++;
|
cur->p++;
|
||||||
|
|||||||
Reference in New Issue
Block a user