fts-tokenizer-address.c revision c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3
/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "str.h"
#include "buffer.h"
#include "fts-tokenizer-private.h"
/* Return not only our tokens, but also data for parent to process.*/
#define FTS_DEFAULT_HAVE_PARENT 1
/* Result/state values used by the address parsing code in this file.
   C requires an enum to declare at least one enumerator.
   EMAIL_ADDRESS_PARSER_STATE_NONE is the value returned on the
   "not a localpart" and "not a domain" paths further down.
   NOTE(review): the tokenizer's switch ends in default: i_unreached(),
   so additional states are presumably needed here - confirm and add them. */
enum email_address_parser_state {
	EMAIL_ADDRESS_PARSER_STATE_NONE = 0
};
/* Per-instance data of the email-address tokenizer. */
struct email_address_fts_tokenizer {
	/* Generic tokenizer part.
	   NOTE(review): this file casts a _tok pointer to
	   struct email_address_fts_tokenizer *; presumably _tok points at this
	   member, which would require it to stay first - confirm. */
	struct fts_tokenizer tokenizer;
	/* TODO: could be buffer_t maybe
	   NOTE(review): the member this TODO refers to is not present in
	   this struct - confirm. */
	/* Setting for stand-alone usage. Might be superfluous.
	   When > 0, raw input data is also buffered and handed back for the
	   parent to process. */
	unsigned int have_parent;
};
/*
Extracted from core rfc822-parser.c
atext = ALPHA / DIGIT / ; Any character except controls,
"!" / "#" / ; SP, and specials.
"$" / "%" / ; Used for atoms
"&" / "'" /
"*" / "+" /
"-" / "/" /
"=" / "?" /
"^" / "_" /
"`" / "{" /
"|" / "}" /
"~"
MIME:
token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
or tspecials>
tspecials := "(" / ")" / "<" / ">" / "@" /
"," / ";" / ":" / "\" / <">
"/" / "[" / "]" / "?" / "="
So a token is the same as a dot-atom, except that it also stops at '/', '?' and '='.
*/
/* Character classification table, indexed by byte value:
   0 = not an atext character,
   1 = atext chars (punctuation),
   2 = alpha and digits (every byte in 128-255 is marked 2 as well),
   4 = atext-but-mime-tspecials ('/', '=' and '?'). */
unsigned char rfc822_atext_chars[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0-15 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 4, /* 32-47 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 4, 0, 4, /* 48-63 */
0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, /* 80-95 */
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 96-111 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, /* 112-127 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 128-143 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 144-159 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 160-175 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 176-191 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 /* 240-255 */
};
/* True when the rfc822_atext_chars entry for byte c is nonzero, i.e. c
   carries any of the atext marks (1, 2 or 4). */
#define IS_ATEXT(c) (rfc822_atext_chars[(unsigned char)(c)] != 0)
/* True when the rfc822_atext_chars entry for byte c is exactly 2
   (alpha, digit, or a byte in the range 128-255). */
#define IS_DTEXT(c) (rfc822_atext_chars[(unsigned char)(c)] == 2)
/* Create an email-address tokenizer from the given settings.
   have_parent starts out as FTS_DEFAULT_HAVE_PARENT.
   Returns 0 at the end of the function and -1 on the two visible failure
   paths; one of them is paired with the message
   "Invalid parent setting: <value>".
   NOTE(review): tokenizer_r and error_r look like output parameters, and
   the second -1 path looks like the unknown-setting case - confirm. */
static int
fts_tokenizer_email_address_create(const char *const *settings,
struct fts_tokenizer **tokenizer_r,
const char **error_r)
{
struct email_address_fts_tokenizer *tok;
unsigned int have_parent = FTS_DEFAULT_HAVE_PARENT;
unsigned int i;
"Invalid parent setting: %s", value);
return -1;
}
} else {
return -1;
}
}
return 0;
}
{
struct email_address_fts_tokenizer *tok =
(struct email_address_fts_tokenizer *)_tok;
}
static const char *
{
}
static const char *
{
const char *ret;
return ret;
}
/* Used to skip forward past characters that cannot be the start of a new
localpart. Returns the size that can be skipped. */
{
const unsigned char *p = data;
/* Yes, a dot can start an address. De facto before de jure. */
skip++;
p++;
}
return skip;
}
/* TODO:
- Don't dereference *p past size!
*/
static enum email_address_parser_state
{
const unsigned char *p = data;
if (*p == '@')
pos++;
p++;
if (at)
break;
}
/* localpart and @ */
}
/* localpart, @ not included yet */
}
/* not a localpart. skip past rest of no-good chars. */
return EMAIL_ADDRESS_PARSER_STATE_NONE;
}
/* TODO: might be nice if error was -1, but that requires a _r
param */
{
const unsigned char *at;
const unsigned char *str;
return 0;
}
/* TODO:
- allow address literals
- reject "@..."
- reject "@.host.tld"
*/
static enum email_address_parser_state
{
const unsigned char *p = data;
pos++;
p++;
}
/* A complete domain name */
}
}
/* not a domain. skip past no-good chars. */
return EMAIL_ADDRESS_PARSER_STATE_NONE;
}
/* Buffer raw data for parent. */
static void
{
if (tok->have_parent > 0)
}
static const char *
{
struct email_address_fts_tokenizer *tok =
(struct email_address_fts_tokenizer *)_tok;
return fts_tokenizer_address_current_token(tok);
}
/* end of data, output lingering tokens. first the parent's data, then
possibly our token, if complete enough */
if (size == 0) {
return fts_tokenizer_address_parent_data(tok);
&& chars_after_at(tok) > 0)
return fts_tokenizer_address_current_token(tok);
}
/* 1) regular input data OR
2) circle around to return completed address */
/* no part of address found yet. remove possible
earlier data */
/* fall through */
/* last_word is empty or has the beginnings of a valid
local-part, but no '@' found yet. continue parsing
the beginning of data to see if it contains a full
local-part@ */
&local_skip);
pos += local_skip;
break;
/* last_word has a local-part@ and maybe the beginning
of a domain. continue parsing the beginning of data
to see if it contains a valid domain. */
&local_skip);
pos += local_skip;
break;
/* skip trailing non-atext */
if (tok->have_parent > 0)
return fts_tokenizer_address_parent_data(tok);
else {
return fts_tokenizer_address_current_token(tok);
}
default:
i_unreached();
}
}
return NULL;
}
/* Function table (vfuncs) for the email-address tokenizer.
   {0} is used because an empty initializer list is only valid from C23 on;
   the result is the same: every member is zero-initialized.
   NOTE(review): no members are set explicitly here - confirm which
   callbacks are meant to be installed. */
static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = {0};
/* The email-address tokenizer's struct fts_tokenizer instance.
   {0} is used because an empty initializer list is only valid from C23 on;
   the result is the same: every member is zero-initialized.
   NOTE(review): no members are set explicitly here - confirm what this
   instance is meant to contain. */
static const struct fts_tokenizer fts_tokenizer_email_address_real = {0};
const struct fts_tokenizer *fts_tokenizer_email_address =