test-fts-tokenizer.c revision 5c97732871842800816aea0215c56bf701f623a6
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2014-2017 Dovecot authors, see the included COPYING file */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen/*there should be a trailing space ' ' at the end of each string except the last one*/
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "Bar Baz <bar@example.org>" \
2bc963ea051ddacefe0fa5e26280e8ef853fd6c6Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
2bc963ea051ddacefe0fa5e26280e8ef853fd6c6Timo Sirainen "foo, foo@domain " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "trailing, period@blue.com. " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "multi-trialing, mul@trail.com..... " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hypen@hypen-hypen.com " \
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hypen@hypen-hypen-sick.com.-"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic const char *test_inputs[] = {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* generic things and word truncation: */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "abc@example.com, "
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "Bar Baz <bar@example.org>, "
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "foo@domain "
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789\xC3\xA4,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890\xC3\xA4,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "(\"Hello world\")3.14 3,14 last",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "'1234567890123456789012345678\xC3\xA4,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567x'\xC3\xA4,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x're,"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x',"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x'',"
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch "12345678901234567890123456789x',"
2bc963ea051ddacefe0fa5e26280e8ef853fd6c6Timo Sirainen "12345678901234567890123456789x'',"
f24edebe360d3effe584a884aa7d119daf3fd371Aki Tuomi "123456789012345678901234567890x',"
f24edebe360d3effe584a884aa7d119daf3fd371Aki Tuomi "123456789012345678901234567890x'',"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* \xE2\x80\x99 = U+2019 (right single quotation mark), a smart quote sometimes used as an apostrophe */
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch "\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch "you\xE2\x80\x99re\xE2\x80\x99xyz",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
2bc963ea051ddacefe0fa5e26280e8ef853fd6c6Timo Sirainen U+205A (e2 81 9a) and U+205F (e2 81 9f) */
a326f9da3c18a4ccfb28e72f87161eaf3624eaf2Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
a326f9da3c18a4ccfb28e72f87161eaf3624eaf2Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
2bc963ea051ddacefe0fa5e26280e8ef853fd6c6Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello world\xEF\xBC\x8E",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* TR29 WB5a */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
1e2b3bd82f2d4fbae0963f4a220df30b7b5ae628Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
60670187b0dd0e7f23f99a58feab11b862ad77acStephan Bosch test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic unsigned int
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const char *const *expected_output,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* test all input at once */
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch test_assert_strcmp(token, expected_output[outi]);
f24edebe360d3effe584a884aa7d119daf3fd371Aki Tuomi while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* test input one byte at a time */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen /* test input in random chunks */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const char *const *expected_output)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen unsigned int i, outi = 0;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen static const char *const expected_output[] = {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
/* NULL-terminated settings list pairing "algorithm" with "tr29"; passed to
   fts_tokenizer_create() together with fts_tokenizer_generic by the TR29 test. */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not a word break.
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen This definitely needs to be remapped. */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen static const char *const expected_output[] = {
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "1234567890123456789012345678\xC3\xA4",
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "12345678901234567890123456789",
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
/* NULL-terminated settings list pairing "algorithm" with "tr29" and "wb5a"
   with "yes"; passed to fts_tokenizer_create() together with
   fts_tokenizer_generic by the TR29 WB5a test. */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenconst char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
static void test_fts_tokenizer_generic_tr29_wb5a(void)
static const char *const expected_output[] = {
"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
const char *error;
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
test_end();
static void test_fts_tokenizer_address_only(void)
static const char *const expected_output[] = {
"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
const char *error;
test_end();
static const char *const expected_output[] = {
"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
const char *error;
test_end();
static void test_fts_tokenizer_address_parent_simple(void)
static void test_fts_tokenizer_address_parent_tr29(void)
static void test_fts_tokenizer_address_search(void)
static const char *const expected_output[] = {
"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
test_end();
static void test_fts_tokenizer_delete_trailing_partial_char(void)
const char *str;
unsigned int truncated_len;
} tests[] = {
test_end();
static void test_fts_tokenizer_address_maxlen(void)
test_end();
static void test_fts_tokenizer_random(void)
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
for (unsigned int j = 0; j < sizeof(addr); j++)
} T_END;
test_end();
int main(void)
static void (*const test_functions[])(void) = {
int ret;
return ret;