/* test-fts-tokenizer.c revision b7324e421e2132cbbf753e6fdbe675bbaecdf929 */
/* Copyright (c) 2014-2016 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "unichar.h"
#include "str.h"
#include "test-common.h"
#include "fts-tokenizer.h"
#include "fts-tokenizer-common.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
/* There should be a trailing space ' ' at the end of each string except the last one. */
/* Shared email-address corpus used by the address-tokenizer tests below.
   Covers: leading/trailing '@' (invalid), display names, an RFC 5322
   comment, a domain-less address, an overlong local part and domain chain,
   trailing period(s) after the domain, a one-letter local-part/domain,
   and hyphenated domains (including one ending ".-").
   NOTE(review): the "Bar Baz <bar@example.org>" line has NO trailing
   space, contradicting the comment above the macro -- confirm whether
   that is intentional test input before "fixing" it. */
#define TEST_INPUT_ADDRESS \
"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
"Bar Baz <bar@example.org>" \
"Foo Bar (comment)foo.bar@host.example.org " \
"foo, foo@domain " \
"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
"trailing, period@blue.com. " \
"multi-trialing, mul@trail.com..... " \
"m@s " \
"hypen@hypen-hypen.com " \
"hypen@hypen-hypen-sick.com.-"
/* Input buffers fed to the generic tokenizers.  Each array element is one
   independent input; the expected_output arrays in the tests below appear
   to use NULL entries as separators between inputs.  The byte escapes are
   UTF-8 sequences (see the inline comments); do not "fix" typos such as
   "galor\xC3\xA9" -- they are deliberate tokenizer fodder. */
static const char *test_inputs[] = {
/* generic things and word truncation: */
"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
"abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain "
"1234567890123456789012345678\xC3\xA4,"
"12345678901234567890123456789\xC3\xA4,"
"123456789012345678901234567890\xC3\xA4,"
"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
"(\"Hello world\")3.14 3,14 last",
"1.",
/* ASCII apostrophe handling at token edges and inside words: */
"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
"'1234567890123456789012345678\xC3\xA4,"
"123456789012345678901234567x'\xC3\xA4,"
"1234567890123456789012345678x're,"
"1234567890123456789012345678x',"
"1234567890123456789012345678x'',"
"12345678901234567890123456789x',"
"12345678901234567890123456789x'',"
"123456789012345678901234567890x',"
"123456789012345678901234567890x'',"
/* \xe28099 = U+2019 is a smart quote, sometimes used as an apostrophe */
"\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",
"you\xE2\x80\x99re\xE2\x80\x99xyz",
/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
U+205A(e2 81 9a) and U+205F(e2 81 9f) */
"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
/* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E */
"hello world\xEF\xBC\x8E",
/* TR29 WB5a */
"l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
};
/* Sanity test for the tokenizer registry lookup.
   NOTE(review): the body contains no assertions between test_begin() and
   test_end() -- the fts_tokenizer_find() checks appear to have been lost
   in this revision; restore from a known-good revision. */
static void test_fts_tokenizer_find(void)
{
test_begin("fts tokenizer find");
test_end();
}
/* NOTE(review): this helper is garbled in this revision -- the line(s)
   carrying the function name and its first parameter(s) are missing
   after "static unsigned int", and the bodies of the three feeding
   strategies ("all at once", "one byte at a time", "random chunks") have
   been lost; only the expected-output index (outi) bookkeeping and stray
   closing braces remain, so the file does not compile as-is.  Restore
   this function from a known-good revision before building.  Judging by
   the surviving parameters, it fed one input to a tokenizer and compared
   the emitted tokens against expected_output starting at first_outi,
   returning the index one past the consumed expectations. */
static unsigned int
const char *const *expected_output,
unsigned int first_outi)
{
/* test all input at once */
outi = first_outi;
outi++;
}
outi++;
}
/* test input one byte at a time */
outi = first_outi;
outi++;
}
}
outi++;
}
/* test input in random chunks */
outi = first_outi;
outi++;
}
}
outi++;
}
return outi+1;
}
/* NOTE(review): garbled in this revision -- the function name and first
   parameter line(s) after "static void" are missing, and the loop body
   that presumably ran each test_inputs[] element through the tokenizer
   is empty.  Restore from a known-good revision; as written this does
   not compile. */
static void
const char *const *expected_output)
{
unsigned int i, outi = 0;
for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
}
}
/* Run the generic tokenizer (simple, non-TR29 algorithm) over
   test_inputs[] and compare against expected_output; NULL entries appear
   to separate the expectations of consecutive inputs.
   NOTE(review): the fts_tokenizer_create()/run/destroy calls between
   test_begin() and test_end() have been lost in this revision --
   expected_output, tok and error are currently unused.  Restore from a
   known-good revision. */
static void test_fts_tokenizer_generic_only(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galor\xC3\xA9",
"abc", "example", "com", "Bar", "Baz",
"bar", "example", "org", "foo", "domain",
/* 30-byte token truncation boundary cases: */
"1234567890123456789012345678\xC3\xA4",
"12345678901234567890123456789",
"123456789012345678901234567890",
"and", "longlonglongabcdefghijklmnopqr",
"1", NULL,
"quoted", "text", "word", "hlo", "words", "you're", "bad",
"1234567890123456789012345678\xC3\xA4",
"123456789012345678901234567x'",
"1234567890123456789012345678x'",
"1234567890123456789012345678x",
"1234567890123456789012345678x",
"12345678901234567890123456789x",
"12345678901234567890123456789x",
"123456789012345678901234567890",
"123456789012345678901234567890",
"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
"you're'xyz", NULL,
"hello", "world", "And",
"there", "was", "text", "galore",
};
struct fts_tokenizer *tok;
const char *error;
test_begin("fts tokenizer generic simple");
test_end();
}
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
/* Same inputs as the simple-generic test, but tokenized with the TR29
   (UAX #29 word-boundary) algorithm; the expected token stream happens
   to be identical for these inputs.
   NOTE(review): as in the simple test, the tokenizer create/run calls
   between test_begin() and test_end() have been lost in this revision;
   expected_output, tok and error are unused.  Restore from a known-good
   revision. */
static void test_fts_tokenizer_generic_tr29_only(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galor\xC3\xA9",
"abc", "example", "com", "Bar", "Baz",
"bar", "example", "org", "foo", "domain",
"1234567890123456789012345678\xC3\xA4",
"12345678901234567890123456789",
"123456789012345678901234567890",
"and", "longlonglongabcdefghijklmnopqr",
"1", NULL,
"quoted", "text", "word", "hlo", "words", "you're", "bad",
"1234567890123456789012345678\xC3\xA4",
"123456789012345678901234567x'",
"1234567890123456789012345678x'",
"1234567890123456789012345678x",
"1234567890123456789012345678x",
"12345678901234567890123456789x",
"12345678901234567890123456789x",
"123456789012345678901234567890",
"123456789012345678901234567890",
"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
"you're'xyz", NULL,
"hello", "world", "And",
"there", "was", "text", "galore",
};
struct fts_tokenizer *tok;
const char *error;
test_begin("fts tokenizer generic TR29");
test_end();
}
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
/* TR29 tokenization with the WB5a extension enabled: a single letter
   followed by an apostrophe (ASCII or U+2019) and a letter is split, so
   French elisions like "l'homme" yield "l" + "homme" (see the last
   expected group).
   NOTE(review): only the fts_tokenizer_create() call survives between
   test_begin() and test_end(); the run/compare/destroy calls have been
   lost in this revision -- the created tokenizer leaks as written.
   Restore from a known-good revision. */
static void test_fts_tokenizer_generic_tr29_wb5a(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galor\xC3\xA9",
"abc", "example", "com", "Bar", "Baz",
"bar", "example", "org", "foo", "domain",
"1234567890123456789012345678\xC3\xA4",
"12345678901234567890123456789",
"123456789012345678901234567890",
"and", "longlonglongabcdefghijklmnopqr",
"1", NULL,
"quoted", "text", "word", "hlo", "words", "you're", "bad",
"1234567890123456789012345678\xC3\xA4",
"123456789012345678901234567x'",
"1234567890123456789012345678x'",
"1234567890123456789012345678x",
"1234567890123456789012345678x",
"12345678901234567890123456789x",
"12345678901234567890123456789x",
"123456789012345678901234567890",
"123456789012345678901234567890",
"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
"you're'xyz", NULL,
"hello", "world", "And",
"there", "was", "text", "galore",
"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
};
struct fts_tokenizer *tok;
const char *error;
test_begin("fts tokenizer generic TR29 with WB5a");
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
test_end();
}
/* Run the email-address tokenizer standalone (no parent tokenizer) over
   TEST_INPUT_ADDRESS: only complete, valid-looking addresses should be
   emitted -- no individual words, and trailing periods/hyphens stripped
   from the domain.
   NOTE(review): the tokenizer create/run/destroy calls between
   test_begin() and test_end() have been lost in this revision; input,
   expected_output, tok and error are unused.  Restore from a known-good
   revision. */
static void test_fts_tokenizer_address_only(void)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"abc.dfg@example.com", "bar@example.org",
"foo.bar@host.example.org", "foo@domain",
/* overlong address: domain truncated to the tokenizer's max length */
"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
"period@blue.com", /*trailing period '.' in email */
"mul@trail.com",
"m@s", /*one letter local-part and domain name */
"hypen@hypen-hypen.com",
"hypen@hypen-hypen-sick.com",
};
struct fts_tokenizer *tok;
const char *error;
test_begin("fts tokenizer email address only");
test_end();
}
/* NOTE(review): this function is garbled in this revision -- its
   signature line (name and parameters) is missing entirely; the body
   opens at file scope with a bare '{', so the file does not compile.
   Judging by the expected output (each address emitted both as its
   component words and as the full address token), this was the
   address-tokenizer-with-parent helper used by the *_parent_simple and
   *_parent_tr29 tests below.  Restore from a known-good revision. */
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
"Bar", "Baz", "bar", "example", "org", "bar@example.org",
"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
"trailing", "period", "blue", "com", "period@blue.com",
"multi", "trialing", "mul", "trail", "com", "mul@trail.com",
"m", "s", "m@s",
"hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
};
const char *error;
test_end();
}
/* NOTE(review): body lost in this revision -- presumably delegated to
   the address+parent helper above with the simple-generic settings;
   restore from a known-good revision. */
static void test_fts_tokenizer_address_parent_simple(void)
{
}
/* NOTE(review): body lost in this revision -- presumably delegated to
   the address+parent helper above with the TR29 settings; restore from
   a known-good revision. */
static void test_fts_tokenizer_address_parent_tr29(void)
{
}
/* Address tokenizer chained on a parent tokenizer in search mode: each
   address is expected once as a whole token (plus surrounding words),
   not additionally split into its components as in index mode.
   NOTE(review): garbled in this revision -- gen_tok, settings, tok and
   the feeding/comparison code are missing, so the surviving
   fts_tokenizer_create() call references undeclared identifiers and the
   EOF/reset checks noted in the comments are gone.  Restore from a
   known-good revision. */
static void test_fts_tokenizer_address_search(void)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
"Bar", "Baz", "bar@example.org",
"Foo", "Bar", "comment", "foo.bar@host.example.org",
"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
"trailing", "period@blue.com",
"multi", "trialing", "mul@trail.com",
"m@s",
"hypen@hypen-hypen.com",
"hypen@hypen-hypen-sick.com",
};
test_begin("fts tokenizer search email address + parent");
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
/* make sure state is forgotten at EOF */
/* test reset explicitly */
test_end();
}
/* Test fts_tokenizer_delete_trailing_partial_char(): given a buffer that
   may end in the middle of a multi-byte UTF-8 sequence, truncated_len is
   the length the buffer should be cut back to (complete sequences are
   kept whole; a dangling lead/continuation tail is dropped).
   NOTE(review): the loop body that actually invoked the function and
   asserted the lengths has been lost in this revision; restore from a
   known-good revision. */
static void test_fts_tokenizer_delete_trailing_partial_char(void)
{
static const struct {
const char *str;
unsigned int truncated_len;
} tests[] = {
/* non-truncated */
{ "\x7f", 1 },
{ "\xC2\x80", 2 },
{ "\xE0\x80\x80", 3 },
{ "\xF0\x80\x80\x80", 4 },
/* truncated */
{ "\xF0\x80\x80", 0 },
{ "x\xF0\x80\x80", 1 },
};
unsigned int i;
test_begin("fts tokenizer delete trailing partial char");
for (i = 0; i < N_ELEMENTS(tests); i++) {
}
test_end();
}
/* Regression test for the address tokenizer with a small maxlen: the
   input ends with U+FFFD (\357\277\275) before the '@', exercising
   truncation inside a multi-byte character.
   NOTE(review): the tokenizer create/run/destroy calls between
   test_begin() and test_end() have been lost in this revision; input
   and tok are unused.  Restore from a known-good revision. */
static void test_fts_tokenizer_address_maxlen(void)
{
const char *input = "...\357\277\275@a";
struct fts_tokenizer *tok;
test_begin("fts tokenizer address maxlen");
test_end();
}
/* Fuzz-style test: feed 10000 random 10-byte buffers through the
   address tokenizer (chained on a generic parent) and check nothing
   crashes or asserts.
   NOTE(review): garbled in this revision -- gen_tok, email_settings,
   tok, error and str are referenced but never declared here, the
   random-byte fill and the tokenizer feeding inside the loops are
   missing, and the created tokenizer is never destroyed.  Restore from
   a known-good revision; as written this does not compile. */
static void test_fts_tokenizer_random(void)
{
unsigned int i;
unsigned char addr[10] = { 0 };
test_begin("fts tokenizer random");
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
for (i = 0; i < 10000; i++) T_BEGIN {
for (unsigned int j = 0; j < sizeof(addr); j++)
str_truncate(str, 0);
} T_END;
test_end();
}
int main(void)
{
static void (*test_functions[])(void) = {
};
int ret;
return ret;
}