test-fts-tokenizer.c revision fdf70410de49eadfbb77997bb60ebba19aee4752
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "Bar Baz <bar@example.org>" \
cf63dc8723b971cc80638fccbf494d961cbafc7fTimo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "foo, foo@domain"
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainenstatic const char *test_inputs[] = {
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen /* generic things and word truncation: */
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "hello world\r\n\nAnd there\twas: text galore, "
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "abc@example.com, "
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "Bar Baz <bar@example.org>, "
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "foo@domain "
885a3c2287ae3e5827aa580ea06b231de38abb47Timo Sirainen "1234567890123456789012345678ä,"
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "12345678901234567890123456789ä,"
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "123456789012345678901234567890ä,"
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "(\"Hello world\")3.14 3,14 last",
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen /* whitespace: with Unicode (UTF-8) U+FF01(ef bc 81), U+2000(e2 80 80),
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen U+205A(e2 81 9a) and U+205F(e2 81 9f) */
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen "hello world\xEF\xBC\x8E"
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
98e8f95ffee4eacca72b1bcf082f2c735592301bTimo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainenstatic unsigned int
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen const char *const *expected_output,
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen unsigned int i, outi, max, char_len, input_len = strlen(_input);
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen /* test all input at once */
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
71df09024cea5f2faa93da3bb9513ee96ba6bf22Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
10b8040903b1d1591f1d44552ff466c8789b8814Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
0a9cb42cbb135e3200cbfbb657820304cca8ecb8Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
885a3c2287ae3e5827aa580ea06b231de38abb47Timo Sirainen /* test input one byte at a time */
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
f7141101e27d766b695ef27726f755117332a58eTimo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
885a3c2287ae3e5827aa580ea06b231de38abb47Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
7358272563d8ef77366447708ab0e58c0cff4151Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen /* test input in random chunks */
885a3c2287ae3e5827aa580ea06b231de38abb47Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
a8281b7c770f4a9a842b19303083fc7f6859e756Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
a8281b7c770f4a9a842b19303083fc7f6859e756Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
7358272563d8ef77366447708ab0e58c0cff4151Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
23878bd03d1de531e3261a25598beec621351910Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen const char *const *expected_output)
ad58b50aef8125981ebdbc89513236558bcccf60Timo Sirainen unsigned int i, outi = 0;
344bb4abc3acb63d04131cb63f1503a6ca01fb40Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
eff34528733a7893b2914a26023aac227ef4ae7fTimo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
bd417d416988d11a6b555b9aa57779e7ed976951Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
a9efdb661eb7a8a33aacfdcc3486dcc675a21543Timo Sirainen static const char *const expected_output[] = {
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "1234567890123456789012345678ä",
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "12345678901234567890123456789",
5685e60e62a8e0d368bd28a1526056f97bbba022Timo Sirainen "123456789012345678901234567890",
5685e60e62a8e0d368bd28a1526056f97bbba022Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
c14c5561e85853d91280235a7611b6050feaebb2Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
97afa073e3e1e0301dc41173ec34beb08edcce50Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
636f017be100bce67d66fd3ae1544a47681efd33Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
23878bd03d1de531e3261a25598beec621351910Timo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
23878bd03d1de531e3261a25598beec621351910Timo Sirainen/* TODO: U+206F is in "Format" and is therefore currently not treated as a
23878bd03d1de531e3261a25598beec621351910Timo Sirainen word break. This definitely needs to be remapped. */
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen static const char *const expected_output[] = {
bb25bed75eefd011138ebf1b8e033fc8ef55ca74Timo Sirainen "1234567890123456789012345678ä",
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "12345678901234567890123456789",
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen "123456789012345678901234567890",
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "quoted", "text", "word", "hlo", "words", "you're", "bad",
5ba6009f4e5493c4e6be9ffb3134525004a7975cAki Tuomi const char *error;
e1d08b1c39c63de92f0e914064a508bbf6c6fcc5Aki Tuomi test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen static const char *const expected_output[] = {
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen test_begin("fts tokenizer email address only");
test_end();
static const char *const expected_output[] = {
const char *error;
test_end();
static void test_fts_tokenizer_address_parent_simple(void)
static void test_fts_tokenizer_address_parent_tr29(void)
static void test_fts_tokenizer_address_search(void)
static const char *const expected_output[] = {
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
test_end();
int main(void)
static void (*test_functions[])(void) = {
int ret;
return ret;