test-fts-tokenizer.c revision fdf70410de49eadfbb77997bb60ebba19aee4752
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen "Bar Baz <bar@example.org>" \
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "foo, foo@domain"
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic const char *test_inputs[] = {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* generic things and word truncation: */
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "hello world\r\n\nAnd there\twas: text galore, "
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen "abc@example.com, "
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "Bar Baz <bar@example.org>, "
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "foo@domain "
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "1234567890123456789012345678ä,"
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen "12345678901234567890123456789ä,"
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "123456789012345678901234567890ä,"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "(\"Hello world\")3.14 3,14 last",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen /* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
e2ce8d4a6ac5d82a906178148453e7613fab9ba0Timo Sirainen U+205A (e2 81 9a) and U+205F (e2 81 9f) */
e2ce8d4a6ac5d82a906178148453e7613fab9ba0Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
e2ce8d4a6ac5d82a906178148453e7613fab9ba0Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "hello world\xEF\xBC\x8E"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainenstatic unsigned int
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen const char *const *expected_output,
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen unsigned int i, outi, max, char_len, input_len = strlen(_input);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* test all input at once */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
46b823ac3bce2c0f9f0fc73911e48d3a77b04fbeTimo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
e245fb1302121d2bc2580f61e040c2c8a558ee9eTimo Sirainen /* test input one byte at a time */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* test input in random chunks */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen const char *const *expected_output)
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen unsigned int i, outi = 0;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char *const expected_output[] = {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "1234567890123456789012345678ä",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "12345678901234567890123456789",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "123456789012345678901234567890",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
/* NULL-terminated settings list naming the "tr29" algorithm. It is passed to
   fts_tokenizer_create() for the generic tokenizer and to
   test_fts_tokenizer_address_parent() by the tr29 variant of that test. */
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen/* TODO: U+206F is in "Format" and is therefore currently not treated as a
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen word break. This definitely needs to be remapped. */
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen static const char *const expected_output[] = {
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "1234567890123456789012345678ä",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "12345678901234567890123456789",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "123456789012345678901234567890",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic void test_fts_tokenizer_address_only(void)
46631c1d903c409444b1b1c4a1d41a033c09ee37Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen static const char *const expected_output[] = {
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_begin("fts tokenizer email address only");
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainenstatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen static const char *const expected_output[] = {
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* NULL-terminated settings list naming the "simple" algorithm. It is passed to
   test_fts_tokenizer_address_parent() by the simple variant of that test. */
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainenconst char *const simple_settings[] = {"algorithm", "simple", NULL};
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainenstatic void test_fts_tokenizer_address_parent_simple(void)
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_fts_tokenizer_address_parent("simple", simple_settings);
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainenstatic void test_fts_tokenizer_address_parent_tr29(void)
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_fts_tokenizer_address_parent("tr29", tr29_settings);
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char *const expected_output[] = {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_begin("fts tokenizer search email address + parent");
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* make sure state is forgotten at EOF */
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* test reset explicitly */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen static void (*test_functions[])(void) = {