test-fts-tokenizer.c revision 3448096d5b1cd324ed5132045de0345cd7120a25
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
c4ec7cb598805b1387dc3aab59ec8f32d8cc24e1Timo Sirainen "Bar Baz <bar@example.org>" \
b55f914c0ade77252cfd798ea8eb9a84bda56315Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "foo, foo@domain"
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainenstatic const char *test_inputs[] = {
92c49f3005f4dff1a6f576fffa8112ef6d1cae7fTimo Sirainen /* generic things and word truncation: */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
1d2c463d23f09f15727edae9c78b07ec6a7a27daTimo Sirainen "abc@example.com, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain "
de754cb78f75e8b3b994cddafe41c9ed1467c33dTimo Sirainen "1234567890123456789012345678ä,"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789ä,"
4ead43ecc06d10047998966c4dc0b142ecce4b66Timo Sirainen "123456789012345678901234567890ä,"
1d4f710106fb498750456724628da6063e012e6dTimo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
1d4f710106fb498750456724628da6063e012e6dTimo Sirainen "(\"Hello world\")3.14 3,14 last",
ab0d9eecd85f74acae18fe88529302e0776cc500Timo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "'1234567890123456789012345678ä,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567x'ä,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x're,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x'',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "12345678901234567890123456789x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "12345678901234567890123456789x'',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567890x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567890x'',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* \xe28099 = U+2019 is a smart quote, sometimes used as an apostrophe */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "you\xE2\x80\x99re\xE2\x80\x99xyz",
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen U+205A(e2 81 9a) and U+205F(e2 81 9f) */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "hello world\xEF\xBC\x8E"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainenstatic unsigned int
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen const char *const *expected_output,
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen unsigned int i, outi, max, char_len, input_len = strlen(_input);
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* test all input at once */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen /* test input one byte at a time */
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen /* test input in random chunks */
2b96880f2d789d125aff6a95eaa7b51f558a6a1cTimo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
ed16ab579bd058ec5e2b5d02bb41fdadd9e05b31Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen const char *const *expected_output)
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen unsigned int i, outi = 0;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainenstatic void test_fts_tokenizer_generic_only(void)
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen static const char *const expected_output[] = {
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "1234567890123456789012345678ä",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "12345678901234567890123456789",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "123456789012345678901234567890",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "1234567890123456789012345678ä",
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "123456789012345678901234567x'",
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "1234567890123456789012345678x'",
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen "1234567890123456789012345678x",
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen "1234567890123456789012345678x",
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Boschconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
c12d96f12cac9af464ab2e59046bd59b0c06b4eaTimo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
c12d96f12cac9af464ab2e59046bd59b0c06b4eaTimo Sirainen This definitely needs to be remapped. */
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Boschstatic void test_fts_tokenizer_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "1234567890123456789012345678ä",
d3d769026fae5d21c2d29614d3bc4579e8d79e81Timo Sirainen "12345678901234567890123456789",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "123456789012345678901234567890",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678ä",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567x'",
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen "1234567890123456789012345678x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "12345678901234567890123456789x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
a117008f03ad9e2d54258b30d3fb03ffa502a448Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
6da2d4faed507f513c68b94bb56a13caeeb3ff4aTimo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address only");
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
5f1d689131a75c39f064cbd4202373e7edf78f18Josef 'Jeff' Sipek "Bar", "Baz", "bar", "example", "org", "bar@example.org",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenconst char *const simple_settings[] = {"algorithm", "simple", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent_simple(void)
1093de32efb2a231949566d4bd8aa55a8f43fb70Timo Sirainen test_fts_tokenizer_address_parent("simple", simple_settings);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent_tr29(void)
ab90f702ceedb7ba445a9a592be0b213b27cbafaStephan Bosch test_fts_tokenizer_address_parent("tr29", tr29_settings);
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen static const char *const expected_output[] = {
9ddd3d7d8651985e373a6c48e0ddc76b8a4ef1c7Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
5d2e7ec2ea725c8a6a63f56b771e746f93e782ecTimo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const settings[] = { "search", "", NULL };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer search email address + parent");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen /* make sure state is forgotten at EOF */
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
bdb9f7f7fbf828fb85a393bd2803167b1bb8ff0dTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen /* test reset explicitly */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static void (*test_functions[])(void) = {