test-fts-tokenizer.c revision 72c4ef3b44c50c662b37bba93b463b0caeb63a4f
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar Baz <bar@example.org>" \
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "foo, foo@domain"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainenstatic const char *test_inputs[] = {
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* generic things and word truncation: */
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello world\r\n\nAnd there\twas: text galore, "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "abc@example.com, "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "Bar Baz <bar@example.org>, "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "foo@domain "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "1234567890123456789012345678ä,"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "12345678901234567890123456789ä,"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "123456789012345678901234567890ä,"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "(\"Hello world\")3.14 3,14 last",
72c4ef3b44c50c662b37bba93b463b0caeb63a4fTimo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* whitespace: with Unicode(utf8) U+FF01(ef bc 81), U+2000(e2 80 80) and
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen U+205A(e2 81 9a) and U+205F(e2 81 9f) */
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello world\xEF\xBC\x8E"
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainenstatic unsigned int
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output,
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen unsigned int i, outi, max, char_len, input_len = strlen(_input);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test all input at once */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test input one byte at a time */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test input in random chunks */
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output)
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen unsigned int i, outi = 0;
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "1234567890123456789012345678ä",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
/* NULL-terminated settings list naming the "tr29" algorithm; it is handed to
   fts_tokenizer_create() for the generic tokenizer in the TR29 test. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen/* TODO: U+206F is in "Format" and is therefore currently not treated as a
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen word break. This definitely needs to be remapped. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "1234567890123456789012345678ä",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3.14", "3,14", "last", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address only");
908c417cc19ec4a2a01db542498c13ade3943601Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_parent(void)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address + parent");
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer search email address + parent");
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* make sure state is forgotten at EOF */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* test reset explicitly */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static void (*test_functions[])(void) = {