bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang/* There should be a trailing space ' ' at the end of each string, except the last one. */
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar Baz <bar@example.org>" \
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
998395f6743fbecc07ee65ae08c416fa6cea9e09Teemu Huovila "foo, foo@domain " \
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "trailing, period@blue.com. " \
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "multi-trialing, mul@trail.com..... " \
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen.com " \
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen-sick.com.-"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* generic things and word truncation: */
3448096d5b1cd324ed5132045de0345cd7120a25Timo Sirainen "hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "abc@example.com, "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "Bar Baz <bar@example.org>, "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "foo@domain "
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4,"
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "12345678901234567890123456789\xC3\xA4,"
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "123456789012345678901234567890\xC3\xA4,"
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "(\"Hello world\")3.14 3,14 last",
72c4ef3b44c50c662b37bba93b463b0caeb63a4fTimo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "'1234567890123456789012345678\xC3\xA4,"
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "123456789012345678901234567x'\xC3\xA4,"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x're,"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x',"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x'',"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x',"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x'',"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890x',"
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890x'',"
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen /* \xE2\x80\x99 = U+2019 is a smart quote, sometimes used as an apostrophe */
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "you\xE2\x80\x99re\xE2\x80\x99xyz",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen U+205A (e2 81 9a) and U+205F (e2 81 9f) */
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "hello world\xEF\xBC\x8E",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila /* TR29 WB5a */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainenstatic unsigned int
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output,
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test all input at once */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test input one byte at a time */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen /* test input in random chunks */
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang test_assert_strcmp(token, expected_output[outi]);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output)
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen This definitely needs to be remapped. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilaconst char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila/* TODO: U+206F is in "Format" and therefore currently not word break.
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila This definitely needs to be remapped. */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic void test_fts_tokenizer_generic_tr29_wb5a(void)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila static const char *const expected_output[] = {
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567x'",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x'",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_begin("fts tokenizer generic TR29 with WB5a");
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "period@blue.com", /*trailing period '.' in email */
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "mul@trail.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "m@s", /*one letter local-part and domain name */
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen-sick.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address only");
908c417cc19ec4a2a01db542498c13ade3943601Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "trailing", "period", "blue", "com", "period@blue.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "multi", "trialing", "mul", "trail", "com", "mul@trail.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilaconst char *const simple_settings[] = {"algorithm", "simple", NULL};
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent_simple(void)
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent("simple", simple_settings);
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent_tr29(void)
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent("tr29", tr29_settings);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen-sick.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer search email address + parent");
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* make sure state is forgotten at EOF */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* test reset explicitly */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainenstatic void test_fts_tokenizer_delete_trailing_partial_char(void)
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen /* non-truncated */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen /* truncated */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\xF0\x80\x80", 0 },
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang unsigned int i;
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen test_begin("fts tokenizer delete trailing partial char");
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainenstatic void test_fts_tokenizer_address_maxlen(void)
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen const char *const settings[] = {"maxlen", "5", NULL};
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen while (fts_tokenizer_next(tok, (const unsigned char *)input,
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char *const settings[] = {"algorithm", "simple", NULL};
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char *const email_settings[] = {"maxlen", "9", NULL};
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen unsigned int i;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen for (unsigned int j = 0; j < sizeof(addr); j++)
62461eb609e1d852e027cf4e07d30d51288678a2Aki Tuomi addr[j] = test_chars[i_rand() % N_ELEMENTS(test_chars)];
5c97732871842800816aea0215c56bf701f623a6Aki Tuomi (void)uni_utf8_get_valid_data(addr, sizeof(addr), str);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen while (fts_tokenizer_next(tok, str_data(str), str_len(str),
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
baf3e87e186453fda13bd21f7cbcb2efc8492e8bTimo Sirainen static void (*const test_functions[])(void) = {