bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen#include "unichar.h"
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen#include "str.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "test-common.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer.h"
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang#include "fts-tokenizer-common.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen#include "fts-tokenizer-generic-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Shared input of the email-address tokenizer tests. The string fragments
   below are concatenated into one input string. Fragments are meant to end
   with a trailing space ' ' (except the last one) so that neighbouring
   fragments stay separated. Note that the "Bar Baz <bar@example.org>"
   fragment has no trailing space, so "Foo Bar" follows the '>' directly. */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)foo.bar@host.example.org " \
	"foo, foo@domain " \
	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
	"trailing, period@blue.com. " \
	"multi-trialing, mul@trail.com..... " \
	"m@s " \
	"hypen@hypen-hypen.com " \
	"hypen@hypen-hypen-sick.com.-"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen
/* Inputs shared by the generic tokenizer tests. Each array element is passed
   to the tokenizer as one separate input. Adjacent string literals without a
   comma between them are concatenated and form a single element. */
static const char *test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678\xC3\xA4,"
	"12345678901234567890123456789\xC3\xA4,"
	"123456789012345678901234567890\xC3\xA4,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	"1.",

	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* apostrophes around the truncation limit. There is no comma after
	   the last literal of this run, so the smart quote string below is
	   part of the same input. */
	"'1234567890123456789012345678\xC3\xA4,"
	"123456789012345678901234567x'\xC3\xA4,"
	"1234567890123456789012345678x're,"
	"1234567890123456789012345678x',"
	"1234567890123456789012345678x'',"
	"12345678901234567890123456789x',"
	"12345678901234567890123456789x'',"
	"123456789012345678901234567890x',"
	"123456789012345678901234567890x'',"

	/* \xe28099 = U+2019 is a smart quote, sometimes used as an apostrophe */
	"\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",

	"you\xE2\x80\x99re\xE2\x80\x99xyz",

	/* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
	   U+205A (e2 81 9a) and U+205F (e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MinNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E",

	/* TR29 WB5a */
	"l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
};
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainenstatic void test_fts_tokenizer_find(void)
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen{
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_begin("fts tokenizer find");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_end();
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen}
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen
/* Compare one token produced by the tokenizer with the expected token at
   index *outi and advance *outi to the next expected token.

   The index is never moved past the NULL that terminates a group of expected
   tokens: when the tokenizer produces more tokens than expected, every extra
   token is compared against the terminator (and so reported as a mismatch)
   instead of reading the following group or memory behind the array. */
static void
test_tokenizer_check_token(const char *token,
			   const char *const *expected_output,
			   unsigned int *outi)
{
	const char *expected = expected_output[*outi];

	test_assert_strcmp(token, expected);
	if (expected != NULL)
		(*outi)++;
}

/* Feed _input to tok three times - all at once, one character at a time (as
   sized by uni_utf8_char_bytes()) and in randomly sized chunks - and check
   after each pass that the produced tokens match the NULL-terminated group
   of expected tokens starting at expected_output[first_outi].

   Returns the index following the group's NULL terminator, which is where
   the expected tokens of the next input start. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len;
	size_t input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	/* no more input: collect the tokens that are still pending (the other
	   passes call fts_tokenizer_final() at this point) */
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one character at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		max = i_rand_minmax(1, input_len - i);
		/* extend the chunk character by character so that it never
		   ends in the middle of a multi-byte character */
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* If too few tokens were produced (already reported above), skip the
	   rest of this group so that the following inputs are still compared
	   against their own groups. */
	while (expected_output[outi] != NULL)
		outi++;
	return outi+1;
}
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainenstatic void
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output)
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen{
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen unsigned int i, outi = 0;
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen expected_output, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen }
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen}
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
3448096d5b1cd324ed5132045de0345cd7120a25Timo Sirainen "there", "was", "text", "galor\xC3\xA9",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "abc", "example", "com", "Bar", "Baz",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "bar", "example", "org", "foo", "domain",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila "and", "longlonglongabcdefghijklmnopqr",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen "1", NULL,
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "word", "pre", "post", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "word", "pre", "post", NULL,
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "you're'xyz", NULL,
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and", "more", NULL,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello", "world", NULL,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_begin("fts tokenizer generic simple");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputs(tok, expected_output);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen This definitely needs to be remapped. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
3448096d5b1cd324ed5132045de0345cd7120a25Timo Sirainen "there", "was", "text", "galor\xC3\xA9",
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila "abc", "example", "com", "Bar", "Baz",
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila "bar", "example", "org", "foo", "domain",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen "1", NULL,
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "word", "pre", "post", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x'",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "1234567890123456789012345678x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "12345678901234567890123456789x",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
b6b06530d654f0436bfbaefc1e988d53fff0cbeeTimo Sirainen "123456789012345678901234567890",
202468f94e6c6c8b5d3d98ee74e01bb0d0bb04aaTimo Sirainen
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "word", "pre", "post", NULL,
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen "you're'xyz", NULL,
83172e28d4ac684dfed83f7c9db933493d7c5922Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and", "more", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello", "world", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_begin("fts tokenizer generic TR29");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputs(tok, expected_output);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilaconst char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila/* TODO: U+206F is in "Format" and therefore currently not word break.
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila This definitely needs to be remapped. */
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovilastatic void test_fts_tokenizer_generic_tr29_wb5a(void)
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila{
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila static const char *const expected_output[] = {
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "hello", "world", "And",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "there", "was", "text", "galor\xC3\xA9",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "abc", "example", "com", "Bar", "Baz",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "bar", "example", "org", "foo", "domain",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "and", "longlonglongabcdefghijklmnopqr",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "word", "pre", "post", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "1234567890123456789012345678\xC3\xA4",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567x'",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x'",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "1234567890123456789012345678x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "12345678901234567890123456789x",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "123456789012345678901234567890",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "word", "pre", "post", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "you're'xyz", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "hello", "world", "And",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "there", "was", "text", "galore",
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "and", "more", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "hello", "world", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila "l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila NULL
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila };
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila struct fts_tokenizer *tok;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila const char *error;
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_begin("fts tokenizer generic TR29 with WB5a");
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_tokenizer_inputs(tok, expected_output);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila fts_tokenizer_unref(&tok);
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_end();
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila}
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "abc.dfg@example.com", "bar@example.org",
af177be2664018e8074d69449b9c6a2d9741ec25Teemu Huovila "foo.bar@host.example.org", "foo@domain",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "period@blue.com", /*trailing period '.' in email */
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "mul@trail.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "m@s", /*one letter local-part and domain name */
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen-sick.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address only");
908c417cc19ec4a2a01db542498c13ade3943601Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "trailing", "period", "blue", "com", "period@blue.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "multi", "trialing", "mul", "trail", "com", "mul@trail.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "m", "s", "m@s",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&gen_tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* NULL-terminated settings list {"algorithm", "simple"} handed to
   test_fts_tokenizer_address_parent() by the wrapper below. */
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilaconst char *const simple_settings[] = {"algorithm", "simple", NULL};
/* Runs the shared email address + parent tokenizer test with the label
   "simple" and simple_settings. */
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent_simple(void)
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila{
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent("simple", simple_settings);
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila}
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila
/* Runs the shared email address + parent tokenizer test with the label
   "tr29" and tr29_settings (defined outside this part of the file). */
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovilastatic void test_fts_tokenizer_address_parent_tr29(void)
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila{
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent("tr29", tr29_settings);
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila}
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila
/* Tests the email address tokenizer created with the {"search", ""} settings
   on top of a generic tokenizer that is created with NULL settings.

   First TEST_INPUT_ADDRESS is run through test_tokenizer_inputoutput()
   against expected_output. Afterwards a few short inputs are fed with
   fts_tokenizer_next() / fts_tokenizer_final() to check that nothing from a
   previous input shows up in the tokens of the next one, both after final()
   and after an explicit fts_tokenizer_reset(). */
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
	/* Every address is expected as one whole token only; unlike the
	   display-name words around them, the addresses are not additionally
	   listed as separate local-part/domain pieces. The over-long address
	   is expected in a shortened form that no longer ends in ".tld". */
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "trailing", "period@blue.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "multi", "trialing", "mul@trail.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang "m@s",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen.com",
eb568e46e82bc814ca3384236a483691a12f9c54Baofeng Wang "hypen@hypen-hypen-sick.com",
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang NULL
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen };
	/* settings for the email address tokenizer only; the generic parent
	   below gets NULL settings */
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen const char *token, *error;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer search email address + parent");
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* make sure state is forgotten at EOF */
	/* plain word: next() is expected to return 0, the token only comes
	   out of the first final(), and the second final() returns 0 */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "foo") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
	/* complete address: expected back as the single token "bar@baz",
	   without anything left over from the previous "foo" */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "bar@baz") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
	/* input ending in '@' with nothing after it: the expected token is
	   just "foo", i.e. without the '@' */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "foo") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* test reset explicitly */
	/* "a" is fed but never finalized; after the reset only "b@c" is
	   expected, so the pending "a" must not appear in any token */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_reset(tok);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "b@c") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
	/* both tokenizers are unreferenced separately: the address tokenizer
	   first, then the generic one that was passed to it as parent */
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen fts_tokenizer_unref(&tok);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen fts_tokenizer_unref(&gen_tok);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_end();
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen}
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen
/* Table-driven test for fts_tokenizer_delete_trailing_partial_char().
   For every entry the size starts out as strlen(str), is passed to the
   function by pointer together with the string bytes, and must equal
   truncated_len afterwards. */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainenstatic void test_fts_tokenizer_delete_trailing_partial_char(void)
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang{
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
	/* input bytes; none of the entries contains an embedded NUL, so
	   strlen() yields the full input length */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen const char *str;
	/* size expected after the call */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen unsigned int truncated_len;
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen } tests[] = {
	/* sequences of 1 to 4 bytes whose expected size equals their full
	   length, i.e. the function must leave the size alone */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen /* non-truncated */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\x7f", 1 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\xC2\x80", 2 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\xE0\x80\x80", 3 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\xF0\x80\x80\x80", 4 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen
	/* the 4-byte sequence from above with its last byte missing; all
	   three remaining bytes are expected to be dropped, while a
	   preceding "x" is expected to be kept */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen /* truncated */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "\xF0\x80\x80", 0 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen { "x\xF0\x80\x80", 1 },
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen };
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang unsigned int i;
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen size_t size;
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen test_begin("fts tokenizer delete trailing partial char");
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen for (i = 0; i < N_ELEMENTS(tests); i++) {
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen size = strlen(tests[i].str);
	/* only size is an output here; the string itself is passed as
	   const data */
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen test_assert(size == tests[i].truncated_len);
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang }
edc654a35c3368dfb529ba784aee41dff6f45149Timo Sirainen test_end();
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang}
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang
/* Smoke test for the email address tokenizer with {"maxlen", "5"} and no
   parent tokenizer. Only the tokenizer creation is asserted; the returned
   tokens are read and discarded, so the test presumably exists to catch
   crashes or assertion failures inside the tokenizer - TODO confirm. */
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainenstatic void test_fts_tokenizer_address_maxlen(void)
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen{
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen const char *const settings[] = {"maxlen", "5", NULL};
	/* 8 bytes, i.e. longer than the maxlen of 5: three dots, the octal
	   bytes 357 277 275 (0xEF 0xBF 0xBD, the UTF-8 encoding of U+FFFD)
	   and "@a" */
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen const char *input = "...\357\277\275@a";
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen struct fts_tokenizer *tok;
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen const char *token, *error;
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen test_begin("fts tokenizer address maxlen");
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen
	/* keep calling next() with the same input for as long as it returns
	   a positive value, then drain final() the same way; both loop
	   bodies are intentionally empty */
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen while (fts_tokenizer_next(tok, (const unsigned char *)input,
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen strlen(input), &token, &error) > 0) ;
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen fts_tokenizer_unref(&tok);
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen test_end();
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen}
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainenstatic void test_fts_tokenizer_random(void)
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen{
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char *const settings[] = {"algorithm", "simple", NULL};
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char *const email_settings[] = {"maxlen", "9", NULL};
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen unsigned int i;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen unsigned char addr[10] = { 0 };
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen string_t *str = t_str_new(20);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen const char *token, *error;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_begin("fts tokenizer random");
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen for (i = 0; i < 10000; i++) T_BEGIN {
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen for (unsigned int j = 0; j < sizeof(addr); j++)
62461eb609e1d852e027cf4e07d30d51288678a2Aki Tuomi addr[j] = test_chars[i_rand() % N_ELEMENTS(test_chars)];
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen str_truncate(str, 0);
5c97732871842800816aea0215c56bf701f623a6Aki Tuomi (void)uni_utf8_get_valid_data(addr, sizeof(addr), str);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen while (fts_tokenizer_next(tok, str_data(str), str_len(str),
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen &token, &error) > 0) ;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen } T_END;
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen fts_tokenizer_unref(&tok);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen fts_tokenizer_unref(&gen_tok);
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_end();
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen}
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen
/* Test program entry point: initializes the tokenizers, runs every test
   listed in test_functions via test_run(), deinitializes the tokenizers
   and returns test_run()'s result as the exit status. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenint main(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
	/* NULL-terminated list of the test functions handed to test_run();
	   a test that is not listed here is never run */
baf3e87e186453fda13bd21f7cbcb2efc8492e8bTimo Sirainen static void (*const test_functions[])(void) = {
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_fts_tokenizer_find,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_tokenizer_generic_only,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_tokenizer_generic_tr29_only,
3a54211bd6c4dc3f8687c16020770551cf83a548Teemu Huovila test_fts_tokenizer_generic_tr29_wb5a,
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_fts_tokenizer_address_only,
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent_simple,
fdf70410de49eadfbb77997bb60ebba19aee4752Teemu Huovila test_fts_tokenizer_address_parent_tr29,
b1b0b2b543dc1a10015272fc970ad7534f84e0c5Timo Sirainen test_fts_tokenizer_address_maxlen,
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_fts_tokenizer_address_search,
0d6f8e7e231ac3fc8647d8fc3072d7d1e477a7cfBaofeng Wang test_fts_tokenizer_delete_trailing_partial_char,
9d92ea347e1c098fa33ea517514dfdc0bb8995e2Timo Sirainen test_fts_tokenizer_random,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen int ret;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
	/* the tokenizer init/deinit calls bracket the whole test run, which
	   is why the result is kept in ret instead of being returned
	   directly */
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen fts_tokenizers_init();
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen ret = test_run(test_functions);
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen fts_tokenizers_deinit();
507ea0bc5b25efb4c96033a19dec66689a50ebd0Baofeng Wang
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen return ret;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}