test-fts-tokenizer.c revision 62461eb609e1d852e027cf4e07d30d51288678a2
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2014-2017 Dovecot authors, see the included COPYING file */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen/* There should be a trailing space ' ' at the end of each string except the last one. */
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
c4ec7cb598805b1387dc3aab59ec8f32d8cc24e1Timo Sirainen "Bar Baz <bar@example.org>" \
b55f914c0ade77252cfd798ea8eb9a84bda56315Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "foo, foo@domain " \
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "trailing, period@blue.com. " \
92c49f3005f4dff1a6f576fffa8112ef6d1cae7fTimo Sirainen "multi-trialing, mul@trail.com..... " \
1d2c463d23f09f15727edae9c78b07ec6a7a27daTimo Sirainen "hypen@hypen-hypen.com " \
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hypen@hypen-hypen-sick.com.-"
de754cb78f75e8b3b994cddafe41c9ed1467c33dTimo Sirainenstatic const char *test_inputs[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* generic things and word truncation: */
4ead43ecc06d10047998966c4dc0b142ecce4b66Timo Sirainen "hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
1d4f710106fb498750456724628da6063e012e6dTimo Sirainen "abc@example.com, "
1d4f710106fb498750456724628da6063e012e6dTimo Sirainen "Bar Baz <bar@example.org>, "
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen "foo@domain "
0b3e92b6043435c5aa9f1cf1d04b632f3e19abd9Phil Carmody "1234567890123456789012345678\xC3\xA4,"
0b3e92b6043435c5aa9f1cf1d04b632f3e19abd9Phil Carmody "12345678901234567890123456789\xC3\xA4,"
ab0d9eecd85f74acae18fe88529302e0776cc500Timo Sirainen "123456789012345678901234567890\xC3\xA4,"
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen "(\"Hello world\")3.14 3,14 last",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "'1234567890123456789012345678\xC3\xA4,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567x'\xC3\xA4,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x're,"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "1234567890123456789012345678x'',"
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen "12345678901234567890123456789x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "12345678901234567890123456789x'',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567890x',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "123456789012345678901234567890x'',"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* \xE2\x80\x99 (UTF-8) = U+2019 is a smart quote, sometimes used as an apostrophe */
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen "\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "you\xE2\x80\x99re\xE2\x80\x99xyz",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen U+205A (e2 81 9a) and U+205F (e2 81 9f) */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "hello world\xEF\xBC\x8E",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen /* TR29 WB5a */
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainenstatic unsigned int
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainentest_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen const char *const *expected_output,
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen const unsigned char *input = (const unsigned char *)_input;
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen /* test all input at once */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen /* test input one byte at a time */
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) {
2b96880f2d789d125aff6a95eaa7b51f558a6a1cTimo Sirainen test_assert_strcmp(token, expected_output[outi]);
2b96880f2d789d125aff6a95eaa7b51f558a6a1cTimo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen /* test input in random chunks */
7a94f950fd1dcc81537acfc8adb030b5e703d722Timo Sirainen char_len += uni_utf8_char_bytes(input[i+char_len]);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
e93184a9055c2530366dfe617e07199603c399ddMartti Rannanjärvi while (fts_tokenizer_final(tok, &token, &error) > 0) {
e2a88d59c0d47d63ce1ad5b1fd95e487124a3fd4Timo Sirainen test_assert_strcmp(token, expected_output[outi]);
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen const char *const *expected_output)
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen unsigned int i, outi = 0;
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainenstatic void test_fts_tokenizer_generic_only(void)
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen static const char *const expected_output[] = {
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "1234567890123456789012345678\xC3\xA4",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "12345678901234567890123456789",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "123456789012345678901234567890",
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678\xC3\xA4",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x'",
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen "1234567890123456789012345678x",
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen "1234567890123456789012345678x",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "12345678901234567890123456789x",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "12345678901234567890123456789x",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "123456789012345678901234567890",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
b6fbc235f981b10333403e2fd6d333fd351c7a3cAki Tuomi test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
/* Key/value settings list (NULL-terminated) selecting the "tr29" algorithm.
   Passed as the settings argument when creating the generic tokenizer in the
   TR29 test and to test_fts_tokenizer_address_parent() for the "tr29" run. */
b6fbc235f981b10333403e2fd6d333fd351c7a3cAki Tuomiconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
ddbdc644a15f56f4b43596f1b8c0fc196c101445Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not treated as a word break.
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen This definitely needs to be remapped. */
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen static const char *const expected_output[] = {
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "1234567890123456789012345678\xC3\xA4",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678\xC3\xA4",
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen "123456789012345678901234567x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "1234567890123456789012345678x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "12345678901234567890123456789x",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "12345678901234567890123456789x",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
56af9dd10e7e6caeaca64395bad3f882b28ecdffTimo Sirainen "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
/* Key/value settings list (NULL-terminated): "tr29" algorithm with the "wb5a"
   option set to "yes". Passed as the settings argument when creating the
   generic tokenizer in the "fts tokenizer generic TR29 with WB5a" test. */
1312cf655d3ea22c0ab6487ce710ad4060c25905Timo Sirainenconst char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
1312cf655d3ea22c0ab6487ce710ad4060c25905Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not treated as a word break.
1312cf655d3ea22c0ab6487ce710ad4060c25905Timo Sirainen This definitely needs to be remapped. */
87dbf3e85526ccde5908a611eb9a798f1d0ccac3Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_wb5a(void)
1312cf655d3ea22c0ab6487ce710ad4060c25905Timo Sirainen static const char *const expected_output[] = {
1312cf655d3ea22c0ab6487ce710ad4060c25905Timo Sirainen "1234567890123456789012345678\xC3\xA4",
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen "12345678901234567890123456789",
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen "123456789012345678901234567890",
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen "1234567890123456789012345678\xC3\xA4",
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen "123456789012345678901234567x'",
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen "1234567890123456789012345678x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
0d1b8b6bec79746c5d89d57dd8c1688946bd9237Josef 'Jeff' Sipek "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
1093de32efb2a231949566d4bd8aa55a8f43fb70Timo Sirainen "l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainen test_begin("fts tokenizer generic TR29 with WB5a");
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
5d2e7ec2ea725c8a6a63f56b771e746f93e782ecTimo Sirainenstatic void test_fts_tokenizer_address_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "period@blue.com", /*trailing period '.' in email */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "mul@trail.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "m@s", /*one letter local-part and domain name */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hypen@hypen-hypen.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hypen@hypen-hypen-sick.com",
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen test_begin("fts tokenizer email address only");
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
b833824981bc75af72adb844f8a4a992bd2f3ad3Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen static const char *const expected_output[] = {
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
66ea9eaaa2d7531b3be8f633937628c94d907031Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen "trailing", "period", "blue", "com", "period@blue.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "multi", "trialing", "mul", "trail", "com", "mul@trail.com",
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen "hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen "hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
a7efba62b6235e5efc124cbf702ddeb547ca3665Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
204ee6ed414f5e4eeb6f6c10763b55daf56f11acJosef 'Jeff' Sipek test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
204ee6ed414f5e4eeb6f6c10763b55daf56f11acJosef 'Jeff' Sipek test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* Key/value settings list (NULL-terminated) selecting the "simple" algorithm.
   Handed to test_fts_tokenizer_address_parent(), which forwards it to
   fts_tokenizer_create() for the generic (parent) tokenizer. */
c7eb1ffb7c73cb5d9c1316bbecd02947441a40d4Timo Sirainenconst char *const simple_settings[] = {"algorithm", "simple", NULL};
2f90189c6ee66a17f7bf838a8eb8a69868630fb8Timo Sirainenstatic void test_fts_tokenizer_address_parent_simple(void)
b6b7a17731a917958b6479920b3fac5ca991db6aTimo Sirainen test_fts_tokenizer_address_parent("simple", simple_settings);
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainenstatic void test_fts_tokenizer_address_parent_tr29(void)
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen test_fts_tokenizer_address_parent("tr29", tr29_settings);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen static const char *const expected_output[] = {
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "hypen@hypen-hypen.com",
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "hypen@hypen-hypen-sick.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const settings[] = { "search", "", NULL };
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen test_begin("fts tokenizer search email address + parent");
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen /* make sure state is forgotten at EOF */
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
717bb0dbaf4bd3f745669570647845e6d493bfe0Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
73f021723bffa0841bbdf371882b463a449f1ea9Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* test reset explicitly */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
717bb0dbaf4bd3f745669570647845e6d493bfe0Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
717bb0dbaf4bd3f745669570647845e6d493bfe0Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
717bb0dbaf4bd3f745669570647845e6d493bfe0Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
b1485f2691de41ed7b5f96cebda2ebcb69a5e22fTimo Sirainenstatic void test_fts_tokenizer_delete_trailing_partial_char(void)
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen static const struct {
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen const char *str;
b1485f2691de41ed7b5f96cebda2ebcb69a5e22fTimo Sirainen /* non-truncated */
b1485f2691de41ed7b5f96cebda2ebcb69a5e22fTimo Sirainen /* truncated */
b1485f2691de41ed7b5f96cebda2ebcb69a5e22fTimo Sirainen { "\xF0\x80\x80", 0 },
b1485f2691de41ed7b5f96cebda2ebcb69a5e22fTimo Sirainen unsigned int i;
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_begin("fts tokenizer delete trailing partial char");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainenstatic void test_fts_tokenizer_address_maxlen(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *const settings[] = {"maxlen", "5", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_next(tok, (const unsigned char *)input,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *const settings[] = {"algorithm", "simple", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *const email_settings[] = {"maxlen", "9", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
6a9e034441607c0c5a61858ff559af4615ac31caTimo Sirainen for (unsigned int j = 0; j < sizeof(addr); j++)
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen addr[j] = test_chars[i_rand() % N_ELEMENTS(test_chars)];
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen (void)uni_utf8_get_valid_data(addr, sizeof(addr), str);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_next(tok, str_data(str), str_len(str),
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while (fts_tokenizer_final(tok, &token, &error) > 0) ;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static void (*const test_functions[])(void) = {