test-fts-tokenizer.c revision 3448096d5b1cd324ed5132045de0345cd7120a25
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
1d4f710106fb498750456724628da6063e012e6dTimo Sirainen
46552a931924c2d743f045e95b08c3ce6beda91aTimo Sirainen#include "lib.h"
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen#include "unichar.h"
9439bed2f07d6475febd8a247cd2f0990fb32a13Timo Sirainen#include "test-common.h"
d3d769026fae5d21c2d29614d3bc4579e8d79e81Timo Sirainen#include "fts-tokenizer.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "fts-tokenizer-private.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "fts-tokenizer-generic-private.h"
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include <stdlib.h>
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen
/* Input shared by the email-address tokenizer tests below. Note that there
   is no separator between "<bar@example.org>" and the following "Foo Bar". */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ " \
	"Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)foo.bar@host.example.org " \
	"foo, " \
	"foo@domain"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
/* Inputs for the generic tokenizer tests. Each array element is one
   independent input; the expected_output tables in the tests below hold one
   NULL-terminated token group per element, in the same order. */
static const char *test_inputs[] = {
	/* [0] generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678ä,"
	"12345678901234567890123456789ä,"
	"123456789012345678901234567890ä,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	/* [1] number followed by a dot at the end of input */
	"1.",

	/* [2] ASCII apostrophes around and inside words */
	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* [3] apostrophes in long words. There is no comma after the last
	   literal of this group, so it is concatenated with the U+2019
	   literal below and both form a single input. */
	"'1234567890123456789012345678ä,"
	"123456789012345678901234567x'ä,"
	"1234567890123456789012345678x're,"
	"1234567890123456789012345678x',"
	"1234567890123456789012345678x'',"
	"12345678901234567890123456789x',"
	"12345678901234567890123456789x'',"
	"123456789012345678901234567890x',"
	"123456789012345678901234567890x'',"

	/* \xE2\x80\x99 = U+2019 is a smart quote, sometimes used as an
	   apostrophe */
	"\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",

	/* [4] two U+2019 quotes inside one word */
	"you\xE2\x80\x99re\xE2\x80\x99xyz",

	/* [5] non-ASCII separators: U+FF01 (EF BC 81), U+2000 (E2 80 80),
	   U+205A (E2 81 9A) and U+205F (E2 81 9F). The "" after \x9F keeps
	   the following 'a' from being parsed as part of the hex escape. */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* [6] TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E"

};
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainenstatic void test_fts_tokenizer_find(void)
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen{
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_begin("fts tokenizer find");
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen test_end();
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen}
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen
/* Check one token returned by the tokenizer against expected_output[*outi].

   If the expected entry is NULL the tokenizer has produced more tokens than
   the current group contains. That is recorded as a test failure, and the
   index is left on the NULL terminator: strcmp() is never called with a NULL
   argument and the index can never run past the end of the table. */
static void
test_tokenizer_check_token(const char *token,
			   const char *const *expected_output,
			   unsigned int *outi)
{
	const char *expected = expected_output[*outi];

	test_assert_idx(expected != NULL && strcmp(token, expected) == 0, *outi);
	if (expected != NULL)
		(*outi)++;
}

/* Run _input through tok three times - all at once, one UTF-8 character at
   a time, and in randomly sized chunks - and verify every time that the
   tokens match the NULL-terminated group that starts at
   expected_output[first_outi].

   Returns the index following the group's NULL terminator, i.e. where the
   next input's group begins. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len, input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	/* no more input: keep calling with NULL/0 until nothing is returned */
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one byte at a time; the step is the byte length of the
	   UTF-8 character starting at input[i], so multi-byte characters are
	   never split */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* pick 1..remaining bytes, then round the chunk up to the
		   next UTF-8 character boundary */
		max = rand() % (input_len - i) + 1;
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* skip over the group's NULL terminator */
	return outi+1;
}
e2a88d59c0d47d63ce1ad5b1fd95e487124a3fd4Timo Sirainen
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainenstatic void
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen const char *const *expected_output)
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen{
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen unsigned int i, outi = 0;
baebb412a9a5a44b1756e01cfa3b99f5d8a846b6Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen expected_output, outi);
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen }
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen}
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainenstatic void test_fts_tokenizer_generic_only(void)
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen{
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen static const char *const expected_output[] = {
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "hello", "world", "And",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "there", "was", "text", "galor\xC3\xA9",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "abc", "example", "com", "Bar", "Baz",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "bar", "example", "org", "foo", "domain",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "1234567890123456789012345678ä",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "12345678901234567890123456789",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "123456789012345678901234567890",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "and", "longlonglongabcdefghijklmnopqr",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "1", NULL,
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen "word", "pre", "post", NULL,
12c6ef6f1268ed4d5b63709bb4215c481b4f078cTimo Sirainen
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "1234567890123456789012345678ä",
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen "123456789012345678901234567x'",
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "1234567890123456789012345678x'",
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen "1234567890123456789012345678x",
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen "1234567890123456789012345678x",
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen "word", "pre", "post", NULL,
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "you're'xyz", NULL,
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "hello", "world", "And",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "there", "was", "text", "galore",
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen "and", "more", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen NULL
b045b66988bfbaa2795791e42ee724fae6f0db1cAki Tuomi };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
f29756821a4c6b12b73e4a2a3e1c230117a43773Timo Sirainen const char *error;
9132f9df4e12ed5293c70957813aa3736444a13cTimo Sirainen
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch test_begin("fts tokenizer generic simple");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch test_tokenizer_inputs(tok, expected_output);
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch fts_tokenizer_unref(&tok);
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch test_end();
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch}
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch
/* NULL-terminated key/value settings list passed to fts_tokenizer_create()
   by the TR29 tests. */
const char *const tr29_settings[] = {
	"algorithm", "tr29",
	NULL
};
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch
c12d96f12cac9af464ab2e59046bd59b0c06b4eaTimo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
c12d96f12cac9af464ab2e59046bd59b0c06b4eaTimo Sirainen This definitely needs to be remapped. */
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Boschstatic void test_fts_tokenizer_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
a05fec120ecd8c4ed6331c42100cba42adf22893Stephan Bosch "hello", "world", "And",
ddbdc644a15f56f4b43596f1b8c0fc196c101445Timo Sirainen "there", "was", "text", "galor\xC3\xA9",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc", "example", "com", "Bar", "Baz",
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "bar", "example", "org", "foo", "domain",
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "1234567890123456789012345678ä",
d3d769026fae5d21c2d29614d3bc4579e8d79e81Timo Sirainen "12345678901234567890123456789",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "123456789012345678901234567890",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
c5e62353a11087958ea4e619660e084a613e1a37Timo Sirainen "word", "pre", "post", NULL,
f059a046515f4b2b15a6c2a10a6f12f6166e39a5Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678ä",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567x'",
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen "1234567890123456789012345678x'",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "1234567890123456789012345678x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "12345678901234567890123456789x",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "12345678901234567890123456789x",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "123456789012345678901234567890",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "123456789012345678901234567890",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen "word", "pre", "post", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "you're'xyz", NULL,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", "And",
b045b66988bfbaa2795791e42ee724fae6f0db1cAki Tuomi "there", "was", "text", "galore",
b045b66988bfbaa2795791e42ee724fae6f0db1cAki Tuomi "and", "more", NULL,
b045b66988bfbaa2795791e42ee724fae6f0db1cAki Tuomi
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", NULL,
56af9dd10e7e6caeaca64395bad3f882b28ecdffTimo Sirainen
56af9dd10e7e6caeaca64395bad3f882b28ecdffTimo Sirainen NULL
56af9dd10e7e6caeaca64395bad3f882b28ecdffTimo Sirainen };
56af9dd10e7e6caeaca64395bad3f882b28ecdffTimo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic TR29");
ad004e44be109684521494b5af2ad1da39b8bb27Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen test_tokenizer_inputs(tok, expected_output);
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen fts_tokenizer_unref(&tok);
d3d769026fae5d21c2d29614d3bc4579e8d79e81Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
a117008f03ad9e2d54258b30d3fb03ffa502a448Timo Sirainen
a117008f03ad9e2d54258b30d3fb03ffa502a448Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
6da2d4faed507f513c68b94bb56a13caeeb3ff4aTimo Sirainen{
6da2d4faed507f513c68b94bb56a13caeeb3ff4aTimo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc.dfg@example.com", "bar@example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address only");
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen fts_tokenizer_unref(&tok);
c7fca6cbb32388556d9f6d8313486cc4e4a3c058Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
5f1d689131a75c39f064cbd4202373e7edf78f18Josef 'Jeff' Sipek "Bar", "Baz", "bar", "example", "org", "bar@example.org",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
5a9e240ebf8d0daaf029973973b52e415148070bTimo Sirainen "foo", "foo", "domain", "foo@domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen const char *error;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&gen_tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
/* NULL-terminated key/value settings list that selects the "simple"
   algorithm explicitly. */
const char *const simple_settings[] = {
	"algorithm", "simple",
	NULL
};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent_simple(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
1093de32efb2a231949566d4bd8aa55a8f43fb70Timo Sirainen test_fts_tokenizer_address_parent("simple", simple_settings);
de754cb78f75e8b3b994cddafe41c9ed1467c33dTimo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_parent_tr29(void)
ab90f702ceedb7ba445a9a592be0b213b27cbafaStephan Bosch{
ab90f702ceedb7ba445a9a592be0b213b27cbafaStephan Bosch test_fts_tokenizer_address_parent("tr29", tr29_settings);
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen}
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainen
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen{
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
2aac7ca853f63b62ea79ef8eae9ded83ed6063a5Timo Sirainen static const char *const expected_output[] = {
9ddd3d7d8651985e373a6c48e0ddc76b8a4ef1c7Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
4de2a17e0a2aed3b57a6c1057329b6a132b56ae2Timo Sirainen "Bar", "Baz", "bar@example.org",
5d2e7ec2ea725c8a6a63f56b771e746f93e782ecTimo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen "foo", "foo@domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const settings[] = { "search", "", NULL };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer search email address + parent");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen /* make sure state is forgotten at EOF */
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
9f627b360ed38fdc54cb02ec5e67246c3f0d5b0fTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
bdb9f7f7fbf828fb85a393bd2803167b1bb8ff0dTimo Sirainen strcmp(token, "foo") == 0);
bdb9f7f7fbf828fb85a393bd2803167b1bb8ff0dTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
bdb9f7f7fbf828fb85a393bd2803167b1bb8ff0dTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen strcmp(token, "bar@baz") == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen strcmp(token, "foo") == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen /* test reset explicitly */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_reset(tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen strcmp(token, "b@c") == 0);
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&gen_tok);
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen test_end();
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen}
b84eff65e25ae86dfd6f798386577209b94838c6Timo Sirainen
fa02962b74d39e8d74c4c307c0210791b2f0a1caTimo Sirainenint main(void)
fa02962b74d39e8d74c4c307c0210791b2f0a1caTimo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static void (*test_functions[])(void) = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_find,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_tr29_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_address_only,
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen test_fts_tokenizer_address_parent_simple,
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen test_fts_tokenizer_address_parent_tr29,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_address_search,
c7eb1ffb7c73cb5d9c1316bbecd02947441a40d4Timo Sirainen NULL
2f90189c6ee66a17f7bf838a8eb8a69868630fb8Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen int ret;
b6b7a17731a917958b6479920b3fac5ca991db6aTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizers_init();
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen ret = test_run(test_functions);
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen fts_tokenizers_deinit();
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen return ret;
54533aa265f5c87730022cc7576090bc51370f97Timo Sirainen}
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen