/* test-fts-tokenizer.c revision fdf70410de49eadfbb77997bb60ebba19aee4752 */
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen#include "lib.h"
767431e5084a037c4dbefdf30ebfa03c84b1f449Timo Sirainen#include "unichar.h"
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen#include "test-common.h"
1c633f71ec2060e5bfa500a97f34cd881a958ecdTimo Sirainen#include "fts-tokenizer.h"
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen#include "fts-tokenizer-private.h"
a8fe899601735459641edae975c0fa08be8482e2Timo Sirainen#include "fts-tokenizer-generic-private.h"
885a3c2287ae3e5827aa580ea06b231de38abb47Timo Sirainen
5fb3f13537dffd15a31e997da133a721c0728af8Timo Sirainen#include <stdlib.h>
97437f768d1a3e6134fed1971202803fd250eef2Timo Sirainen
bb25bed75eefd011138ebf1b8e033fc8ef55ca74Timo Sirainen#define TEST_INPUT_ADDRESS \
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "Bar Baz <bar@example.org>" \
cf63dc8723b971cc80638fccbf494d961cbafc7fTimo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "foo, foo@domain"
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen
/* Inputs fed to the generic tokenizer tests. Every string is tokenized as a
   separate input; the expected_output tables of the tests hold one
   NULL-terminated token list per string, in the same order. */
static const char *const test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galore, "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678ä,"
	"12345678901234567890123456789ä,"
	"123456789012345678901234567890ä,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	"1.",

	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* whitespace: with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
	   U+205A (e2 81 9a) and U+205F (e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E"
};
62d0db14d2c5008758983c28d242ec158baabf9eTimo Sirainen
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainenstatic void test_fts_tokenizer_find(void)
42681892b206d13cb87a5f526d2bf4ff3f2f4af7Timo Sirainen{
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen test_begin("fts tokenizer find");
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
98e8f95ffee4eacca72b1bcf082f2c735592301bTimo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
98e8f95ffee4eacca72b1bcf082f2c735592301bTimo Sirainen test_end();
cf63dc8723b971cc80638fccbf494d961cbafc7fTimo Sirainen}
cf63dc8723b971cc80638fccbf494d961cbafc7fTimo Sirainen
/* Compare one produced token against expected_output[*outi].

   The expected list is NULL-terminated. If the tokenizer produced more
   tokens than were expected, the failure is recorded without handing the
   NULL terminator to strcmp() and without advancing the index past it, so
   the caller never reads beyond this input's part of the list. */
static void
test_tokenizer_check_token(const char *token,
			   const char *const *expected_output,
			   unsigned int *outi)
{
	const char *expected = expected_output[*outi];

	test_assert_idx(expected != NULL && strcmp(token, expected) == 0, *outi);
	if (expected != NULL)
		(*outi)++;
}

/* Feed _input to tok in three different ways and check that each of them
   yields exactly the tokens in expected_output, starting at index
   first_outi and ending at the next NULL entry:

    1. the whole input in a single call,
    2. one UTF-8 character per call,
    3. randomly sized chunks that always end on a character boundary.

   _input is assumed to be valid UTF-8, because chunk lengths are derived
   from uni_utf8_char_bytes() of each character's first byte.

   Returns the index following the NULL that terminates this input's
   expected tokens, i.e. where the next input's expectations begin. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len, input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	/* drain the remaining tokens by passing NULL/0 as the input */
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one byte at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* pick 1..remaining bytes, then round the chunk up to the
		   end of the character it stops in */
		max = rand() % (input_len - i) + 1;
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0)
			test_tokenizer_check_token(token, expected_output, &outi);
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_check_token(token, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	return outi+1;
}
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainenstatic void
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen const char *const *expected_output)
1db62753d9e3b5d71018889c8ef0a3722a307455Timo Sirainen{
ad58b50aef8125981ebdbc89513236558bcccf60Timo Sirainen unsigned int i, outi = 0;
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen
344bb4abc3acb63d04131cb63f1503a6ca01fb40Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
eff34528733a7893b2914a26023aac227ef4ae7fTimo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
344bb4abc3acb63d04131cb63f1503a6ca01fb40Timo Sirainen expected_output, outi);
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen }
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
0779e926687b319fe1bcc0f1010ba7f88023e789Timo Sirainen}
0f9a8663b0ff6fe30389d02284a2b002c40914ebTimo Sirainen
bd417d416988d11a6b555b9aa57779e7ed976951Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
a9efdb661eb7a8a33aacfdcc3486dcc675a21543Timo Sirainen{
a9efdb661eb7a8a33aacfdcc3486dcc675a21543Timo Sirainen static const char *const expected_output[] = {
fab850a6aee4aaef4f4795bd7946807a3ba45041Timo Sirainen "hello", "world", "And",
bd417d416988d11a6b555b9aa57779e7ed976951Timo Sirainen "there", "was", "text", "galore",
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "abc", "example", "com", "Bar", "Baz",
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "bar", "example", "org", "foo", "domain",
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "1234567890123456789012345678ä",
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen "12345678901234567890123456789",
5685e60e62a8e0d368bd28a1526056f97bbba022Timo Sirainen "123456789012345678901234567890",
5685e60e62a8e0d368bd28a1526056f97bbba022Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
5685e60e62a8e0d368bd28a1526056f97bbba022Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
0779e926687b319fe1bcc0f1010ba7f88023e789Timo Sirainen
72bc08129fb0aaec8144cc183a998ccc426fef9eTimo Sirainen "1", NULL,
72bc08129fb0aaec8144cc183a998ccc426fef9eTimo Sirainen
c14c5561e85853d91280235a7611b6050feaebb2Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
c14c5561e85853d91280235a7611b6050feaebb2Timo Sirainen "word", "pre", "post", NULL,
c14c5561e85853d91280235a7611b6050feaebb2Timo Sirainen
71df09024cea5f2faa93da3bb9513ee96ba6bf22Timo Sirainen "hello", "world", "And",
72bc08129fb0aaec8144cc183a998ccc426fef9eTimo Sirainen "there", "was", "text", "galore",
2cc88ff507e244faa63683f804833b321a62c665Timo Sirainen "and", "more", NULL,
71df09024cea5f2faa93da3bb9513ee96ba6bf22Timo Sirainen
0779e926687b319fe1bcc0f1010ba7f88023e789Timo Sirainen "hello", "world", NULL,
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen NULL
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen };
32c779d5d0b3dabc697408e6b5d9d2e652180b33Timo Sirainen struct fts_tokenizer *tok;
51327f2489a4e0e615eb9f7d921473cf8512bb79Timo Sirainen const char *error;
51327f2489a4e0e615eb9f7d921473cf8512bb79Timo Sirainen
97afa073e3e1e0301dc41173ec34beb08edcce50Timo Sirainen test_begin("fts tokenizer generic simple");
97afa073e3e1e0301dc41173ec34beb08edcce50Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
636f017be100bce67d66fd3ae1544a47681efd33Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
b8b085f7bc6f1c0367802a9f00062bbbd981690dTimo Sirainen
b8b085f7bc6f1c0367802a9f00062bbbd981690dTimo Sirainen test_tokenizer_inputs(tok, expected_output);
94ba4820927b906b333e39445c1508a29387c3aaTimo Sirainen fts_tokenizer_unref(&tok);
b932ee7fbbec6e79b777dcc7ba613b9e99f8337bTimo Sirainen test_end();
b932ee7fbbec6e79b777dcc7ba613b9e99f8337bTimo Sirainen}
cf63dc8723b971cc80638fccbf494d961cbafc7fTimo Sirainen
/* NULL-terminated settings list passed to fts_tokenizer_create() by the
   tests that want the generic tokenizer to use the "tr29" algorithm. */
const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
23878bd03d1de531e3261a25598beec621351910Timo Sirainen
23878bd03d1de531e3261a25598beec621351910Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
23878bd03d1de531e3261a25598beec621351910Timo Sirainen This definitely needs to be remapped. */
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen{
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen static const char *const expected_output[] = {
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "hello", "world", "And",
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "there", "was", "text", "galore",
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "abc", "example", "com", "Bar", "Baz",
bb25bed75eefd011138ebf1b8e033fc8ef55ca74Timo Sirainen "bar", "example", "org", "foo", "domain",
bb25bed75eefd011138ebf1b8e033fc8ef55ca74Timo Sirainen "1234567890123456789012345678ä",
5fbccc935e3f7b916aa7c6e302a212821072e83aTimo Sirainen "12345678901234567890123456789",
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen "123456789012345678901234567890",
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
2a15ce3abe14099b94535f6dfc2d4ee023a7c455Timo Sirainen
eed1ec3ac96fddb8d9e4fa2af6e760ee42801fb8Timo Sirainen "1", NULL,
94ba4820927b906b333e39445c1508a29387c3aaTimo Sirainen
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "quoted", "text", "word", "hlo", "words", "you're", "bad",
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "word", "pre", "post", NULL,
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "hello", "world", "And",
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "there", "was", "text", "galore",
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "and", "more", NULL,
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi "hello", "world", NULL,
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi NULL
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomi };
89d31290dab6e4bde08b8a118121f008154772e9Aki Tuomi struct fts_tokenizer *tok;
5ba6009f4e5493c4e6be9ffb3134525004a7975cAki Tuomi const char *error;
71f4549303dc1691382748a096c2ada9d2a1a9feAki Tuomi
844929a7bd6e9d21f0a8cdb3a19f4620a17cdecaAki Tuomi test_begin("fts tokenizer generic TR29");
e1d08b1c39c63de92f0e914064a508bbf6c6fcc5Aki Tuomi test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
977f08d645b1779527c0938bbb848b61064839c3Aki Tuomi test_tokenizer_inputs(tok, expected_output);
a893aaa999856b1ba6e4541890016767aaa283c7Aki Tuomi fts_tokenizer_unref(&tok);
a893aaa999856b1ba6e4541890016767aaa283c7Aki Tuomi test_end();
a893aaa999856b1ba6e4541890016767aaa283c7Aki Tuomi}
ae8c89c81de5d867bd1359fb9c438dd8771210c7Aki Tuomi
0368f3b0ae3fc1ea892da5c5ec02c05c0c3989afAki Tuomistatic void test_fts_tokenizer_address_only(void)
14af7be4aa26d55c341cd6efe32bb2add2c39830Aki Tuomi{
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen static const char *const expected_output[] = {
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen "abc.dfg@example.com", "bar@example.org",
14102a0c5db8828ca8c7751ec96587fadc97a0bcTimo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
14af7be4aa26d55c341cd6efe32bb2add2c39830Aki Tuomi };
c45a841bee3f42ec6524b8f62c3fd457115c3f97Timo Sirainen struct fts_tokenizer *tok;
c45a841bee3f42ec6524b8f62c3fd457115c3f97Timo Sirainen const char *error;
c45a841bee3f42ec6524b8f62c3fd457115c3f97Timo Sirainen
a6ab8f00351265e35b79f3a22b1f5a4978ae5c35Timo Sirainen test_begin("fts tokenizer email address only");
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
fts_tokenizer_unref(&tok);
test_end();
}
/* Test the email-address tokenizer stacked on a generic parent tokenizer
   that is created with the given settings. name only appears in the test
   title. The expected tokens are the parent's words, with each complete
   address following the words it is made of. */
static void
test_fts_tokenizer_address_parent(const char *name,
				  const char * const *settings)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
		"foo", "foo", "domain", "foo@domain", NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char *error;
	const char *title;

	title = t_strdup_printf("fts tokenizer email address + parent %s", name);
	test_begin(title);

	/* parent first, then the address tokenizer on top of it */
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);

	test_tokenizer_inputoutput(tok, input, expected_output, 0);

	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
/* NULL-terminated settings list passed to the generic tokenizer by the
   "simple" variant of the address + parent test. */
const char *const simple_settings[] = {
	"algorithm",
	"simple",
	NULL
};
/* Address + parent test with the parent created from simple_settings. */
static void
test_fts_tokenizer_address_parent_simple(void)
{
	test_fts_tokenizer_address_parent("simple", simple_settings);
}
/* Address + parent test with the parent created from tr29_settings. */
static void
test_fts_tokenizer_address_parent_tr29(void)
{
	test_fts_tokenizer_address_parent("tr29", tr29_settings);
}
/* Test the email-address tokenizer created with the "search" setting on
   top of a generic parent. In contrast to the non-search parent test, the
   expected output contains each complete address without the separate
   words it consists of. Afterwards the tokenizer is reused for several
   short inputs to check that no state survives the end of an input or an
   explicit reset. */
static void
test_fts_tokenizer_address_search(void)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
		"Bar", "Baz", "bar@example.org",
		"Foo", "Bar", "comment", "foo.bar@host.example.org",
		"foo", "foo@domain", NULL
	};
	static const char *const settings[] = { "search", "", NULL };
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer search email address + parent");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);

	/* make sure state is forgotten at EOF */

	/* a plain word */
	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* a complete address */
	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "bar@baz") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* an address cut off after the '@': only the word is returned */
	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* test reset explicitly */
	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
	fts_tokenizer_reset(tok);
	/* the "a" fed before the reset must not show up in the result */
	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "b@c") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
/* Test program entry point: initializes the tokenizers, runs all test
   functions of this file through test_run() and returns its result after
   deinitializing the tokenizers again. */
int main(void)
{
	/* NULL-terminated list of the tests, in execution order */
	static void (*test_functions[])(void) = {
		test_fts_tokenizer_find,
		test_fts_tokenizer_generic_only,
		test_fts_tokenizer_generic_tr29_only,
		test_fts_tokenizer_address_only,
		test_fts_tokenizer_address_parent_simple,
		test_fts_tokenizer_address_parent_tr29,
		test_fts_tokenizer_address_search,
		NULL
	};
	int status;

	fts_tokenizers_init();
	status = test_run(test_functions);
	fts_tokenizers_deinit();

	return status;
}