test-fts-tokenizer.c revision fdf70410de49eadfbb77997bb60ebba19aee4752
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include "lib.h"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include "unichar.h"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include "test-common.h"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include "fts-tokenizer.h"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include "fts-tokenizer-private.h"
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen#include "fts-tokenizer-generic-private.h"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#include <stdlib.h>
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen#define TEST_INPUT_ADDRESS \
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen "Bar Baz <bar@example.org>" \
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org " \
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "foo, foo@domain"
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
/* Inputs for the generic tokenizer tests. The expected_output[] tables in
   the generic tokenizer test functions hold one NULL-terminated run of
   tokens per entry here, in the same order. */
static const char *test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\n"
	"And there\twas: text galore, "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678ä,"
	"12345678901234567890123456789ä,"
	"123456789012345678901234567890ä,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")"
	"3.14 3,14 last",

	/* a number directly followed by a trailing dot */
	"1.",

	/* apostrophes used both as quotes and inside a word */
	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* non-ASCII separators given as UTF-8 bytes:
	   U+FF01 (ef bc 81), U+2000 (e2 80 80),
	   U+205F (e2 81 9f) and U+205A (e2 81 9a).
	   The literal is split after each escape so that a following
	   hex-digit letter can never be absorbed into the \x sequence. */
	"hello\xEF\xBC\x81" "world\r\n"
	"And\xE2\x80\x80" "there\twas: text "
	"galore\xE2\x81\x9F" "and\xE2\x81\x9A" "more.\n\n",

	/* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E"
};
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainenstatic void test_fts_tokenizer_find(void)
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen{
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_begin("fts tokenizer find");
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_end();
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen}
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
/* Feed _input to tok in three different ways and check every time that the
   tokens produced match expected_output[] starting at index first_outi:

    1) the whole input in a single call,
    2) one UTF-8 character per call,
    3) randomly sized chunks made up of whole characters (as measured by
       uni_utf8_char_bytes()).

   The expected tokens for this input must end with a NULL entry. If the
   tokenizer produces more tokens than expected, the failure is recorded
   and the index stays on that NULL entry, so the sentinel is never passed
   to strcmp() and the expectations of later inputs are not consumed.

   Returns the index following the NULL terminator, i.e. the position where
   the expected tokens of the next input begin. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len, input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	/* flush the remaining tokens by passing no further input */
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one UTF-8 character at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_idx(expected_output[outi] != NULL &&
					strcmp(token, expected_output[outi]) == 0, outi);
			if (expected_output[outi] != NULL)
				outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* pick a size in 1..remaining bytes, then grow the chunk
		   character by character until it reaches that size, so a
		   multi-byte character is never split between two calls */
		max = rand() % (input_len - i) + 1;
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_idx(expected_output[outi] != NULL &&
					strcmp(token, expected_output[outi]) == 0, outi);
			if (expected_output[outi] != NULL)
				outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* skip over the NULL terminator */
	return outi+1;
}
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic void
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen const char *const *expected_output)
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen{
25ec868bd8b5375e1c1c4c3331d761667ddfe26cTimo Sirainen unsigned int i, outi = 0;
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen expected_output, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen }
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen}
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen{
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char *const expected_output[] = {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "hello", "world", "And",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "there", "was", "text", "galore",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "abc", "example", "com", "Bar", "Baz",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "bar", "example", "org", "foo", "domain",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "1234567890123456789012345678ä",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "12345678901234567890123456789",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "123456789012345678901234567890",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "1", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen "word", "pre", "post", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "hello", "world", "And",
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "there", "was", "text", "galore",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "and", "more", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "hello", "world", NULL,
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen NULL
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen };
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen struct fts_tokenizer *tok;
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen const char *error;
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_begin("fts tokenizer generic simple");
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_tokenizer_inputs(tok, expected_output);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizer_unref(&tok);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_end();
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen}
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
/* NULL-terminated key/value settings list that selects the "tr29"
   algorithm when creating a generic tokenizer. Used only by the tests in
   this file, hence internal linkage. */
static const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen This definitely needs to be remapped. */
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen{
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen static const char *const expected_output[] = {
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "hello", "world", "And",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "there", "was", "text", "galore",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "abc", "example", "com", "Bar", "Baz",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "bar", "example", "org", "foo", "domain",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "1234567890123456789012345678ä",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "12345678901234567890123456789",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "123456789012345678901234567890",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "and", "longlonglongabcdefghijklmnopqr",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "1", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "word", "pre", "post", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "hello", "world", "And",
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "there", "was", "text", "galore",
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "and", "more", NULL,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen "hello", "world", NULL,
c076ad69e28e7d41af83ada84e12019793ffcfa2Timo Sirainen
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen NULL
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen };
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen struct fts_tokenizer *tok;
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen const char *error;
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_begin("fts tokenizer generic TR29");
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_tokenizer_inputs(tok, expected_output);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizer_unref(&tok);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_end();
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen}
6adf683655750bcb809275cd65dc75fd12214198Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainenstatic void test_fts_tokenizer_address_only(void)
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen{
46631c1d903c409444b1b1c4a1d41a033c09ee37Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen static const char *const expected_output[] = {
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen "abc.dfg@example.com", "bar@example.org",
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen };
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen struct fts_tokenizer *tok;
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen const char *error;
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_begin("fts tokenizer email address only");
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen fts_tokenizer_unref(&tok);
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen test_end();
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen}
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainenstatic void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen{
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen static const char *const expected_output[] = {
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
d5e839aea288aceaddae28a1578cebda3c9e3b58Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen "foo", "foo", "domain", "foo@domain", NULL
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen };
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen const char *error;
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizer_unref(&tok);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizer_unref(&gen_tok);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_end();
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen}
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen
/* NULL-terminated key/value settings list that selects the "simple"
   algorithm when creating a generic tokenizer. Used only by the tests in
   this file, hence internal linkage. */
static const char *const simple_settings[] = {"algorithm", "simple", NULL};
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainenstatic void test_fts_tokenizer_address_parent_simple(void)
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen{
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_fts_tokenizer_address_parent("simple", simple_settings);
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen}
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainenstatic void test_fts_tokenizer_address_parent_tr29(void)
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen{
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_fts_tokenizer_address_parent("tr29", tr29_settings);
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen}
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen{
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen static const char *const expected_output[] = {
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "Bar", "Baz", "bar@example.org",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen "foo", "foo@domain", NULL
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen };
fb176cdc122707cda985ab3c09c02ccf3cec0af1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
153ed0fbca1f5f944b70937dfd71911db172ca97Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen const char *token, *error;
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_begin("fts tokenizer search email address + parent");
8a8a3b43987b5ade914f22765e51c9e3de8179d3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* make sure state is forgotten at EOF */
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen strcmp(token, "foo") == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen strcmp(token, "bar@baz") == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen strcmp(token, "foo") == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen /* test reset explicitly */
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen fts_tokenizer_reset(tok);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen strcmp(token, "b@c") == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen fts_tokenizer_unref(&tok);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizer_unref(&gen_tok);
cf636afb3826f0d8e15c248aa1fc04ce72820e08Timo Sirainen test_end();
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen}
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainenint main(void)
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen{
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen static void (*test_functions[])(void) = {
e44028b5df7045dd9e7f324175e73e3ff490cb5dTimo Sirainen test_fts_tokenizer_find,
d28179fd78550a58be44dcb1e3e830ab7d33172dTimo Sirainen test_fts_tokenizer_generic_only,
d28179fd78550a58be44dcb1e3e830ab7d33172dTimo Sirainen test_fts_tokenizer_generic_tr29_only,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_fts_tokenizer_address_only,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_fts_tokenizer_address_parent_simple,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_fts_tokenizer_address_parent_tr29,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen test_fts_tokenizer_address_search,
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen NULL
b3f4c31f1533e25380f49f77d5bb1251bf43db2aTimo Sirainen };
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen int ret;
bace943c67e6cd14ce6c994f533d82a3caad5bf1Timo Sirainen
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizers_init();
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen ret = test_run(test_functions);
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen fts_tokenizers_deinit();
baf346e71ebd7b44fcba4b48f4d39845453b778bTimo Sirainen return ret;
c096257fbdaf4b9fcf8eb97aae94afdbb4e71ed4Timo Sirainen}
2f4f603d4cebab2cc956c72164efb02da83515c5Timo Sirainen