test-fts-tokenizer.c revision 72c4ef3b44c50c662b37bba93b463b0caeb63a4f
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen#include "unichar.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "test-common.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "fts-tokenizer-private.h"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen#include "fts-tokenizer-generic-private.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include <stdlib.h>
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Input shared by the email address tokenizer tests below. Note that there
   is deliberately no separator between "<bar@example.org>" and "Foo Bar". */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ " \
	"Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)" "foo.bar@host.example.org " \
	"foo, " "foo@domain"
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen
/* Inputs shared by the generic tokenizer tests. test_tokenizer_inputs()
   feeds each string to the tokenizer separately, in this order, so every
   expected_output table used with it must contain one NULL-terminated group
   of tokens per entry here.

   The array itself is const as well as the strings, matching the other
   tables in this file. */
static const char *const test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galore, "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678ä,"
	"12345678901234567890123456789ä,"
	"123456789012345678901234567890ä,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	/* a number followed by a trailing dot */
	"1.",

	/* apostrophes: standalone, as quotes around words, inside a word
	   and as prefix/suffix runs */
	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81), U+2000(e2 80 80),
	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E"
};
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainenstatic void test_fts_tokenizer_find(void)
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen{
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_begin("fts tokenizer find");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_end();
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen}
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen
/* Run _input through tok three times and compare the produced tokens with
   the NULL-terminated group starting at expected_output[first_outi]:

    1. the whole input in a single call,
    2. one character at a time (length from uni_utf8_char_bytes()),
    3. in chunks of random size that always end on a character boundary.

   After each pass the remaining tokens are flushed and the group is checked
   to have been consumed completely.

   tok             - tokenizer under test
   _input          - NUL-terminated input text
   expected_output - table of expected tokens, one NULL-terminated group
                     per input
   first_outi      - index of this input's first expected token

   Returns the index following the position reached by the last pass; when
   the tokens matched, that is the first token of the next group.

   If the tokenizer produces more tokens than expected, each surplus token
   is reported as a failed assertion and the index stays on the group's
   terminating NULL. This way NULL is never handed to strcmp() and the
   table is never read past the end of the group. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int outi;
	/* size_t avoids silently narrowing strlen()'s result */
	size_t i, max, char_len, input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one character at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_idx(expected_output[outi] != NULL &&
					strcmp(token, expected_output[outi]) == 0, outi);
			if (expected_output[outi] != NULL)
				outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* pick 1..remaining bytes, then round up to whole characters */
		max = rand() % (input_len - i) + 1;
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_idx(expected_output[outi] != NULL &&
					strcmp(token, expected_output[outi]) == 0, outi);
			if (expected_output[outi] != NULL)
				outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_idx(expected_output[outi] != NULL &&
				strcmp(token, expected_output[outi]) == 0, outi);
		if (expected_output[outi] != NULL)
			outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* skip over the group's terminating NULL */
	return outi+1;
}
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainenstatic void
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen const char *const *expected_output)
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen{
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen unsigned int i, outi = 0;
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen expected_output, outi);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen }
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen}
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "abc", "example", "com", "Bar", "Baz",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "bar", "example", "org", "foo", "domain",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "1234567890123456789012345678ä",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
78f87ea1d30f3f54bdf8560ea947ab7ee094283aTeemu Huovila "and", "longlonglongabcdefghijklmnopqr",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen "1", NULL,
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "word", "pre", "post", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and", "more", NULL,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello", "world", NULL,
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_begin("fts tokenizer generic simple");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputs(tok, expected_output);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Settings (key, value, NULL terminator) selecting the TR29 boundary
   algorithm for the generic tokenizer. Only used within this file, so it
   gets internal linkage like the other tables here. */
static const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen This definitely needs to be remapped. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "abc", "example.com", "Bar", "Baz",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "bar", "example.org", "foo", "domain",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "1234567890123456789012345678ä",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "12345678901234567890123456789",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "123456789012345678901234567890",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "more", "Hello", "world", "3.14", "3,14", "last", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen "1", NULL,
b04e76cbc807707d299055be79500f8ff131da43Timo Sirainen
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "quoted", "text", "word", "hlo", "words", "you're", "bad",
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila "word", "pre", "post", NULL,
0c5854b6891c59c1c3f443569bc823d7db571582Teemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "hello", "world", "And",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "there", "was", "text", "galore",
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "and", "more", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen "hello", "world", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_begin("fts tokenizer generic TR29");
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputs(tok, expected_output);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_only(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "abc.dfg@example.com", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "foo.bar@host.example.org", "foo@domain", NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address only");
908c417cc19ec4a2a01db542498c13ade3943601Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainenstatic void test_fts_tokenizer_address_parent(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar", "example", "org", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "foo", "foo", "domain", "foo@domain", NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen const char *error;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer email address + parent");
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_tokenizer_unref(&gen_tok);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_end();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen{
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char input[] = TEST_INPUT_ADDRESS;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen static const char *const expected_output[] = {
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Bar", "Baz", "bar@example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "Foo", "Bar", "comment", "foo.bar@host.example.org",
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen "foo", "foo@domain", NULL
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen };
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen static const char *const settings[] = { "search", "", NULL };
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen const char *token, *error;
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_begin("fts tokenizer search email address + parent");
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
62fc0b4f07eb6f18a3bff4b1fccb636e6fae3cf4Timo Sirainen test_tokenizer_inputoutput(tok, input, expected_output, 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* make sure state is forgotten at EOF */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "foo") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "bar@baz") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "foo") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
c4b772bfbdafe68ac1a0076eab26cd681f8e5046Timo Sirainen
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen /* test reset explicitly */
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen fts_tokenizer_reset(tok);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen strcmp(token, "b@c") == 0);
8b1a9a4d63b0abccdf7cb1acb8359d5396dd657bTimo Sirainen test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
2730605833442b5ddcb261f90b8375fc98201e35Timo Sirainen
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen fts_tokenizer_unref(&tok);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen fts_tokenizer_unref(&gen_tok);
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_end();
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen}
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenint main(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen static void (*test_functions[])(void) = {
f5c0d5cada4da23a167c38426d0c481a3e1d5583Timo Sirainen test_fts_tokenizer_find,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_tokenizer_generic_only,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_tokenizer_generic_tr29_only,
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_fts_tokenizer_address_only,
568fec5b1e629f25d288b48007485b9aa4a018b1Timo Sirainen test_fts_tokenizer_address_parent,
e4bf76afb82ea28ec9d06823fa7deed5f8277183Timo Sirainen test_fts_tokenizer_address_search,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen NULL
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen };
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen int ret;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen fts_tokenizers_init();
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen ret = test_run(test_functions);
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen fts_tokenizers_deinit();
b1965419f329eb7cf78ee39e7c5942462eabb256Timo Sirainen return ret;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}