test-fts-tokenizer.c revision 5c97732871842800816aea0215c56bf701f623a6
2454dfa32c93c20a8522c6ed42fe057baaac9f9aStephan Bosch/* Copyright (c) 2014-2017 Dovecot authors, see the included COPYING file */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "lib.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "unichar.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "str.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "test-common.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "fts-tokenizer.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "fts-tokenizer-common.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "fts-tokenizer-private.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen#include "fts-tokenizer-generic-private.h"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
/* Address-heavy input shared by the email-address tokenizer tests.
   The fragments are joined by string-literal concatenation, so each one
   except the last is meant to end with a trailing space. Note that the
   "Bar Baz <bar@example.org>" fragment has none and therefore runs straight
   into "Foo Bar". The expected-output tables of the address tests are
   written against exactly this text, so keep the fragments byte-for-byte
   as they are. */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)foo.bar@host.example.org " \
	"foo, foo@domain " \
	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
	"trailing, period@blue.com. " \
	"multi-trialing, mul@trail.com..... " \
	"m@s " \
	"hypen@hypen-hypen.com " \
	"hypen@hypen-hypen-sick.com.-"
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
/* Inputs for the generic tokenizer tests. Each array element is one input;
   the expected-output tables in the tests hold one NULL-terminated group of
   tokens per element, in the same order. */
static const char *const test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678\xC3\xA4,"
	"12345678901234567890123456789\xC3\xA4,"
	"123456789012345678901234567890\xC3\xA4,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	"1.",

	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	"'1234567890123456789012345678\xC3\xA4,"
	"123456789012345678901234567x'\xC3\xA4,"
	"1234567890123456789012345678x're,"
	"1234567890123456789012345678x',"
	"1234567890123456789012345678x'',"
	"12345678901234567890123456789x',"
	"12345678901234567890123456789x'',"
	"123456789012345678901234567890x',"
	"123456789012345678901234567890x'',"
	/* NOTE: no comma separates the literal above from the one below, so
	   the compiler concatenates them and the smart-quote text is part of
	   the same input. The expected-output tables rely on this (they have
	   no NULL terminator between the two groups), so adding a comma here
	   requires updating every table as well. */

	/* \xe28099 = U+2019 is a smart quote, sometimes used as an apostrophe */
	"\xE2\x80\x99 \xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99\xE2\x80\x99\xE2\x80\x99 \xE2\x80\x99quoted text\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99 \xE2\x80\x99hlo words\xE2\x80\x99 you\xE2\x80\x99re78901234567890123456789012 bad\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99word\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99pre post\xE2\x80\x99\xE2\x80\x99\xE2\x80\x99",

	"you\xE2\x80\x99re\xE2\x80\x99xyz",

	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E",

	/* TR29 WB5a */
	"l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
};
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Boschstatic void test_fts_tokenizer_find(void)
1e2b3bd82f2d4fbae0963f4a220df30b7b5ae628Timo Sirainen{
8c1199cac76762101a2ca3ae66443b6b0dc28683Stephan Bosch test_begin("fts tokenizer find");
1e2b3bd82f2d4fbae0963f4a220df30b7b5ae628Timo Sirainen test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
60670187b0dd0e7f23f99a58feab11b862ad77acStephan Bosch test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
60670187b0dd0e7f23f99a58feab11b862ad77acStephan Bosch test_end();
60670187b0dd0e7f23f99a58feab11b862ad77acStephan Bosch}
60670187b0dd0e7f23f99a58feab11b862ad77acStephan Bosch
/* Check one emitted token against expected_output[*outi] and advance *outi. */
static void
test_tokenizer_expect_token(const char *token,
			    const char *const *expected_output,
			    unsigned int *outi)
{
	if (expected_output[*outi] == NULL) {
		/* More tokens were emitted than the group lists. Record the
		   failure but stay on the terminator: stepping over it would
		   compare against the following group and could eventually
		   read past the end of the table. */
		test_assert_idx(expected_output[*outi] != NULL, *outi);
		return;
	}
	test_assert_strcmp(token, expected_output[*outi]);
	(*outi)++;
}

/* Hand one chunk of input to the tokenizer and check every token it
   returns for that chunk. */
static void
test_tokenizer_feed(struct fts_tokenizer *tok, const unsigned char *data,
		    size_t size, const char *const *expected_output,
		    unsigned int *outi)
{
	const char *token, *error;

	while (fts_tokenizer_next(tok, data, size, &token, &error) > 0)
		test_tokenizer_expect_token(token, expected_output, outi);
}

/* Finish the input with fts_tokenizer_final(), check the remaining tokens
   and verify that the expected group has been consumed completely. */
static void
test_tokenizer_finish(struct fts_tokenizer *tok,
		      const char *const *expected_output, unsigned int *outi)
{
	const char *token, *error;

	while (fts_tokenizer_final(tok, &token, &error) > 0)
		test_tokenizer_expect_token(token, expected_output, outi);
	test_assert_idx(expected_output[*outi] == NULL, *outi);
}

/* Run _input through tok three times - in one piece, one UTF-8 character at
   a time, and in randomly sized chunks - and check each time that the
   emitted tokens equal the NULL-terminated group that starts at
   expected_output[first_outi].

   Returns the index following the group's NULL terminator, i.e. the index
   where the next group starts. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	unsigned int i, outi, max, char_len;
	size_t input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	test_tokenizer_feed(tok, input, input_len, expected_output, &outi);
	/* This pass ends the input through fts_tokenizer_next() with no data
	   (as the original test did); the other two passes use
	   fts_tokenizer_final(). */
	test_tokenizer_feed(tok, NULL, 0, expected_output, &outi);
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one byte at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* the step is one whole UTF-8 character, so a multibyte
		   sequence is never split between two calls */
		char_len = uni_utf8_char_bytes(input[i]);
		test_tokenizer_feed(tok, input + i, char_len,
				    expected_output, &outi);
	}
	test_tokenizer_finish(tok, expected_output, &outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		max = rand() % (input_len - i) + 1;
		/* grow the chunk character by character until it covers at
		   least max bytes, so it always ends on a character boundary */
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		test_tokenizer_feed(tok, input + i, char_len,
				    expected_output, &outi);
	}
	test_tokenizer_finish(tok, expected_output, &outi);

	return outi+1;
}
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic void
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainentest_tokenizer_inputs(struct fts_tokenizer *tok,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const char *const *expected_output)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen{
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen unsigned int i, outi = 0;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen for (i = 0; i < N_ELEMENTS(test_inputs); i++) {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen outi = test_tokenizer_inputoutput(tok, test_inputs[i],
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen expected_output, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen }
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert_idx(expected_output[outi] == NULL, outi);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen}
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen{
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen static const char *const expected_output[] = {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello", "world", "And",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "there", "was", "text", "galor\xC3\xA9",
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "abc", "example", "com", "Bar", "Baz",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "bar", "example", "org", "foo", "domain",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "more", "Hello", "world", "3", "14", "3", "14", "last", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1", NULL,
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "word", "pre", "post", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "word", "pre", "post", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "you're'xyz", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello", "world", "And",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "there", "was", "text", "galore",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "and", "more", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello", "world", NULL,
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen NULL
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen };
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen struct fts_tokenizer *tok;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const char *error;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_begin("fts tokenizer generic simple");
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_tokenizer_inputs(tok, expected_output);
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen fts_tokenizer_unref(&tok);
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_end();
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen}
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
/* NULL-terminated settings list ("algorithm" = "tr29") passed to
   fts_tokenizer_create() for the generic tokenizer in the TR29 tests. */
const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen This definitely needs to be remapped. */
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen{
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen static const char *const expected_output[] = {
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello", "world", "And",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "there", "was", "text", "galor\xC3\xA9",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "abc", "example", "com", "Bar", "Baz",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "bar", "example", "org", "foo", "domain",
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "1234567890123456789012345678\xC3\xA4",
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "12345678901234567890123456789",
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "and", "longlonglongabcdefghijklmnopqr",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen "1", NULL,
83228b3f9f6ee8c62a61902e0203af9760f7b9b7Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're", "bad",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "word", "pre", "post", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678\xC3\xA4",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x'",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "1234567890123456789012345678x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "12345678901234567890123456789x",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "123456789012345678901234567890",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen "word", "pre", "post", NULL,
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "you're'xyz", NULL,
7b032348d7bbb93ff96188289d3dfc1899b9abb3Josef 'Jeff' Sipek
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "hello", "world", "And",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "there", "was", "text", "galore",
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen "and", "more", NULL,
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen "hello", "world", NULL,
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
f32c6ed9db6f4c535f97a2020401572efc8abf86Timo Sirainen NULL
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen };
b596cac264eaa0fbd6cd74a279d58accccb7405bTimo Sirainen struct fts_tokenizer *tok;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen const char *error;
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen test_begin("fts tokenizer generic TR29");
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen test_tokenizer_inputs(tok, expected_output);
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen fts_tokenizer_unref(&tok);
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen test_end();
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen}
f818f91a2e6ee003aaa83323acd74008aa1276d9Timo Sirainen
/* NULL-terminated settings list ("algorithm" = "tr29", "wb5a" = "yes")
   passed to fts_tokenizer_create() by the TR29 + WB5a test. */
const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
5ef28f68edef46f69961b19b7c1dcd8ec5a955e8Timo Sirainen
/* TODO: U+206F is in the "Format" category and is therefore currently not
   treated as a word break. This definitely needs to be remapped. */
/* Run all test_inputs through the generic tokenizer created with
   tr29_settings_wb5a. The table has one NULL-terminated group per
   test_inputs element; compared with the plain TR29 table only the last
   group differs ("l", "homme", ... instead of "l'homme", ...). */
static void test_fts_tokenizer_generic_tr29_wb5a(void)
{
	static const char *const wb5a_tokens[] = {
		/* generic things and word truncation */
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,

		"1", NULL,

		/* ASCII apostrophes */
		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		/* truncation next to apostrophes, immediately followed (no
		   NULL in between) by the smart-quote tokens */
		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",
		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		/* Unicode whitespace */
		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		/* trailing U+FF0E */
		"hello", "world", NULL,

		/* WB5a */
		"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,

		NULL
	};
	struct fts_tokenizer *wb5a_tok;
	const char *create_error;
	int create_ret;

	test_begin("fts tokenizer generic TR29 with WB5a");
	create_ret = fts_tokenizer_create(fts_tokenizer_generic, NULL,
					  tr29_settings_wb5a, &wb5a_tok,
					  &create_error);
	test_assert(create_ret == 0);
	test_tokenizer_inputs(wb5a_tok, wb5a_tokens);
	fts_tokenizer_unref(&wb5a_tok);
	test_end();
}
/* Run TEST_INPUT_ADDRESS through an email-address tokenizer that has no
   parent tokenizer; the expected tokens are the addresses only. */
static void test_fts_tokenizer_address_only(void)
{
	static const char *const expected_addresses[] = {
		"abc.dfg@example.com", "bar@example.org",
		"foo.bar@host.example.org", "foo@domain",
		"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"period@blue.com", /* trailing period '.' in email */
		"mul@trail.com",
		"m@s", /* one letter local-part and domain name */
		"hypen@hypen-hypen.com",
		"hypen@hypen-hypen-sick.com",
		NULL
	};
	static const char address_input[] = TEST_INPUT_ADDRESS;
	struct fts_tokenizer *addr_tok;
	const char *create_error;
	int create_ret;

	test_begin("fts tokenizer email address only");
	create_ret = fts_tokenizer_create(fts_tokenizer_email_address, NULL,
					  NULL, &addr_tok, &create_error);
	test_assert(create_ret == 0);
	/* single group, so the returned next-group index is not needed */
	(void)test_tokenizer_inputoutput(addr_tok, address_input,
					 expected_addresses, 0);
	fts_tokenizer_unref(&addr_tok);
	test_end();
}
/* Run TEST_INPUT_ADDRESS through an email-address tokenizer stacked on a
   generic tokenizer created with the given settings. The expected tokens
   are the word tokens and, after the words of each address, the full
   address itself.

   name is only used to label the test. */
static void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
{
	static const char *const expected_tokens[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
		"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"trailing", "period", "blue", "com", "period@blue.com",
		"multi", "trialing", "mul", "trail", "com", "mul@trail.com",
		"m", "s", "m@s",
		"hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
		"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
		NULL
	};
	static const char address_input[] = TEST_INPUT_ADDRESS;
	struct fts_tokenizer *parent_tok, *addr_tok;
	const char *create_error;
	const char *test_name;

	test_name = t_strdup_printf("fts tokenizer email address + parent %s", name);
	test_begin(test_name);

	/* the generic tokenizer is created first and handed to the
	   email-address tokenizer as its parent */
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings,
					 &parent_tok, &create_error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, parent_tok,
					 NULL, &addr_tok, &create_error) == 0);

	(void)test_tokenizer_inputoutput(addr_tok, address_input,
					 expected_tokens, 0);

	fts_tokenizer_unref(&addr_tok);
	fts_tokenizer_unref(&parent_tok);
	test_end();
}
/* NULL-terminated settings list ("algorithm" = "simple") handed to the
   generic tokenizer by the "address + parent simple" test. */
const char *const simple_settings[] = {
	"algorithm", "simple",
	NULL
};
/* Address + parent test with the parent generic tokenizer created from
   simple_settings. Exists as a void(void) wrapper so it can be listed in
   main()'s test_functions table. */
static void test_fts_tokenizer_address_parent_simple(void)
{
test_fts_tokenizer_address_parent("simple", simple_settings);
}
/* Address + parent test with the parent generic tokenizer created from
   tr29_settings. Exists as a void(void) wrapper so it can be listed in
   main()'s test_functions table. */
static void test_fts_tokenizer_address_parent_tr29(void)
{
test_fts_tokenizer_address_parent("tr29", tr29_settings);
}
static void test_fts_tokenizer_address_search(void)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
"Bar", "Baz", "bar@example.org",
"Foo", "Bar", "comment", "foo.bar@host.example.org",
"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
"trailing", "period@blue.com",
"multi", "trialing", "mul@trail.com",
"m@s",
"hypen@hypen-hypen.com",
"hypen@hypen-hypen-sick.com",
NULL
};
static const char *const settings[] = { "search", "", NULL };
struct fts_tokenizer *tok, *gen_tok;
const char *token, *error;
test_begin("fts tokenizer search email address + parent");
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* make sure state is forgotten at EOF */
test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "foo") == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "bar@baz") == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "foo") == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
/* test reset explicitly */
test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
fts_tokenizer_reset(tok);
test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "b@c") == 0);
test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
fts_tokenizer_unref(&tok);
fts_tokenizer_unref(&gen_tok);
test_end();
}
/* For each byte string, let fts_tokenizer_delete_trailing_partial_char()
   adjust its length and compare the result with the expected length. */
static void test_fts_tokenizer_delete_trailing_partial_char(void)
{
	static const struct {
		const char *str;
		unsigned int truncated_len;
	} cases[] = {
		/* non-truncated: the length is expected to stay as it is */
		{ "\x7f", 1 },
		{ "\xC2\x80", 2 },
		{ "\xE0\x80\x80", 3 },
		{ "\xF0\x80\x80\x80", 4 },
		/* truncated: a 4-byte lead byte followed by only two more
		   bytes is expected to be dropped entirely */
		{ "\xF0\x80\x80", 0 },
		{ "x\xF0\x80\x80", 1 },
	};
	unsigned int idx = 0;

	test_begin("fts tokenizer delete trailing partial char");
	while (idx < N_ELEMENTS(cases)) {
		const char *text = cases[idx].str;
		size_t len = strlen(text);

		fts_tokenizer_delete_trailing_partial_char((const unsigned char *)text, &len);
		test_assert(len == cases[idx].truncated_len);
		idx++;
	}
	test_end();
}
static void test_fts_tokenizer_address_maxlen(void)
{
const char *const settings[] = {"maxlen", "5", NULL};
const char *input = "...\357\277\275@a";
struct fts_tokenizer *tok;
const char *token, *error;
test_begin("fts tokenizer address maxlen");
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
while (fts_tokenizer_next(tok, (const unsigned char *)input,
strlen(input), &token, &error) > 0) ;
while (fts_tokenizer_final(tok, &token, &error) > 0) ;
fts_tokenizer_unref(&tok);
test_end();
}
/* Fuzz-style test: build 10000 random 10-byte strings from a small
   alphabet (NUL, space, '.', letters, '-', '@' and the two bytes of
   U+00E4), make them valid UTF-8 and run them through an email-address
   tokenizer (maxlen=9) stacked on a "simple" generic tokenizer. Nothing is
   asserted about the tokens; the tokenizers just have to get through the
   input. */
static void test_fts_tokenizer_random(void)
{
	const char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
	const char *const settings[] = {"algorithm", "simple", NULL};
	const char *const email_settings[] = {"maxlen", "9", NULL};
	unsigned int i;
	unsigned char addr[10] = { 0 };
	string_t *str = t_str_new(20);
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer random");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);

	for (i = 0; i < 10000; i++) T_BEGIN {
		const unsigned char *data;
		size_t size;

		for (unsigned int j = 0; j < sizeof(addr); j++)
			addr[j] = test_chars[rand() % N_ELEMENTS(test_chars)];
		str_truncate(str, 0);
		if (uni_utf8_get_valid_data(addr, sizeof(addr), str)) {
			/* addr was already valid UTF-8. In that case
			   uni_utf8_get_valid_data() does not write anything
			   to str, so tokenize addr itself. Previously the
			   return value was ignored and every valid input was
			   tokenized as an empty string. */
			data = addr;
			size = sizeof(addr);
		} else {
			/* invalid sequences were replaced; use the sanitized
			   copy in str */
			data = str_data(str);
			size = str_len(str);
		}
		while (fts_tokenizer_next(tok, data, size,
					  &token, &error) > 0) ;
		while (fts_tokenizer_final(tok, &token, &error) > 0) ;
	} T_END;

	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
/* Test runner entry point: initialize the tokenizers, run every test case
   via test_run() and return its result as the exit status. */
int main(void)
{
	/* NULL-terminated; the tests run in the order listed here */
	static void (*const test_functions[])(void) = {
		test_fts_tokenizer_find,
		test_fts_tokenizer_generic_only,
		test_fts_tokenizer_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_wb5a,
		test_fts_tokenizer_address_only,
		test_fts_tokenizer_address_parent_simple,
		test_fts_tokenizer_address_parent_tr29,
		test_fts_tokenizer_address_maxlen,
		test_fts_tokenizer_address_search,
		test_fts_tokenizer_delete_trailing_partial_char,
		test_fts_tokenizer_random,
		NULL
	};
	int exit_status;

	fts_tokenizers_init();
	exit_status = test_run(test_functions);
	/* deinitialize before returning so the status has to be kept in a
	   local */
	fts_tokenizers_deinit();

	return exit_status;
}