bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainenstatic const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainenstatic struct fts_language english_language = { .name = "en" };
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovilastatic struct fts_language french_language = { .name = "fr" };
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovilastatic struct fts_language norwegian_language = { .name = "no" };
e467b295e44fb659ec28e9b6b3f05e71335b85e3Teemu Huovila#if defined(HAVE_LIBICU) && defined(HAVE_FTS_STEMMER)
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovilastatic struct fts_language swedish_language = { .name = "sv" };
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_find("stopwords") == fts_filter_stopwords);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert(fts_filter_find("contractions") == fts_filter_contractions);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovilastatic void test_fts_filter_contractions_fail(void)
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_begin("fts filter contractions, unsupported language");
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovilastatic void test_fts_filter_contractions_fr(void)
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila unsigned int i;
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_begin("fts filter contractions, French");
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila ret = fts_filter_filter(filter, &token, &error);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert_idx(strcmp(token, tests[i].output) == 0, i);
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila else if (ret == 0)
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert_idx(token == NULL && tests[i].output == NULL, i);
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
acfcf88e4dd529e4b2409f43bc9713cbc0169347Timo Sirainen unsigned int i;
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
acfcf88e4dd529e4b2409f43bc9713cbc0169347Timo Sirainen test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
c8eaee2ad6cc96e2ef42657f89d4404e674680b7Teemu Huovilastatic void test_fts_filter_lowercase_utf8(void)
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
c8eaee2ad6cc96e2ef42657f89d4404e674680b7Teemu Huovila unsigned int i;
c8eaee2ad6cc96e2ef42657f89d4404e674680b7Teemu Huovila test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
c8eaee2ad6cc96e2ef42657f89d4404e674680b7Teemu Huovila test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovilastatic void test_fts_filter_lowercase_too_long_utf8(void)
b7324e421e2132cbbf753e6fdbe675bbaecdf929Timo Sirainen static const struct {
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxy" },
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila { "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" },
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila { "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" }
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila const char * const settings[] = {"maxlen", "25", NULL};
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila unsigned int i;
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila test_begin("fts filter lowercase, too long UTF8");
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
5fcd30add8dcf4d883978cce3e39f3a89184f1e5Teemu Huovila test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stopwords_eng(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *input[] = {"an", "elephant", "and", "a", "bear",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *output[] = {NULL, "elephant", NULL, NULL, "bear",
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(filter, &token, &error);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stopwords_fin(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language finnish = { .name = "fi" };
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *input[] = {"olla", "vaiko", "eik\xC3\xB6", "olla",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *output[] = {NULL, "vaiko", "eik\xC3\xB6", NULL, NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen {"kuka", "kenet", "keneen", "testi", "eiv\xC3\xA4t", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *output2[] = {NULL, NULL, NULL, "testi", NULL};
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(filter, &token, &error);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(filter, &token, &error);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stopwords_fra(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *input[] = {"e\xC3\xBBt", "soyez", "soi", "peut", "que",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *output[] = {NULL, NULL, NULL, "peut", NULL,
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(filter, &token, &error);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila const char *input[] = {"og", "d\xC3\xA5", "medlemsstatane", "har",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "og", "halde", "seg", "etter", "menneskerettane",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "og", "den", "grunnleggjande", "fridomen", "i",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila const char *output[] = {NULL, NULL, "medlemsstatane", NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila NULL, NULL, "grunnleggjande", "fridomen", NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "Nasjonane"};
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_begin("fts filter stopwords, Norwegian");
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_assert(fts_filter_create(fts_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila ret = fts_filter_filter(filter, &token, &error);
dfb9243af1c95de27c7b3a783629ad901c085927Teemu Huovilastatic void test_fts_filter_stopwords_fail_lazy_init(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language unknown = { .name = "bebobidoop" };
dfb9243af1c95de27c7b3a783629ad901c085927Teemu Huovila test_begin("fts filter stopwords, fail filter() (lazy init)");
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, NULL, &unknown, stopword_settings, &filter, &error) == 0);
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainen test_assert(fts_filter_filter(filter, &token, &error) < 0 && error != NULL);
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovilastatic void test_fts_filter_stopwords_malformed(void)
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovila const struct fts_language malformed = { .name = "malformed" };
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovila test_begin("fts filter stopwords, malformed list");
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovila test_assert(fts_filter_create(fts_filter_stopwords, NULL, &malformed, stopword_settings, &filter, &error) == 0);
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovila test_expect_error_string("seems empty. Is the file correctly formatted?");
3f3c1b629196bc8491f146705b6f8ddadfcde1c8Teemu Huovila test_assert(fts_filter_filter(filter, &token, &error) > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_english(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dri" ,"friend", "All", "human", "be", "are", "born", "free",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "and", "equal", "in", "digniti", "and", "right", "They", "are",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "endow", "with", "reason", "and", "conscienc", "and", "should",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "act", "toward", "one", "anoth", "in", "a", "spirit", "of",
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &english_language, NULL, &stemmer, &error) == 0);
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainen test_assert(fts_filter_filter(stemmer, &token, &error) > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_french(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Tous", "les", "\xC3\xAAtres", "humains", "naissent",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "libres", "et", "\xC3\xA9gaux", "en", "dignit\xC3\xA9",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Tous" ,"le", "\xC3\xAAtre", "humain", "naissent", "libr", "et",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "\xC3\xA9gal", "en", "dignit", "et", "en", "droit", NULL};
440b625484f3cc9d3ec0a7ba36fe3583aa90172dTeemu Huovila test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainen test_assert(fts_filter_filter(stemmer, &token, &error) > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_stopwords_stemmer_eng(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dri" ,"friend", "All", "human", "be", NULL, "born", "free",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", "They", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filters stopwords and stemming chained, English");
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(stemmer, &token, &error);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_normalizer_swedish_short(void)
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "\xC3\x85\xC3\x84\xC3\x96",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "vem kan segla forutan vind?\naaooaa"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filter normalizer Swedish short text");
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_normalizer_swedish_short_default_id(void)
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "\xC3\x85\xC3\x84\xC3\x96",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
3e786e2a411dc973a2359bc213fcf827e6c314d2Timo Sirainen "vemkanseglaforutanvind?\naaooaa"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filter normalizer Swedish short text using default ID");
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, NULL, &norm, &error) == 0);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* UDHRDIR comes from Automake AM_CPPFLAGS */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_normalizer_french(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned char sha512_digest[SHA512_RESULTLEN];
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0x06, 0x80, 0xf1, 0x81, 0xf2, 0xed, 0xfb, 0x6d,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0xcd, 0x7d, 0xcb, 0xbd, 0xc4, 0x87, 0xc3, 0xf6,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0xb8, 0x6a, 0x01, 0x82, 0xdf, 0x0a, 0xb5, 0x92,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0x6b, 0x9b, 0x7b, 0x21, 0x5e, 0x62, 0x40, 0xbd,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0xbf, 0x15, 0xb9, 0x7b, 0x75, 0x9c, 0x4e, 0xc9,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0xe8, 0x48, 0xaa, 0x08, 0x63, 0xf2, 0xa0, 0x6c,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0x20, 0x4c, 0x01, 0xe3, 0xb3, 0x4f, 0x15, 0xc6,
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen 0x8c, 0xd6, 0x7a, 0xb7, 0xc5, 0xc6, 0x85, 0x00};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filter normalizer French UDHR");
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen while (NULL != fgets(buf, sizeof(buf), input)) {
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen if (fts_filter_filter(norm, &tokens, &error) != 1){
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert(memcmp(sha512_digest, correct_digest,
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen sizeof(sha512_digest)) == 0);
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainenstatic void test_fts_filter_normalizer_empty(void)
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen /* test just a couple of these */
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen "\xCC\x80\xF3\xA0\x87\xAF" /* U+0300 U+E01EF */
3e786e2a411dc973a2359bc213fcf827e6c314d2Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL};
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen unsigned int i;
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen test_begin("fts filter normalizer empty tokens");
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen for (i = 0; i < N_ELEMENTS(empty_tokens); i++) {
f5b6f113bfbdf57b3335118e9b0387a9fb760bd2Timo Sirainen test_assert_idx(fts_filter_filter(norm, &token, &error) == 0, i);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainenstatic void test_fts_filter_normalizer_baddata(void)
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
0190b33c05bd72f1049255d03a7b5217ff1bbcedAki Tuomi if (!uni_is_valid_ucs4(i)) continue;
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen test_assert_idx(fts_filter_filter(norm, &token, &error) >= 0, i);
bf698b98d3a3a1eced66cc682c449f23bf2b67d0Timo Sirainen test_assert(fts_filter_filter(norm, &token, &error) >= 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_normalizer_invalid_id(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen {"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filter normalizer invalid id");
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
5916f19b49cae37c888109f6fdff3224f81d33aeTimo Sirainen test_assert(fts_filter_filter(norm, &token, &error) < 0 && error != NULL);
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainenstatic void test_fts_filter_normalizer_oversized(void)
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", "maxlen", "250",
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen const char *token = "\xe4\x95\x91\x25\xe2\x94\xad\xe1\x90\xad\xee\x94\x81\xe2\x8e\x9e"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xe7\x9a\xb7\xea\xbf\x97\xe3\xb2\x8f\xe4\x9c\xbe\xee\xb4\x98\xe1"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x8d\x99\xe2\x91\x83\xe3\xb1\xb8\xef\xbf\xbd\xe8\xbb\x9c\xef\xbf"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xbd\xea\xbb\x98\xea\xb5\xac\xe4\x87\xae\xe4\x88\x93\xe9\x86\x8f"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xe9\x86\x83\xe6\x8f\x8d\xe7\xa3\x9d\xed\x89\x96\xe2\x89\x85\xe6"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x8c\x82\xec\x80\x98\xee\x91\x96\xe7\xa8\x8a\xec\xbc\x85\xeb\x9c"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xbd\xeb\x97\x95\xe3\xa4\x9d\xd7\xb1\xea\xa7\x94\xe0\xbb\xac\xee"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x95\x87\xd5\x9d\xe8\xba\x87\xee\x8b\xae\xe5\xb8\x80\xe9\x8d\x82"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xe7\xb6\x8c\xe7\x9b\xa0\xef\x82\x9f\xed\x96\xa4\xe3\x8d\xbc\xe1"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x81\xbd\xe9\x81\xb2\xea\xac\xac\xec\x9b\x98\xe7\x84\xb2\xee\xaf"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xbc\xeb\xa2\x9d\xe9\x86\xb3\xe0\xb0\x89\xeb\x80\xb6\xe3\x8c\x9d"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xe9\x8f\x9e\xe2\xae\x8a\xee\x9e\x9a\xef\xbf\xbd\xe7\xa3\x9b\xe4"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xa3\x8b\xe4\x82\xb9\xeb\x8e\x93\xec\xb5\x82\xe5\xa7\x81\xe2\x8c"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x97\xea\xbb\xb4\xe5\x85\xb7\xeb\x96\xbe\xe7\x97\x91\xea\xbb\x98"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\xe6\xae\xb4\xe9\x8a\x85\xc4\xb9\xe4\x90\xb2\xe9\x96\xad\xef\x90"
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen "\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1";
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen test_begin("fts filter normalizer over-sized token");
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
ea591a86852fb868a7cb9b6e2c80e3446071df8fTimo Sirainen test_assert(fts_filter_filter(norm, &token, &error) >= 0);
35eb3a2394bb064cc6b4a67436e8860994e4636aTeemu Huovilastatic void test_fts_filter_normalizer_truncation(void)
35eb3a2394bb064cc6b4a67436e8860994e4636aTeemu Huovila test_begin("fts filter normalizer token truncated mid letter");
35eb3a2394bb064cc6b4a67436e8860994e4636aTeemu Huovila test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL,
35eb3a2394bb064cc6b4a67436e8860994e4636aTeemu Huovila test_assert(fts_filter_filter(norm, &token, &error) >= 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", "ABCFoo",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "dri" ,"friend", "all", "human", "be", NULL, "born", "free",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", NULL, NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_begin("fts filters normalizer, stopwords and stemming chained, English");
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, id_settings, &normalizer, &error) == 0);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stopwords, normalizer, &english_language, stopword_settings, &filter, &error) == 0);
a7d8afaadae968db20eb979052111d76a3086cd7Timo Sirainen test_assert(fts_filter_create(fts_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
1d0f568e26ce5cbf18cd7bb335c6eea20a7e3770Teemu Huovila ret = fts_filter_filter(stemmer, &token, &error);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovilastatic void test_fts_filter_stopwords_normalizer_stemmer_no(void)
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "Alle", "har", "plikter", "andsynes", "samfunnet", "d\xC3\xA5",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "personlegdomen", "til", "den", "einskilde", "einast", "der",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "voksterk\xC3\xA5r",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "Alle", "mennesker", "er", "f\xC3\xB8""dt", "frie", "og", "med",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "samme", "menneskeverd", "og", "menneskerettigheter", "De",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "er", "utstyrt", "med", "fornuft", "og", "samvittighet",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "og", "b\xC3\xB8r", "handle", "mot", "hverandre", "i",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "all", NULL, "plikt", "andsyn", "samfunn", NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "personlegdom", NULL, NULL, "einskild", "ein", NULL, NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "all", "mennesk", NULL, "f\xC3\xB8""dt", "frie", NULL, NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila NULL, "menneskeverd", NULL, "menneskerett", "de", NULL,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "utstyrt", NULL, "fornuft", NULL, "samvitt", NULL, "b\xC3\xB8r",
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila "handl", NULL, "hverandr", NULL, "brorskap", "and", NULL};
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_begin("fts filters with stopwords, default normalizer and stemming chained, Norwegian");
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_assert(fts_filter_create(fts_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &norwegian_language, NULL, &stemmer, &error) == 0);
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila ret = fts_filter_filter(stemmer, &token, &error);
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovilastatic void test_fts_filter_stopwords_normalizer_stemmer_sv(void)
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "Enär", "erkännandet", "av", "det", "inneboende", "värdet",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "hos", "alla", "medlemmar", "av", "människosläktet", "och",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "av", "deras", "lika", "och", "oförytterliga", "rättigheter",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "är", "grundvalen", "för", "frihet", "rättvisa", "och", "fred",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "enar", "erkan", NULL, NULL, "inneboend", "vardet", "hos", NULL,
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "medlemm", NULL, "manniskoslaktet", NULL, NULL, NULL, "lik",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila NULL, "oforytter", "ratt", NULL, "grundval", NULL, "frihet",
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila "rattvis", NULL, "fred", NULL, "varld", NULL};
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish");
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
c5effa0f13da8f45991c89a9d8c9d2109db66039Teemu Huovila ret = fts_filter_filter(stemmer, &token, &error);
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainenstatic void test_fts_filter_english_possessive(void)
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "foo\xC3\xA4's",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99s",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "foo\xC3\xA4\xE2\x80\x99s",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99S",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foos\xE2\x80\x99S",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99s\xE2\x80\x99s",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99ss"
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "foo\xC3\xA4",
19ed8f08b23d6ed204e6b27e5d1c0c6fe6bb11ddPhil Carmody "foo\xC3\xA4",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99s",
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen "foo\xE2\x80\x99ss"
471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4Timo Sirainen unsigned int i;
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
98add73ebcec199c04a9b243190f244c216c30e9Timo Sirainen test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen an unregister + find */
baf3e87e186453fda13bd21f7cbcb2efc8492e8bTimo Sirainen static void (*const test_functions[])(void) = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_filter_stemmer_snowball_stem_english,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_filter_normalizer_swedish_short_default_id,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen test_fts_filter_normalizer_stopwords_stemmer_eng,
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila test_fts_filter_stopwords_normalizer_stemmer_no,