test-fts-filter.c revision 2f2faa96aaf6989fae9acab1523f8be372060a02
5f5870385cff47efd2f58e7892f251cf13761528Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#include "lib.h"
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#include "sha2.h"
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#include "test-common.h"
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen#include "fts-language.h"
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen#include "fts-filter.h"
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#include <stdio.h>
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenconst char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_eng(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language english = { .name = "en" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *filter;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {"an", "elephant", "and", "a", "bear",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "drive", "by", "for", "no", "reason",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "they", "will", "not", "sing", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output[] = {NULL, "elephant", NULL, NULL, "bear",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "drive", NULL, NULL, NULL, "reason",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen NULL, NULL, NULL, "sing"};
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char **ip, **op;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char *filtered;
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen test_begin("fts filter stopwords, English");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &english, stopword_settings, &filter, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip = input;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen op = output;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen while (*ip != NULL) {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen filtered = fts_filter_filter(filter, *ip);
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen if (filtered == NULL)
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen test_assert(*op == NULL);
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen else {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(*op != NULL);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(strcmp(*ip, filtered) == 0);
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen }
4ee00532a265bdfb38539d811fcd12d51210ac35Timo Sirainen op++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&filter);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(filter == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fin(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language finnish = { .name = "fi" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *filter;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {"olla", "vaiko", "eik\xC3\xB6", "olla",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "kenest\xC3\xA4", "ja", "joista", "jonka",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "testi", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output[] = {NULL, "vaiko", "eik\xC3\xB6", NULL, NULL,
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen NULL, NULL, NULL, "testi"};
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *input2[] =
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen {"kuka", "kenet", "keneen", "testi", "eiv\xC3\xA4t", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output2[] = {NULL, NULL, NULL, "testi", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char **ip, **op;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *filtered;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter stopwords, Finnish");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &finnish, stopword_settings, &filter, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip = input;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen op = output;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen while (*ip != NULL) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filtered = fts_filter_filter(filter, *ip);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen if (filtered == NULL)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(*op == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen else {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(*op != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(strcmp(*ip, filtered) == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen op++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&filter);
df1713bd29d29a3e3f3ebfdf05f929525825a7d3Timo Sirainen test_assert(filter == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &finnish, stopword_settings, &filter, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip = input2;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen op = output2;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen while (*ip != NULL) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filtered = fts_filter_filter(filter, *ip);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen if (filtered == NULL)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(*op == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen else {
df1713bd29d29a3e3f3ebfdf05f929525825a7d3Timo Sirainen test_assert(*op != NULL);
df1713bd29d29a3e3f3ebfdf05f929525825a7d3Timo Sirainen test_assert(strcmp(*ip, filtered) == 0);
df1713bd29d29a3e3f3ebfdf05f929525825a7d3Timo Sirainen }
cd2fc7dd28c3a2e3f82e8480eaf3ba7c4abc3614Timo Sirainen op++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&filter);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(filter == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fra(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language french = { .name = "fr" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *filter;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *input[] = {"e\xC3\xBBt", "soyez", "soi", "peut", "que",
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen "quelconque", "\xC3\xA9t\xC3\xA9",
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen "l\xE2\x80\x99""av\xC3\xA8nement",
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen NULL};
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *output[] = {NULL, NULL, NULL, "peut", NULL,
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen "quelconque", NULL,
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen "l\xE2\x80\x99""av\xC3\xA8nement",};
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char **ip, **op;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *filtered;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen test_begin("fts filter stopwords, French");
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen ret = fts_filter_create(filter_class, NULL, &french, stopword_settings, &filter, &error);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen test_assert(ret == 0);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen ip = input;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen op = output;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen while (*ip != NULL) {
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen filtered = fts_filter_filter(filter, *ip);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen if (filtered == NULL)
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen test_assert(*op == NULL);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen else {
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen test_assert(*op != NULL);
53ec1ff2231d477db3103c51987fa9cb6033bc16Timo Sirainen test_assert(strcmp(*ip, filtered) == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen op++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ip++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
53ec1ff2231d477db3103c51987fa9cb6033bc16Timo Sirainen
53ec1ff2231d477db3103c51987fa9cb6033bc16Timo Sirainen fts_filter_unref(&filter);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(filter == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fail_create(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language unknown = { .name = "bebobidoop" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *filter = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter stopwords, fail create()");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &unknown, stopword_settings, &filter, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == -1 && filter == NULL && error != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen}
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#ifdef HAVE_FTS_STEMMER
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_english(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *stemmer;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "EN" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *base = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dri" ,"friend", "All", "human", "be", "are", "born", "free",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "and", "equal", "in", "digniti", "and", "right", "They", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "endow", "with", "reason", "and", "conscienc", "and", "should",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "act", "toward", "one", "anoth", "in", "a", "spirit", "of",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "brotherhood", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter stem English");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, NULL, &stemmer, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen bpp = bases;
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen for (tpp=tokens; *tpp != NULL; tpp++) {
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen base = fts_filter_filter(stemmer, *tpp);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen test_assert(base != NULL);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen test_assert(null_strcmp(base, *bpp) == 0);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen bpp++;
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen }
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen fts_filter_unref(&stemmer);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen test_assert(stemmer == NULL);
3852872e6954b7132e637294132005e86b8ebd4aTimo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_french(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *stemmer;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "fRench" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *base = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "Tous", "les", "\xC3\xAAtres", "humains", "naissent",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "libres", "et", "\xC3\xA9gaux", "en", "dignit\xC3\xA9",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "et", "en", "droits", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "Tous" ,"le", "\xC3\xAAtre", "humain", "naissent", "libr", "et",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "\xC3\xA9gal", "en", "dignit", "et", "en", "droit", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter stem French");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, NULL, &stemmer, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen bpp = bases;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen for (tpp=tokens; *tpp != NULL; tpp++) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen base = fts_filter_filter(stemmer, *tpp);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(base != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(null_strcmp(base, *bpp) == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen bpp++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&stemmer);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(stemmer == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainenstatic void test_fts_filter_stopwords_stemmer_eng(void)
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen{
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen int ret;
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const struct fts_filter *filter_class;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen struct fts_filter *stemmer;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen struct fts_filter *filter;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "eN" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *base = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "conscience", "and", "should", "act", "towards", "one",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char * const bases[] = {
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen "dri" ,"friend", "All", "human", "be", NULL, "born", "free",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", "They", NULL,
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "brotherhood", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filters stopwords and stemming chained, English");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, stopword_settings, &filter, &error);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(ret == 0);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, filter, &language, NULL, &stemmer, &error);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(ret == 0);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen bpp = bases;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen for (tpp=tokens; *tpp != NULL; tpp++) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen base = fts_filter_filter(stemmer, *tpp);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen if (base == NULL)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(*bpp == NULL);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen else {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(*bpp != NULL);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(null_strcmp(*bpp, base) == 0);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen }
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen bpp++;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen }
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen fts_filter_unref(&stemmer);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen fts_filter_unref(&filter);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(stemmer == NULL);
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_assert(filter == NULL);
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_end();
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen}
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen#endif
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen#ifdef HAVE_LIBICU
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainenstatic void test_fts_filter_normalizer_swedish_short(void)
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen{
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen const struct fts_filter *filter_class;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen struct fts_filter *norm = NULL;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen int ret;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const char *input[] = {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen NULL,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "Vem",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "Ã…",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "ÅÄÖ",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen };
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const char *expected_output[] = {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen NULL,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "vem",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "a",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "aao",
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "vem kan segla forutan vind?\naaooaa"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen };
86bde2c1838d1ce967fa2b394bb952004a4adcb7Timo Sirainen const char * const settings[] =
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const char *error = NULL;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen const char *normalized = NULL;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen unsigned int i;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_begin("fts filter normalizer Swedish short text");
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen T_BEGIN {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen for (i = 0; i < N_ELEMENTS(input); i++) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen if (input[i] != NULL) {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen }
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen }
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen fts_filter_unref(&norm);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen } T_END;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(norm == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen}
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenstatic void test_fts_filter_normalizer_swedish_short_default_id(void)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen{
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *norm = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "Vem",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "Ã…",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "ÅÄÖ",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen };
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char *expected_output[] = {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen NULL,
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "vem",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "a",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "aao",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "vem kan segla forutan vind?\naaooaa"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen };
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char *error = NULL;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char *normalized = NULL;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen unsigned int i;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter normalizer Swedish short text using default ID");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen T_BEGIN {
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, NULL, &norm, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen for (i = 0; i < N_ELEMENTS(input); i++) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen if (input[i] != NULL) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&norm);
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen } T_END;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_assert(norm == NULL);
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen/* UDHRDIR comes from Automake AM_CPPFLAGS */
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen#define UDHR_FRA_NAME "/udhr_fra.txt"
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainenstatic void test_fts_filter_normalizer_french(void)
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen{
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen struct fts_filter *norm = NULL;
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen const struct fts_filter *filter_class;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen FILE *input;
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char * const settings[] =
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen char buf[4096] = {0};
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char *error = NULL;
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char *normalized = NULL;
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen int ret;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen unsigned char sha512_digest[SHA512_RESULTLEN];
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen struct sha512_ctx ctx;
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const unsigned char correct_digest[] = {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen 0x78, 0x1e, 0xb9, 0x04, 0xa4, 0x92, 0xca, 0x88,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x1e, 0xef, 0x7b, 0xc8, 0x3e, 0x4a, 0xa8, 0xdb,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x9c, 0xd4, 0x42, 0x5c, 0x64, 0x81, 0x06, 0xd5,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x72, 0x93, 0x38, 0x0c, 0x09, 0xce, 0xbe, 0xdf,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x65, 0xff, 0x36, 0x35, 0x05, 0x77, 0xcc, 0xc6,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen 0xff, 0x44, 0x2c, 0x31, 0x10, 0x00, 0xf6, 0x8d,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x15, 0x25, 0x1e, 0x54, 0x67, 0x2a, 0x5b, 0xc1,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0xdb, 0x84, 0xc5, 0x0d, 0x43, 0x7e, 0x8c, 0x70};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *udhr_path;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_begin("fts filter normalizer French UDHR");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen T_BEGIN {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen test_assert(ret == 0);
678d0463849ba777106eb7875f27db07a5d8e3dfTimo Sirainen input = fopen(udhr_path, "r");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert(input != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen sha512_init(&ctx);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen while (NULL != fgets(buf, sizeof(buf), input)) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
6b09a3b269f4b10364c9a77f6614dbe3d306b79dTimo Sirainen if ((normalized = fts_filter_filter(norm, buf)) == NULL){
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen break;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen sha512_loop(&ctx, normalized, strlen(normalized));
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fclose(input);
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen sha512_result(&ctx, sha512_digest);
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen test_assert(memcmp(sha512_digest, correct_digest,
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen sizeof(sha512_digest)) == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&norm);
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen } T_END;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(norm == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_normalizer_invalid_id(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen struct fts_filter *norm = NULL;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const struct fts_filter *filter_class;
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *settings[] =
428d63767dc20aeb87695b82fb01cd0a06d7769cTimo Sirainen {"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove",
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen NULL};
a10ed8c47534b4c6b6bf2711ccfe577e720a47b4Timo Sirainen const char *error = NULL;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen int ret;
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen
31a574fda352ef4f71dbff9c30e15e4744e132c0Timo Sirainen test_begin("fts filter normalizer invalid id");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret < 0 && error != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(norm == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen{
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_filter *filter_class;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *normalizer;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *stemmer;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_filter *filter;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *error;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const id_settings[] =
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen {"id", "Lower", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "En" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *base = NULL;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", "ABCFoo",
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen NULL};
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dri" ,"friend", "all", "human", "be", NULL, "born", "free",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", NULL, NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
54fcc10af7fb60e495318f7e81652d05eb3e0cadTimo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "brotherhood", "abcfoo", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filters normalizer, stopwords and stemming chained, English");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, id_settings, &normalizer, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, normalizer, &language, stopword_settings, &filter, &error);
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, filter, &language, NULL, &stemmer, &error);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen bpp = bases;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen for (tpp = tokens; *tpp != NULL; tpp++) {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen base = fts_filter_filter(stemmer, *tpp);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen if (base == NULL)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(*bpp == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen else {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(*bpp != NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(strcasecmp(*bpp, base) == 0);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen bpp++;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen }
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&stemmer);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&filter);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filter_unref(&normalizer);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(stemmer == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(filter == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(normalizer == NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_end();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#endif
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen/* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen an unregister + find */
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainenint main(void)
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen{
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen static void (*test_functions[])(void) = {
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen test_fts_filter_stopwords_eng,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stopwords_fin,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stopwords_fra,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stopwords_fail_create,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#ifdef HAVE_FTS_STEMMER
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stemmer_snowball_stem_english,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stemmer_snowball_stem_french,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stopwords_stemmer_eng,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#endif
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#ifdef HAVE_LIBICU
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_swedish_short,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_swedish_short_default_id,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_french,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_invalid_id,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_stopwords_stemmer_eng,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen#endif
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen NULL
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen int ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filters_init();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = test_run(test_functions);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen fts_filters_deinit();
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen return ret;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen}
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen