test-fts-filter.c revision 2f2faa96aaf6989fae9acab1523f8be372060a02
5f5870385cff47efd2f58e7892f251cf13761528Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
/* NULL-terminated settings list passed to fts_filter_create() every time a
   stopwords filter is created in this file: the "stopwords_dir" entry followed
   by TEST_STOPWORDS_DIR (presumably a key and its value - confirm against the
   filter's settings parser). */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenconst char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
/* Stopwords filter test for the language named "en".
   input[] and output[] are parallel arrays: output[] repeats the words that
   are expected to survive ("elephant", "bear") and has NULL at the positions
   of "an", "and" and "a".
   NOTE(review): presumably each input[i] is run through the filter and
   compared with output[i]; the comparison loop is not shown here - confirm. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_eng(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language english = { .name = "en" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {"an", "elephant", "and", "a", "bear",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output[] = {NULL, "elephant", NULL, NULL, "bear",
/* Look up the stopwords filter class by name, then create an instance for
   English using the shared stopword_settings. The second argument is NULL
   here; in the chained tests it carries a previously created filter. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &english, stopword_settings, &filter, &error);
/* Stopwords filter test for the language named "fi".
   Two independent data sets are declared: input[]/output[] and
   input2[]/output2[]. In both, the expected array has NULL where the
   corresponding input word is expected to be removed and repeats the word
   otherwise. The \xC3\xB6 and \xC3\xA4 escapes are the UTF-8 encodings of
   o-umlaut and a-umlaut. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fin(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language finnish = { .name = "fi" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {"olla", "vaiko", "eik\xC3\xB6", "olla",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output[] = {NULL, "vaiko", "eik\xC3\xB6", NULL, NULL,
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *input2[] =
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen {"kuka", "kenet", "keneen", "testi", "eiv\xC3\xA4t", NULL};
/* output2[] has no separate terminator: its last NULL is the expected result
   for the final word of input2[], and input2[] carries the terminating NULL. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *output2[] = {NULL, NULL, NULL, "testi", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &finnish, stopword_settings, &filter, &error);
/* The filter is created a second time with identical arguments, presumably
   for the second data set.
   NOTE(review): confirm the first instance is released before this call so
   that overwriting `filter` does not leak it. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &finnish, stopword_settings, &filter, &error);
/* Stopwords filter test for the language named "fr".
   input[] and output[] are parallel arrays; of the words visible here only
   "peut" is expected to survive, the others map to NULL. \xC3\xBB is the
   UTF-8 encoding of u-circumflex. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fra(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language french = { .name = "fr" };
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *input[] = {"e\xC3\xBBt", "soyez", "soi", "peut", "que",
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *output[] = {NULL, NULL, NULL, "peut", NULL,
/* Same creation sequence as the other stopwords tests, with the French
   language descriptor. */
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen ret = fts_filter_create(filter_class, NULL, &french, stopword_settings, &filter, &error);
/* Negative test: creating a stopwords filter for a language name that is not
   a real language ("bebobidoop") must fail. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stopwords_fail_create(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const struct fts_language unknown = { .name = "bebobidoop" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter stopwords, fail create()");
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &unknown, stopword_settings, &filter, &error);
/* Failure contract checked here: return value -1, no filter handed back,
   and an error string provided. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert(ret == -1 && filter == NULL && error != NULL);
/* Snowball stemmer test for English.
   tokens[] is the NULL-terminated word list and bases[] the parallel list of
   expected results (e.g. "dries" -> "dri", "friendlies" -> "friend").
   Capitalised words ("All", "They") appear unchanged in bases[].
   The language name is given in upper case ("EN"). */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_english(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "EN" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dri" ,"friend", "All", "human", "be", "are", "born", "free",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "and", "equal", "in", "digniti", "and", "right", "They", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "endow", "with", "reason", "and", "conscienc", "and", "should",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "act", "toward", "one", "anoth", "in", "a", "spirit", "of",
/* Cursors over tokens[] and bases[] (presumably advanced in lockstep by the
   comparison loop - confirm). */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
/* The stemmer is created with no preceding filter and no settings (both
   NULL); only the language is supplied. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, NULL, &stemmer, &error);
/* Snowball stemmer test for French.
   tokens[] holds the words and bases[] the parallel expected results
   (e.g. "les" -> "le", "libres" -> "libr"). Accented letters are written as
   UTF-8 escapes: \xC3\xAA is e-circumflex, \xC3\xA9 is e-acute.
   The language is named "fRench" (full name, mixed case) rather than a
   two-letter code. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_stemmer_snowball_stem_french(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "fRench" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "Tous", "les", "\xC3\xAAtres", "humains", "naissent",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "libres", "et", "\xC3\xA9gaux", "en", "dignit\xC3\xA9",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "Tous" ,"le", "\xC3\xAAtre", "humain", "naissent", "libr", "et",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "\xC3\xA9gal", "en", "dignit", "et", "en", "droit", NULL};
/* Cursors over tokens[] and bases[] (presumably advanced in lockstep by the
   comparison loop - confirm). */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
/* Stemmer created with no preceding filter and no settings. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, NULL, &stemmer, &error);
/* Chained test: English stopwords filter followed by the Snowball stemmer.
   tokens[] is the same word list as in the plain English stemmer test.
   bases[] differs from that test by holding NULL at the positions of words
   such as "are", "and", "in", "with", "a" and "of", which are expected to be
   removed before stemming. Capitalised "All" and "They" are still expected
   to pass through unchanged. */
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainenstatic void test_fts_filter_stopwords_stemmer_eng(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "eN" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "conscience", "and", "should", "act", "towards", "one",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", NULL};
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char * const bases[] = {
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen "dri" ,"friend", "All", "human", "be", NULL, "born", "free",
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", "They", NULL,
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filters stopwords and stemming chained, English");
/* First link of the chain: the stopwords filter, created with no
   predecessor. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, &language, stopword_settings, &filter, &error);
/* Second link: the stemmer receives the stopwords filter as its second
   argument, which is what chains the two together. */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, filter, &language, NULL, &stemmer, &error);
/* ICU normalizer test on a short Swedish text with an explicit transform ID.
   The input mixes upper/lower case and the UTF-8 encoded letters A-ring,
   A-umlaut, O-umlaut (upper case) and o-umlaut, a-umlaut, a-ring (lower
   case). The expected output is the same text lower-cased with the
   diacritics removed ("forutan", "aaooaa"). */
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainenstatic void test_fts_filter_normalizer_swedish_short(void)
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const char *input[] = {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen const char *expected_output[] = {
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen "vem kan segla forutan vind?\naaooaa"
/* Transform ID handed to the normalizer: lower-case, decompose (NFKD),
   remove nonspacing marks, recompose (NFC). */
86bde2c1838d1ce967fa2b394bb952004a4adcb7Timo Sirainen const char * const settings[] =
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen unsigned int i;
a0c453a8edaec90fb0d945c874de0b1845bc7d7eTimo Sirainen test_begin("fts filter normalizer Swedish short text");
/* The normalizer is created without a preceding filter and without a
   language (both NULL). */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
/* For input[i]: the filter must return a non-NULL string, and that string
   must equal expected_output[i]. */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
/* Same Swedish input and expected output as the explicit-ID normalizer test,
   but the normalizer is created with NULL settings, so whatever transform ID
   the filter uses by default must produce the same lower-cased,
   diacritic-free result. */
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainenstatic void test_fts_filter_normalizer_swedish_short_default_id(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char *input[] = {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "Vem kan segla f\xC3\xB6rutan vind?\n"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "\xC3\x85\xC3\x84\xC3\x96\xC3\xB6\xC3\xA4\xC3\xA5"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const char *expected_output[] = {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen "vem kan segla forutan vind?\naaooaa"
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen unsigned int i;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filter normalizer Swedish short text using default ID");
/* No predecessor, no language and no settings: everything is left to the
   filter's defaults. */
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, NULL, &norm, &error);
/* The result must be non-NULL and equal to expected_output[i]. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainen/* UDHRDIR comes from Automake AM_CPPFLAGS */
/* ICU normalizer test over a larger French text (the UDHR file located at
   UDHRDIR + UDHR_FRA_NAME). Instead of comparing text directly, the
   normalized output of every line is fed into a SHA-512 context and the
   final digest is compared with a precomputed reference digest. */
acc4e0a41f1c8ef0559a19c280afc1b97b9e0818Timo Sirainenstatic void test_fts_filter_normalizer_french(void)
/* Transform ID: lower-case, NFKD-decompose and remove nonspacing marks.
   Unlike the Swedish tests there is no trailing NFC step. */
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen const char * const settings[] =
b8835b8a21c617ceb82ddc5a176243faf36aa8f7Timo Sirainen {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen unsigned char sha512_digest[SHA512_RESULTLEN];
/* Reference SHA-512 digest of the normalized text. It has to be regenerated
   if the input file or the transform ID changes. */
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen const unsigned char correct_digest[] = {
ecdce39e5ef4b62eefa9f5818f17d153fd5d710aTimo Sirainen 0x78, 0x1e, 0xb9, 0x04, 0xa4, 0x92, 0xca, 0x88,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x1e, 0xef, 0x7b, 0xc8, 0x3e, 0x4a, 0xa8, 0xdb,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x9c, 0xd4, 0x42, 0x5c, 0x64, 0x81, 0x06, 0xd5,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x72, 0x93, 0x38, 0x0c, 0x09, 0xce, 0xbe, 0xdf,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x65, 0xff, 0x36, 0x35, 0x05, 0x77, 0xcc, 0xc6,
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen 0xff, 0x44, 0x2c, 0x31, 0x10, 0x00, 0xf6, 0x8d,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0x15, 0x25, 0x1e, 0x54, 0x67, 0x2a, 0x5b, 0xc1,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen 0xdb, 0x84, 0xc5, 0x0d, 0x43, 0x7e, 0x8c, 0x70};
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen test_begin("fts filter normalizer French UDHR");
/* Build the path of the input file from the build-time directory and the
   file name. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
/* Read the input one buffer-sized line at a time; fgets() keeps the newline,
   so it is part of what gets normalized and hashed.
   NOTE(review): `input` is presumably the stream opened on udhr_path -
   confirm it is checked for NULL before this loop. */
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen while (NULL != fgets(buf, sizeof(buf), input)) {
/* A NULL result from the filter is handled as a special case before the
   normalized text is added to the hash. */
6b09a3b269f4b10364c9a77f6614dbe3d306b79dTimo Sirainen if ((normalized = fts_filter_filter(norm, buf)) == NULL){
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen sha512_loop(&ctx, normalized, strlen(normalized));
/* The whole digest must match the reference byte for byte. */
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen test_assert(memcmp(sha512_digest, correct_digest,
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen sizeof(sha512_digest)) == 0);
/* Negative test for the ICU normalizer: the "id" setting is deliberately
   not a valid transform ID ("Any-One-Out-There", "DKFN").
   NOTE(review): presumably the test then asserts that creation fails; the
   assertion is not shown here - confirm. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_normalizer_invalid_id(void)
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen const char *settings[] =
428d63767dc20aeb87695b82fb01cd0a06d7769cTimo Sirainen {"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove",
31a574fda352ef4f71dbff9c30e15e4744e132c0Timo Sirainen test_begin("fts filter normalizer invalid id");
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
8d587838c414c48a331f0b54cd7ffd97e5024abdTimo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, settings, &norm, &error);
/* Full chain test for English: ICU normalizer -> stopwords filter ->
   Snowball stemmer.
   tokens[] extends the word list used by the other English tests with a
   mixed-case token ("ABCFoo"). Compared with the stopwords+stemmer test,
   bases[] expects "All" to come out as "all" and "They" to be removed
   (NULL), i.e. lower-casing happens before the stopword lookup. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainenstatic void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const id_settings[] =
/* The next line is a disabled alternative value for id_settings, kept as a
   comment. */
c649139f889c02154fc9a153728b81619edb5663Timo Sirainen //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen struct fts_language language = { .name = "En" };
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const tokens[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dries" ,"friendlies", "All", "human", "beings", "are",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "born", "free", "and", "equal", "in", "dignity", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "rights", "They", "are", "endowed", "with", "reason", "and",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "conscience", "and", "should", "act", "towards", "one",
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen "another", "in", "a", "spirit", "of", "brotherhood", "ABCFoo",
beae08c3abc23434d15572ab3b059fbdf8efc2dfTimo Sirainen const char * const bases[] = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "dri" ,"friend", "all", "human", "be", NULL, "born", "free",
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen NULL, "equal", NULL, "digniti", NULL, "right", NULL, NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen "endow", NULL, "reason", NULL, "conscienc", NULL, "should",
54fcc10af7fb60e495318f7e81652d05eb3e0cadTimo Sirainen "act", "toward", "one", "anoth", NULL, NULL, "spirit", NULL,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *tpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen const char * const *bpp;
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_begin("fts filters normalizer, stopwords and stemming chained, English");
/* Link 1: the normalizer, with no predecessor and no language. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(ICU_NORMALIZER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, NULL, NULL, id_settings, &normalizer, &error);
/* Link 2: the stopwords filter, chained after the normalizer. */
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, normalizer, &language, stopword_settings, &filter, &error);
/* Link 3: the stemmer, chained after the stopwords filter. */
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen filter_class = fts_filter_find(SNOWBALL_STEMMER_FILTER_NAME);
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen ret = fts_filter_create(filter_class, filter, &language, NULL, &stemmer, &error);
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen/* TODO: add test functions for 1. ref/unref pairs and 2. multiple
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen registers followed by an unregister and a find */
8a13d19a514bfc316149eda172558d12526f9e4eTimo Sirainen static void (*test_functions[])(void) = {
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_stemmer_snowball_stem_english,
48566ca412a7cf3b42512fd0ec112744778e5da0Timo Sirainen test_fts_filter_normalizer_swedish_short_default_id,