fts-filter-stemmer-snowball.c revision 02c335c23bf5fa225a467c19f2c063fb0dc7b8c3
/* Copyright (c) 2014-2016 Dovecot authors, see the included COPYING file */
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny#include "lib.h"
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny#include "fts-language.h"
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny#include "fts-filter-private.h"
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny#ifdef HAVE_FTS_STEMMER
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny#include <libstemmer.h>
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zelenystruct fts_filter_stemmer_snowball {
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny struct fts_filter filter;
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny pool_t pool;
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny struct fts_language *lang;
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny struct sb_stemmer *stemmer;
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny};
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zelenystatic void fts_filter_stemmer_snowball_destroy(struct fts_filter *filter)
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny{
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny struct fts_filter_stemmer_snowball *sp =
58dd26b1c5b60ee992dd5d1214bb168aebb42d54Jakub Hrozek (struct fts_filter_stemmer_snowball *)filter;
58dd26b1c5b60ee992dd5d1214bb168aebb42d54Jakub Hrozek
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny if (sp->stemmer != NULL)
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny sb_stemmer_delete(sp->stemmer);
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny pool_unref(&sp->pool);
20d0bc6d587f346238062df4da5edfde815e59b1Jan Zeleny}
58dd26b1c5b60ee992dd5d1214bb168aebb42d54Jakub Hrozek
58dd26b1c5b60ee992dd5d1214bb168aebb42d54Jakub Hrozekstatic int
58dd26b1c5b60ee992dd5d1214bb168aebb42d54Jakub Hrozekfts_filter_stemmer_snowball_create(const struct fts_language *lang,
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina const char *const *settings,
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina struct fts_filter **filter_r,
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina const char **error_r)
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina{
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina struct fts_filter_stemmer_snowball *sp;
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina pool_t pp;
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina *filter_r = NULL;
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina if (settings[0] != NULL) {
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina *error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina return -1;
cf1a8af5556b1d8eab68802918c881ae1a0b89ebPavel Březina }
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_stemmer_snowball",
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sizeof(struct fts_filter));
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sp = p_new(pp, struct fts_filter_stemmer_snowball, 1);
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sp->pool = pp;
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sp->filter = *fts_filter_stemmer_snowball;
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sp->lang = p_malloc(sp->pool, sizeof(struct fts_language));
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce sp->lang->name = p_strdup(sp->pool, lang->name);
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce *filter_r = &sp->filter;
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce return 0;
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce}
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce
07b92f78d1751d8a2a538a440e1fdb24c59021e0Pavel Březinastatic int
1f800ebb0f190854b8296146174f3d696a426333Simo Sorcefts_filter_stemmer_snowball_create_stemmer(struct fts_filter_stemmer_snowball *sp,
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce const char **error_r)
850ca620611f65115ee95e1d919be8443f95c14cLukas Slebodnik{
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce sp->stemmer = sb_stemmer_new(sp->lang->name, "UTF_8");
850ca620611f65115ee95e1d919be8443f95c14cLukas Slebodnik if (sp->stemmer == NULL) {
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce *error_r = t_strdup_printf(
1f800ebb0f190854b8296146174f3d696a426333Simo Sorce "Creating a Snowball stemmer for language '%s' failed.",
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce sp->lang->name);
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce fts_filter_stemmer_snowball_destroy(&sp->filter);
0232747f04b650796db56fd7b487aee8a96fab03Simo Sorce return -1;
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek }
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek return 0;
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek}
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozekstatic int
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozekfts_filter_stemmer_snowball_filter(struct fts_filter *filter,
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek const char **token, const char **error_r)
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek{
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek struct fts_filter_stemmer_snowball *sp =
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek (struct fts_filter_stemmer_snowball *) filter;
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek const sb_symbol *base;
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek if (sp->stemmer == NULL) {
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek if (fts_filter_stemmer_snowball_create_stemmer(sp, error_r) < 0)
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek return -1;
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek }
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek base = sb_stemmer_stem(sp->stemmer, (const unsigned char *)*token, strlen(*token));
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek if (base == NULL) {
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek /* the only reason why this could fail is because of
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek out of memory. */
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek i_fatal_status(FATAL_OUTOFMEM,
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek "sb_stemmer_stem(len=%"PRIuSIZE_T") failed: "
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek "Out of memory", strlen(*token));
33c865412732554ef255e93c4e7a58b0bce963c6Jakub Hrozek }
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce *token = t_strndup(base, sb_stemmer_length(sp->stemmer));
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce return 1;
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce}
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce#else
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorcestatic int
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorcefts_filter_stemmer_snowball_create(const struct fts_language *lang ATTR_UNUSED,
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce const char *const *settings ATTR_UNUSED,
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce struct fts_filter **filter_r ATTR_UNUSED,
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce const char **error_r)
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce{
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce *error_r = "Snowball support not built in";
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce return -1;
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce}
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorcestatic void
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorcefts_filter_stemmer_snowball_destroy(struct fts_filter *stemmer ATTR_UNUSED)
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce{
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce}
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorce
bba1a5fd62cffcae076d1351df5a83fbc4a6ec17Simo Sorcestatic int
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březinafts_filter_stemmer_snowball_filter(struct fts_filter *filter ATTR_UNUSED,
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina const char **token ATTR_UNUSED,
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina const char **error_r ATTR_UNUSED)
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina{
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina return -1;
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina}
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina#endif
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březinastatic const struct fts_filter fts_filter_stemmer_snowball_real = {
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina .class_name = "snowball",
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina .v = {
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina fts_filter_stemmer_snowball_create,
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina fts_filter_stemmer_snowball_filter,
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina fts_filter_stemmer_snowball_destroy
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina }
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina};
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina
/* Exported handle to the snowball filter class template. */
const struct fts_filter *fts_filter_stemmer_snowball =
	&fts_filter_stemmer_snowball_real;
e1f68731525116ce686ffcdc07ad3a14e4fb1cd7Pavel Březina