bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "lib.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "array.h"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#include "llist.h"
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila#include "fts-language.h"
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen# include <libexttextcat/textcat.h>
82eadbc4311faf7719d5db33fddaa06cb3a7010bTimo Sirainen#elif defined (HAVE_FTS_EXTTEXTCAT)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen# include <textcat.h>
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#endif
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
/* Old textcat.h releases misspell the constant as TEXTCAT_RESULT_UNKOWN.
   When only the misspelled name exists, alias the correct spelling to it. */
#if !defined(TEXTCAT_RESULT_UNKNOWN) && defined(TEXTCAT_RESULT_UNKOWN)
#  define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN
#endif

/* Upper bound on how many bytes of the input text are handed to textcat
   for a single language detection. */
#define DETECT_STR_MAX_LEN 200
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstruct fts_language_list {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool_t pool;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen ARRAY_TYPE(fts_language) languages;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *textcat_config;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *textcat_datadir;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen void *textcat_handle;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen bool textcat_failed;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilapool_t fts_languages_pool;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu HuovilaARRAY_TYPE(fts_language) fts_languages;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila/* ISO 639-1 alpha 2 codes for languages */
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilaconst struct fts_language fts_languages_builtin [] = {
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "da" }, /* Danish */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "de" }, /* German */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "en" }, /* English */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "es" }, /* Spanish */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "fi" }, /* Finnish */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "fr" }, /* French */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "it" }, /* Italian */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "nl" }, /* Dutch */
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila { "no" }, /* Both Bokmal and Nynorsk are detected as Norwegian */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "pt" }, /* Portuguese */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "ro" }, /* Romanian */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "ru" }, /* Russian */
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila { "sv" } /* Swedish */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_language fts_language_data = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen "data"
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen};
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilavoid fts_languages_init(void)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila const struct fts_language *lp;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila fts_languages_pool = pool_alloconly_create("fts_language",
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila sizeof(fts_languages_builtin));
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila p_array_init(&fts_languages, fts_languages_pool,
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila N_ELEMENTS(fts_languages_builtin));
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila for (i = 0; i < N_ELEMENTS(fts_languages_builtin); i++){
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila lp = &fts_languages_builtin[i];
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila array_append(&fts_languages, &lp, 1);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila }
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila}
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilavoid fts_languages_deinit(void)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila{
c69a177207ed18d0f0210347430a60957136bd6cJosef 'Jeff' Sipek pool_unref(&fts_languages_pool);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila}
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilavoid fts_language_register(const char *name)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila{
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila struct fts_language *lang;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila if (fts_language_find(name) != NULL)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila return;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila lang = p_new(fts_languages_pool, struct fts_language, 1);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila lang->name = p_strdup(fts_languages_pool, name);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila array_append(&fts_languages, (const struct fts_language **)&lang, 1);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila}
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilaconst struct fts_language *fts_language_find(const char *name)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila{
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila const struct fts_language *const *langp = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila array_foreach(&fts_languages, langp) {
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila if (strcmp((*langp)->name, name) == 0)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila return *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainenint fts_language_list_init(const char *const *settings,
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen struct fts_language_list **list_r,
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_language_list *lp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool_t pool;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen const char *conf = NULL, *data = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; settings[i] != NULL; i += 2) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen if (strcmp(key, "fts_language_config") == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen conf = value;
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen else if (strcmp(key, "fts_language_data") == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen data = value;
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen else {
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen return -1;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool = pool_alloconly_create("fts_language_list", 128);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp = p_new(pool, struct fts_language_list, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp->pool = pool;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (conf != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp->textcat_config = p_strdup(pool, conf);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp->textcat_config = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (data != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp->textcat_datadir = p_strdup(pool, data);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp->textcat_datadir = NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen p_array_init(&lp->languages, pool, 32);
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen *list_r = lp;
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen return 0;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_deinit(struct fts_language_list **list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen struct fts_language_list *lp = *list;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *list = NULL;
82eadbc4311faf7719d5db33fddaa06cb3a7010bTimo Sirainen#ifdef HAVE_FTS_EXTTEXTCAT
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lp->textcat_handle != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_Done(lp->textcat_handle);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#endif
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool_unref(&lp->pool);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_language *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_find(struct fts_language_list *list, const char *name)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language *const *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_foreach(&list->languages, langp) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp((*langp)->name, name) == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return NULL;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_add(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language *lang)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(fts_language_list_find(list, lang->name) == NULL);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_append(&list->languages, &lang, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenbool fts_language_list_add_names(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *names,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char **unknown_name_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *const *langs;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language *lang;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lang = fts_language_find(*langs);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (lang == NULL) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* unknown language */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *unknown_name_r = *langs;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (fts_language_list_find(list, lang->name) == NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen fts_language_list_add(list, lang);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return TRUE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst ARRAY_TYPE(fts_language) *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_all(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return &list->languages;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_language *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_first(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language *const *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen langp = array_idx(&list->languages, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
#ifdef HAVE_FTS_EXTTEXTCAT
/* Walk the textcat candidates in order and pick the first one whose
   language is in `list`.

   Returns TRUE with *lang_r set to the matching list entry, or FALSE if no
   candidate matched (*lang_r is then NULL whenever candp_len > 0). */
static bool fts_language_match_lists(struct fts_language_list *list,
				     candidate_t *candp, int candp_len,
				     const struct fts_language **lang_r)
{
	int idx;

	for (idx = 0; idx < candp_len; idx++) {
		/* candidate name is
		   <lang>-<optional country or characterset>-<encoding>
		   eg, fi--utf8 or pt-PT-utf8; keep only <lang> */
		const char *code = t_strcut(candp[idx].name, '-');
		/* Bokmal (nb) and Nynorsk (nn) are both treated as "no" */
		bool norwegian = strcmp(code, "nb") == 0 ||
			strcmp(code, "nn") == 0;

		*lang_r = fts_language_list_find(list, norwegian ? "no" : code);
		if (*lang_r != NULL)
			return TRUE;
	}
	return FALSE;
}
#endif
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
#ifdef HAVE_FTS_EXTTEXTCAT
/* Create the list's textcat handle on first use.

   Returns 0 if a handle is available (already existing or newly created).
   Returns -1 if initialization fails or has failed before; the failure is
   logged once and remembered so it isn't retried. */
static int fts_language_textcat_init(struct fts_language_list *list)
{
	/* compiled-in defaults, overridden below by the list's settings */
	const char *config_path = TEXTCAT_DATADIR"/fpdb.conf";
	const char *data_dir = TEXTCAT_DATADIR"/";

	if (list->textcat_handle != NULL)
		return 0;
	if (list->textcat_failed)
		return -1;

	if (list->textcat_config != NULL)
		config_path = list->textcat_config;
	if (list->textcat_datadir != NULL)
		data_dir = list->textcat_datadir;

	list->textcat_handle = special_textcat_Init(config_path, data_dir);
	if (list->textcat_handle != NULL) {
		/* The textcat minimum document size could be set here. It
		   currently defaults to 3. UTF8 is enabled by default. */
		return 0;
	}

	i_error("special_textcat_Init(%s, %s) failed",
		config_path, data_dir);
	list->textcat_failed = TRUE;
	return -1;
}
#endif
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic enum fts_language_result
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *text ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t size ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language **lang_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
82eadbc4311faf7719d5db33fddaa06cb3a7010bTimo Sirainen#ifdef HAVE_FTS_EXTTEXTCAT
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candidate_t *candp; /* textcat candidate result array pointer */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen int cnt;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen bool match = FALSE;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (fts_language_textcat_init(list) < 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_ERROR;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candp = textcat_GetClassifyFullOutput(list->textcat_handle);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (candp == NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL");
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen cnt = textcat_ClassifyFull(list->textcat_handle, (const void *)text,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen I_MIN(size, DETECT_STR_MAX_LEN), candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (cnt > 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen T_BEGIN {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen match = fts_language_match_lists(list, candp, cnt, lang_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } T_END;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (match)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_OK;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_UNKNOWN;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen } else {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen switch (cnt) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen case TEXTCAT_RESULT_SHORT:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(size < DETECT_STR_MAX_LEN);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_SHORT;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen case TEXTCAT_RESULT_UNKNOWN:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_UNKNOWN;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen default:
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_unreached();
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#else
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_UNKNOWN;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#endif
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenenum fts_language_result
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const unsigned char *text ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen size_t size ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language **lang_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen{
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(array_count(&list->languages) > 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* if there's only a single wanted language, return it always. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (array_count(&list->languages) == 1) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language *const *langp =
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen array_idx(&list->languages, 0);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen *lang_r = *langp;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return FTS_LANGUAGE_RESULT_OK;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen }
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen return fts_language_detect_textcat(list, text, size, lang_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen}