bcb4e51a409d94ae670de96afb8483a4f7855294Stephan Bosch/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#ifndef TEXTCAT_RESULT_UNKNOWN /* old textcat.h has typos */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen# define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN
48afa4224df2a6bcfe75fec11a59c224426dcdc1Teemu Huovila/* ISO 639-1 alpha 2 codes for languages */
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilaconst struct fts_language fts_languages_builtin [] = {
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila { "no" }, /* Both Bokmal and Nynorsk are detected as Norwegian */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_language fts_language_data = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila fts_languages_pool = pool_alloconly_create("fts_language",
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila p_array_init(&fts_languages, fts_languages_pool,
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila for (i = 0; i < N_ELEMENTS(fts_languages_builtin); i++){
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila lang = p_new(fts_languages_pool, struct fts_language, 1);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila lang->name = p_strdup(fts_languages_pool, name);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila array_append(&fts_languages, (const struct fts_language **)&lang, 1);
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovilaconst struct fts_language *fts_language_find(const char *name)
4b26f71b46fc718be27933dfaa26905b993d87faTeemu Huovila const struct fts_language *const *langp = NULL;
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainenint fts_language_list_init(const char *const *settings,
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen const char **error_r)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen else if (strcmp(key, "fts_language_data") == 0)
a9b3887f4d9ed75a76fed964c1930432bf84f4f5Timo Sirainen *error_r = t_strdup_printf("Unknown setting: %s", key);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool = pool_alloconly_create("fts_language_list", 128);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp = p_new(pool, struct fts_language_list, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_deinit(struct fts_language_list **list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_language *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_find(struct fts_language_list *list, const char *name)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_add(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(fts_language_list_find(list, lang->name) == NULL);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenbool fts_language_list_add_names(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* unknown language */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (fts_language_list_find(list, lang->name) == NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_all(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_first(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool fts_language_match_lists(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (int i = 0; i < candp_len; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* name is <lang>-<optional country or characterset>-<encoding>
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen eg, fi--utf8 or pt-PT-utf8 */
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila /* For Norwegian we treat both bokmal and nynorsk as "no". */
3ec8b0d282d46d1f698b1f2aa27922cb8f26cb97Teemu Huovila if (strcmp(name, "nb") == 0 || strcmp(name, "nn") == 0)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if ((*lang_r = fts_language_list_find(list, name)) != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic int fts_language_textcat_init(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen config_path = list->textcat_config != NULL ? list->textcat_config :
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen data_dir = list->textcat_datadir != NULL ? list->textcat_datadir :
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen list->textcat_handle = special_textcat_Init(config_path, data_dir);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_error("special_textcat_Init(%s, %s) failed",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* The textcat minimum document size could be set here. It
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen currently defaults to 3. UTF8 is enabled by default. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language **lang_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candidate_t *candp; /* textcat candidate result array pointer */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candp = textcat_GetClassifyFullOutput(list->textcat_handle);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL");
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen cnt = textcat_ClassifyFull(list->textcat_handle, (const void *)text,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen match = fts_language_match_lists(list, candp, cnt, lang_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* if there's only a single wanted language, return it always. */