fts-language.c revision 82eadbc4311faf7719d5db33fddaa06cb3a7010b
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen#ifndef TEXTCAT_RESULT_UNKNOWN /* old textcat.h has typos */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen# define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_language fts_language_data = {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenconst struct fts_language *fts_language_find(const char *name)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (i = 0; i < N_ELEMENTS(fts_languages); i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_init(const char *const *settings)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen unsigned int i;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *key = settings[i], *value = settings[i+1];
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (strcmp(key, "fts_language_config") == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen else if (strcmp(key, "fts_language_data") == 0) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen pool = pool_alloconly_create("fts_language_list", 128);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen lp = p_new(pool, struct fts_language_list, 1);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_deinit(struct fts_language_list **list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic const struct fts_language *
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_find(struct fts_language_list *list, const char *name)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenvoid fts_language_list_add(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_assert(fts_language_list_find(list, lang->name) == NULL);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenbool fts_language_list_add_names(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const char *const *langs;
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* unknown language */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if (fts_language_list_find(list, lang->name) == NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_all(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_list_get_first(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic bool fts_language_match_lists(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen for (int i = 0; i < candp_len; i++) {
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* name is <lang>-<optional country or characterset>-<encoding>
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen eg, fi--utf8 or pt-PT-utf8 */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen if ((*lang_r = fts_language_list_find(list, name)) != NULL)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenstatic int fts_language_textcat_init(struct fts_language_list *list)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen config_path = list->textcat_config != NULL ? list->textcat_config :
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen data_dir = list->textcat_datadir != NULL ? list->textcat_datadir :
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen list->textcat_handle = special_textcat_Init(config_path, data_dir);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_error("special_textcat_Init(%s, %s) failed",
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* The textcat minimum document size could be set here. It
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen currently defaults to 3. UTF8 is enabled by default. */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen const struct fts_language **lang_r ATTR_UNUSED)
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candidate_t *candp; /* textcat candidate result array pointer */
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen candp = textcat_GetClassifyFullOutput(list->textcat_handle);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL");
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen cnt = textcat_ClassifyFull(list->textcat_handle, (const void *)text,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen match = fts_language_match_lists(list, candp, cnt, lang_r);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp);
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainenfts_language_detect(struct fts_language_list *list,
c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3Timo Sirainen /* if there's only a single wanted language, return it always. */