fts-tokenizer-generic.c revision e8ee7a23b194516087ba27f1af09213991af0b30
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "bsearch-insert-pos.h"
#include "fts-common.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
#include "word-boundary-data.c"
#include "word-break-data.c"
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
/* Word-break lookup table for 7-bit ASCII, indexed by code point (0-127).
   A non-zero entry means the character separates words; zero means it is
   kept as part of a token. Control characters 0-31, space and ASCII
   punctuation are breaks. Digits, letters, '_' (95) and DEL (127) are not.
   The apostrophe (39) is 0 here: the lookup site tests its apostrophe flag
   before consulting this table.
   The table is only ever read, so it is declared const. */
static const unsigned char fts_ascii_word_breaks[128] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */

	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: space !"#$%&()*+,-./ (not ') */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0  /* 112-127: {|}~ */
};
/* Constructor for the generic tokenizer.

   NOTE(review): a large part of this function's body is not visible in
   this view (setting parsing, allocation and the assignments that follow
   the algorithm check), so only what the visible lines show is described.

   - max_length starts out as FTS_DEFAULT_TOKEN_MAX_LENGTH.
   - A maxlen value that ends up as 0 is rejected with
     "Invalid maxlen setting: %s" and the function returns -1.
   - An unrecognised algorithm value is rejected with
     "Invalid algorithm: %s" and the function returns -1.
   - There is one more visible -1 return in a trailing else branch;
     presumably this is the unknown-setting case - TODO confirm.
   - After the settings are processed the code branches on
     algo == BOUNDARY_ALGORITHM_TR29 and finally returns 0.

   Presumably the error text is delivered through error_r and the new
   tokenizer through tokenizer_r - verify against the full source. */
static int
fts_tokenizer_generic_create(const char *const *settings,
struct fts_tokenizer **tokenizer_r,
const char **error_r)
{
struct generic_fts_tokenizer *tok;
unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
unsigned int i;
max_length == 0) {
"Invalid maxlen setting: %s", value);
return -1;
}
;
else {
"Invalid algorithm: %s", value);
return -1;
}
/* tokenizing a search string -
makes no difference to us */
} else {
return -1;
}
}
if (algo == BOUNDARY_ALGORITHM_TR29)
else
return 0;
}
static void
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
}
/* Helper used when a token has been truncated: it makes sure the token
   does not end in the middle of a character.

   NOTE(review): the rest of the parameter list and most of the loop body
   are not visible in this view. Presumably char_bytes holds the encoded
   length of the character being examined and the length is passed by
   pointer so it can be shortened - confirm against the full source. */
static void
fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
{
unsigned int char_bytes;
/* the token is truncated - make sure the last character
exists entirely in the token */
break;
}
}
}
static bool
const char **token_r)
{
/* Remove the trailing apostrophe - it was made
into U+0027 earlier. There can be only a single such
apostrophe, because otherwise the token would have already
been split. We also want to remove the trailing apostrophe
only if it's the last character in the nontruncated
token - a truncated token may end with apostrophe. */
len--;
}
} else {
}
tok->untruncated_length = 0;
return len > 0;
}
{
}
/* Decide whether the Unicode code point c is a word break.
   Returns TRUE for a break; FALSE is the final fall-through result.

   NOTE(review): in this view only the General Punctuation range test is
   visible. The remaining `return TRUE` statements presumably each follow
   a lookup (using idx) into one of the tables from word-break-data.c, but
   those guards are not shown here - confirm against the full source. */
static bool fts_uni_word_break(unichar_t c)
{
unsigned int idx;
/* Unicode General Punctuation, including deprecated characters. */
if (c >= 0x2000 && c <= 0x206f)
return TRUE;
/* From word-break-data.c, which is generated from PropList.txt. */
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return FALSE;
}
static inline bool
unichar_t c, bool apostrophe)
{
if (apostrophe)
else if (c < 0x80)
return fts_ascii_word_breaks[c] != 0;
else
return fts_uni_word_break(c);
}
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
tok->untruncated_length = 0;
}
{
}
static int
const char **error_r ATTR_UNUSED)
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
int char_size;
unichar_t c;
bool apostrophe;
apostrophe = IS_APOSTROPHE(c);
return 1;
}
/* it doesn't actually matter at this point whether
subsequent apostrophes are handled by prefix
skipping or by ignoring empty tokens - they will be
dropped in any case. */
} else if (apostrophe) {
/* all apostrophes require special handling */
const unsigned char apostrophe_char = '\'';
} else {
}
}
/* word boundary not found yet */
*skip_r = i;
/* return the last token */
if (size == 0) {
return 1;
}
return 0;
}
/* TODO: Arrange array searches roughly in order of likelihood of a match.
TODO: Make some array of the arrays, so this can be a foreach loop.
TODO: Check for Hangul.
TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
HYPHEN.
TODO
*/
{
unsigned int idx;
if (IS_APOSTROPHE(c))
return LETTER_TYPE_APOSTROPHE;
return LETTER_TYPE_CR;
return LETTER_TYPE_LF;
return LETTER_TYPE_NEWLINE;
return LETTER_TYPE_EXTEND;
return LETTER_TYPE_REGIONAL_INDICATOR;
return LETTER_TYPE_FORMAT;
return LETTER_TYPE_KATAKANA;
return LETTER_TYPE_HEBREW_LETTER;
return LETTER_TYPE_ALETTER;
return LETTER_TYPE_SINGLE_QUOTE;
return LETTER_TYPE_DOUBLE_QUOTE;
return LETTER_TYPE_MIDNUMLET;
return LETTER_TYPE_MIDLETTER;
return LETTER_TYPE_MIDNUM;
return LETTER_TYPE_NUMERIC;
return LETTER_TYPE_EXTENDNUMLET;
return LETTER_TYPE_OTHER;
}
{
i_panic("Letter type should not be used.");
}
/* WB3, WB3a and WB3b, but really different since we try to eat
whitespace between words. */
{
return TRUE;
}
{
/* WB4 */
return FALSE;
}
{
/* WB13c */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5 */
return FALSE;
/* WB7 WB7c, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5 */
return FALSE;
/* WB7, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB6 */
return FALSE;
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE;
}
{
/* WB6 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB8 */
return FALSE;
/* WB9 */
return FALSE;
/* WB11 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13a */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE; /* Any / Any */
}
static void
{
} else
}
/*
TODO: Define what to skip between words.
TODO: Include double quotation marks? Messes up parsing?
TODO: Does this "reverse approach" include too much in "whitespace"?
TODO: Possibly use is_word_break()?
*/
{
return FALSE;
return TRUE;
}
/* The way things are done WB6/7 and WB11/12 "false positives" can
leave trailing unwanted chars. They are searched for here. This is
very kludgy and should be coded into the rules themselves
somehow.
*/
{
/* WB6/7 false positive detected at one past end. */
return TRUE;
/* WB11/12 false positive detected at one past end. */
return TRUE;
return FALSE;
}
static void
const char **token_r)
{
if (is_one_past_end(tok) &&
/* delete the last character */
len--;
len--;
}
/* we're skipping all non-token chars at the beginning of the word,
so by this point we must have something here - even if we just
deleted the last character */
tok->untruncated_length = 0;
}
struct letter_fn {
};
static struct letter_fn letter_fns[] = {
};
/*
Find word boundaries in input text. Based on Unicode Standard Annex
#29, but tailored for FTS purposes.
Adaptations:
* No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
* Break just once, not before and after.
* Other things also (e.g. is_nontoken()), so this is not really pure
TR29. Meant to assist in finding individual words.
*/
static bool
{
/* No rule knows what to do with just one char, except the linebreaks
we eat away (above) anyway. */
return TRUE;
}
/* These types are completely ignored. */
} else {
}
return FALSE;
}
static int
const char **error_r ATTR_UNUSED)
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
unichar_t c;
enum letter_type lt;
int char_size;
/* TODO: Process 8bit chars separately, to speed things up. */
for (i = 0; i < size; ) {
char_start_i = i;
i += char_size;
lt = letter_type(c);
/* Skip non-token chars at the beginning of token */
start_pos = i;
continue;
}
*skip_r = i;
return 1;
} else if (lt == LETTER_TYPE_APOSTROPHE ||
lt == LETTER_TYPE_SINGLE_QUOTE) {
/* all apostrophes require special handling */
const unsigned char apostrophe_char = '\'';
start_pos = i;
}
}
*skip_r = i;
/* return the last token */
*skip_r = 0;
return 1;
}
return 0;
}
static int
const unsigned char *data ATTR_UNUSED,
const char **token_r ATTR_UNUSED,
const char **error_r ATTR_UNUSED)
{
i_unreached();
}
static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
};
/* Descriptor for this tokenizer: its name is the string "generic" and its
   operations table is generic_tokenizer_vfuncs, defined just above. */
static const struct fts_tokenizer fts_tokenizer_generic_real = {
.name = "generic",
.v = &generic_tokenizer_vfuncs
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
};