fts-tokenizer-generic.c revision 05031501650185cde7150fd538d35ce66d87b526
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "bsearch-insert-pos.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
#include "word-boundary-data.c"
#include "word-break-data.c"
/* Initial value of the tokenizer's max_length before any settings are
   applied. */
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30

/* True for U+2019 (RIGHT SINGLE QUOTATION MARK) and U+FF07 (FULLWIDTH
   APOSTROPHE). The argument is evaluated more than once, so it must not
   have side effects. */
#define IS_NONASCII_APOSTROPHE(c) ((c) == 0x2019 || (c) == 0xFF07)

/* True for the ASCII apostrophe U+0027 as well as the non-ASCII
   apostrophes matched by IS_NONASCII_APOSTROPHE(). The argument is
   evaluated more than once, so it must not have side effects. */
#define IS_APOSTROPHE(c) ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
/* Lookup table indexed by an ASCII code point (0-127). A non-zero entry
   means the character is treated as a word break; zero means it is kept
   as part of a token. Digits, letters, the apostrophe (39) and the
   underscore (95) are zero. The table is read-only, hence const.

   NOTE(review): entry 127 (DEL) is 0 although the control characters
   0-31 are all 1 - confirm that this is intended. */
static const unsigned char fts_ascii_word_breaks[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: space and !"#$%&()*+,-./ break, ' does not */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */
};
static int
fts_tokenizer_generic_create(const char *const *settings,
struct fts_tokenizer **tokenizer_r,
const char **error_r)
{
struct generic_fts_tokenizer *tok;
unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
unsigned int i;
max_length == 0) {
"Invalid maxlen setting: %s", value);
return -1;
}
;
else {
"Invalid algorithm: %s", value);
return -1;
}
/* tokenizing a search string -
makes no difference to us */
} else {
return -1;
}
}
if (algo == BOUNDARY_ALGORITHM_TR29)
else
return 0;
}
static void
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
}
{
/* if input is truncated with a partial UTF-8 character, drop it */
}
static bool
const char **token_r)
{
const unsigned char *data;
/* clean trailing and starting apostrophes. they were all made
into U+0027 earlier. */
len--;
start++;
return (*token_r)[0] != '\0';
}
{
}
/* Returns TRUE when the code point c counts as a word break and FALSE
   otherwise. Code points in U+2000..U+206F are accepted outright before
   any other check. */
static bool fts_uni_word_break(unichar_t c)
{
unsigned int idx;
/* Unicode General Punctuation, including deprecated characters. */
if (c >= 0x2000 && c <= 0x206f)
return TRUE;
/* From word-break-data.c, which is generated from PropList.txt. */
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return FALSE;
}
static inline bool
unichar_t c, bool apostrophe)
{
if (apostrophe)
else if (c < 0x80)
return fts_ascii_word_breaks[c] != 0;
else
return fts_uni_word_break(c);
}
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
}
{
unichar_t c;
/* Append only one kind of apostrophe. This simplifies things when
returning the token. */
while (pos < append_len) {
i_unreached();
if (IS_NONASCII_APOSTROPHE(c)) {
}
}
if (appended < append_len)
}
static int
const char **error_r ATTR_UNUSED)
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
unsigned int char_size;
unichar_t c;
bool apostrophe;
i_unreached();
apostrophe = IS_APOSTROPHE(c);
return 1;
}
/* it doesn't actually matter at this point whether
subsequent apostrophes are handled by prefix
skipping or by ignoring empty tokens - they will be
dropped in any case. */
} else {
}
}
/* word boundary not found yet */
*skip_r = i;
/* return the last token */
return 1;
}
return 0;
}
/* TODO: Arrange array searches roughly in order of likelihood of a match.
TODO: Make some array of the arrays, so this can be a foreach loop.
TODO: Check for Hangul.
TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
HYPHEN.
TODO
*/
{
unsigned int idx;
if (IS_APOSTROPHE(c))
return LETTER_TYPE_APOSTROPHE;
return LETTER_TYPE_CR;
return LETTER_TYPE_LF;
return LETTER_TYPE_NEWLINE;
return LETTER_TYPE_EXTEND;
return LETTER_TYPE_REGIONAL_INDICATOR;
return LETTER_TYPE_FORMAT;
return LETTER_TYPE_KATAKANA;
return LETTER_TYPE_HEBREW_LETTER;
return LETTER_TYPE_ALETTER;
return LETTER_TYPE_SINGLE_QUOTE;
return LETTER_TYPE_DOUBLE_QUOTE;
return LETTER_TYPE_MIDNUMLET;
return LETTER_TYPE_MIDLETTER;
return LETTER_TYPE_MIDNUM;
return LETTER_TYPE_NUMERIC;
return LETTER_TYPE_EXTENDNUMLET;
return LETTER_TYPE_OTHER;
}
{
i_panic("Letter type should not be used.");
}
/* WB3, WB3a and WB3b, but really different since we try to eat
whitespace between words. */
{
return TRUE;
}
{
/* WB4 */
return FALSE;
}
{
/* WB13c */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5 */
return FALSE;
/* WB7 WB7c, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5 */
return FALSE;
/* WB7, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB6 */
return FALSE;
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE;
}
{
/* WB6 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB8 */
return FALSE;
/* WB9 */
return FALSE;
/* WB11 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13a */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE; /* Any / Any */
}
static void
{
} else
}
/*
TODO: Define what to skip between words.
TODO: Include double quotation marks? Messes up parsing?
TODO: Does this "reverse approach" include too much in "whitespace"?
TODO: Possibly use is_word_break()?
*/
{
return FALSE;
return TRUE;
}
/* The way things are done WB6/7 and WB11/12 "false positives" can
leave trailing unwanted chars. They are searched for here. This is
very kludgy and should be coded into the rules themselves
somehow.
*/
{
/* WB6/7 false positive detected at one past end. */
return TRUE;
/* WB11/12 false positive detected at one past end. */
return TRUE;
return FALSE;
}
static void
const char **token_r)
{
if (is_one_past_end(tok)) {
/* delete the last character */
len--;
len--;
}
/* we're skipping all non-token chars at the beginning of the word,
so by this point we must have something here - even if we just
deleted the last character */
}
struct letter_fn {
};
static struct letter_fn letter_fns[] = {
};
/*
Find word boundaries in input text. Based on Unicode Standard Annex
#29, but tailored for FTS purposes.
Adaptations:
* No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
* Break just once, not before and after.
* Other things also (e.g. is_nontoken()), so this is not really pure
TR29. Meant to assist in finding individual words.
*/
static bool
{
/* No rule knows what to do with just one char, except the linebreaks
we eat away (above) anyway. */
return TRUE;
}
/* These types are completely ignored. */
} else {
}
return FALSE;
}
static int
const char **error_r ATTR_UNUSED)
{
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
unichar_t c;
enum letter_type lt;
/* TODO: Process 8bit chars separately, to speed things up. */
for (i = 0; i < size; ) {
char_start_i = i;
i_unreached();
i += uni_utf8_char_bytes(data[i]);
lt = letter_type(c);
/* Skip non-token chars at the beginning of token */
start_skip = i;
continue;
}
*skip_r = i;
return 1;
}
}
*skip_r = i;
/* return the last token */
*skip_r = 0;
return 1;
}
return 0;
}
static int
const unsigned char *data ATTR_UNUSED,
const char **token_r ATTR_UNUSED,
const char **error_r ATTR_UNUSED)
{
i_unreached();
}
static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
};
/* Tokenizer descriptor named "generic"; its vfuncs pointer refers to
   generic_tokenizer_vfuncs. */
static const struct fts_tokenizer fts_tokenizer_generic_real = {
.name = "generic",
.v = &generic_tokenizer_vfuncs
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
};