/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "buffer.h"
#include "str.h"
#include "unichar.h"
#include "bsearch-insert-pos.h"
#include "fts-common.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
#include "fts-tokenizer-common.h"
#include "word-boundary-data.c"
#include "word-break-data.c"
/* ASCII word-break flags, one entry per code point 0-127. A nonzero
   entry makes the fts_ascii_word_breaks[c] != 0 lookup report the
   character as a word break. The apostrophe (39) is 0 here; apostrophes
   are checked separately before this table is consulted. Digits,
   letters, underscore (95) and DEL (127) are 0 as well. */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15: control characters */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31: control characters */
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47: space !"#$%&()*+,-./ (everything except the apostrophe) */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? (digits are not breaks) */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ (underscore is not a break) */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ (DEL is not a break) */
};
static int
struct fts_tokenizer **tokenizer_r,
const char **error_r)
{
unsigned int i;
max_length == 0) {
"Invalid maxlen setting: %s", value);
return -1;
}
;
else {
"Invalid algorithm: %s", value);
return -1;
}
/* tokenizing a search string -
makes no difference to us */
else
} else {
return -1;
}
}
*error_r = "Can not use WB5a for algorithms other than TR29.";
return -1;
}
if (algo == BOUNDARY_ALGORITHM_TR29)
else
return 0;
}
static void
{
(struct generic_fts_tokenizer *)_tok;
}
static bool
const char **token_r)
{
/* Remove the trailing apostrophe - it was made
into U+0027 earlier. There can be only a single such
apostrophe, because otherwise the token would have already
been split. We also want to remove the trailing apostrophe
only if it's the last character in the nontruncated
token - a truncated token may end with apostrophe. */
len--;
}
} else {
}
tok->untruncated_length = 0;
return len > 0;
}
{
}
{
/* Returns TRUE when c is a word-break character: either it lies in
   the General Punctuation block (U+2000..U+206F) or it is found in
   one of the tables from word-break-data.c. Returns FALSE when
   nothing matches. */
unsigned int idx;
/* Unicode General Punctuation, including deprecated characters. */
if (c >= 0x2000 && c <= 0x206f)
return TRUE;
/* From word-break-data.c, which is generated from PropList.txt. */
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return TRUE;
return FALSE;
}
/* Decide whether code point c is a word break.
   - apostrophe: flag supplied by the caller (presumably the result of
     IS_APOSTROPHE(c)); apostrophes take their own branch instead of a
     table lookup.
   - c < 0x80: ASCII, answered by the fts_ascii_word_breaks[] table.
   - anything else: answered by fts_uni_word_break(). */
static inline bool
unichar_t c, bool apostrophe)
{
if (apostrophe)
else if (c < 0x80)
return fts_ascii_word_breaks[c] != 0;
else
return fts_uni_word_break(c);
}
{
(struct generic_fts_tokenizer *)_tok;
tok->untruncated_length = 0;
}
{
}
static int
const char **error_r ATTR_UNUSED)
{
(struct generic_fts_tokenizer *)_tok;
int char_size;
unichar_t c;
bool apostrophe;
apostrophe = IS_APOSTROPHE(c);
return 1;
}
/* it doesn't actually matter at this point whether
subsequent apostrophes are handled by prefix
skipping or by ignoring empty tokens - they will be
dropped in any case. */
} else if (apostrophe) {
/* all apostrophes require special handling */
} else {
}
}
/* word boundary not found yet */
*skip_r = i;
/* return the last token */
if (size == 0) {
return 1;
}
return 0;
}
/* Map a code point to its LETTER_TYPE_* class.

   Apostrophes are recognized first (IS_APOSTROPHE) and yield
   LETTER_TYPE_APOSTROPHE. The remaining classes are tried in the order
   of the return statements below; a code point that matches none of
   them is LETTER_TYPE_OTHER. */
/* TODO: Arrange array searches roughly in order of likelihood of a match.
TODO: Make some array of the arrays, so this can be a foreach loop.
TODO: Check for Hangul.
TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
HYPHEN.
TODO
*/
{
unsigned int idx;
if (IS_APOSTROPHE(c))
return LETTER_TYPE_APOSTROPHE;
return LETTER_TYPE_CR;
return LETTER_TYPE_LF;
return LETTER_TYPE_NEWLINE;
return LETTER_TYPE_EXTEND;
return LETTER_TYPE_REGIONAL_INDICATOR;
return LETTER_TYPE_FORMAT;
return LETTER_TYPE_KATAKANA;
return LETTER_TYPE_HEBREW_LETTER;
return LETTER_TYPE_ALETTER;
return LETTER_TYPE_SINGLE_QUOTE;
return LETTER_TYPE_DOUBLE_QUOTE;
return LETTER_TYPE_MIDNUMLET;
return LETTER_TYPE_MIDLETTER;
return LETTER_TYPE_MIDNUM;
return LETTER_TYPE_NUMERIC;
return LETTER_TYPE_EXTENDNUMLET;
/* Nothing matched. */
return LETTER_TYPE_OTHER;
}
{
i_panic("Letter type should not be used.");
}
/* WB3, WB3a and WB3b, but really different since we try to eat
whitespace between words. */
{
return TRUE;
}
{
/* WB4 */
return FALSE;
}
{
/* WB13c */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5 */
return FALSE;
/* WB7 WB7c, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB5a */
return TRUE;
}
/* WB5 */
return FALSE;
/* WB7, except MidNumLet */
return FALSE;
/* WB10 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB6 */
return FALSE;
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE;
}
{
/* WB6 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB12 */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB8 */
return FALSE;
/* WB9 */
return FALSE;
/* WB11 */
return FALSE;
/* WB13b */
return FALSE;
return TRUE; /* Any / Any */
}
{
/* WB13a */
return FALSE;
return TRUE; /* Any / Any */
}
{
return FALSE;
return TRUE; /* Any / Any */
}
{
return TRUE; /* Any / Any */
}
static void
{
}
static void
{
}
/*
TODO: Define what to skip between words.
TODO: Include double quotation marks? Messes up parsing?
TODO: Does this "reverse approach" include too much in "whitespace"?
TODO: Possibly use is_word_break()?
*/
{
return FALSE;
return TRUE;
}
/* The way things are done, WB6/7 and WB11/12 "false positives" can
leave trailing unwanted chars. They are searched for here. This is
very kludgy and should be coded into the rules themselves
somehow.

Returns TRUE when such a false positive was detected one past the end
of the current token, FALSE otherwise. The token-finishing code
consults this before deleting the last character of the token.
*/
{
/* WB6/7 false positive detected at one past end. */
return TRUE;
/* WB11/12 false positive detected at one past end. */
return TRUE;
return FALSE;
}
static void
const char **token_r)
{
if (is_one_past_end(tok) &&
/* delete the last character */
len--;
len--;
}
/* we're skipping all non-token chars at the beginning of the word,
so by this point we must have something here - even if we just
deleted the last character */
tok->untruncated_length = 0;
}
{
tok->prev_letter_c = 0;
}
struct letter_fn {
};
};
/*
Find word boundaries in input text. Based on Unicode standard annex
#29, but tailored for FTS purposes.
Note: The text of tr29 is a living standard, so it keeps
changing. In newer specs some characters are combined, like AHLetter
(ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
Adaptions:
* Added optional WB5a as a configurable option. The cut of prefix is
max FTS_WB5A_PREFIX chars.
* No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
* Break just once, not before and after.
* Other things also (e.g. is_nontoken()), not really pure tr29. Meant
to assist in finding individual words.
*/
static bool
{
/* No rule knows what to do with just one char, except the linebreaks
we eat away (above) anyway. */
return TRUE;
}
/* These types are completely ignored. */
} else {
}
return FALSE;
}
static int
const char **error_r ATTR_UNUSED)
{
(struct generic_fts_tokenizer *)_tok;
unichar_t c;
int char_size;
for (i = 0; i < size; ) {
char_start_i = i;
i += char_size;
lt = letter_type(c);
/* The WB5a break is detected only when the "after
break" char is inspected. That char needs to be
reinserted as the "previous char". */
/* Skip non-token chars at the beginning of token */
start_pos = i;
continue;
}
add_letter_c(tok, c);
*skip_r = i;
return 1;
} else if (lt == LETTER_TYPE_APOSTROPHE ||
lt == LETTER_TYPE_SINGLE_QUOTE) {
/* all apostrophes require special handling */
start_pos = i;
}
}
*skip_r = i;
/* return the last token */
*skip_r = 0;
return 1;
}
return 0;
}
static int
const unsigned char *data ATTR_UNUSED,
const char **token_r ATTR_UNUSED,
const char **error_r ATTR_UNUSED)
{
i_unreached();
}
};
.name = "generic",
.v = &generic_tokenizer_vfuncs
};
};
};