test-fts-tokenizer.c revision b1965419f329eb7cf78ee39e7c5942462eabb256
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "sha2.h"
#include "hex-binary.h"
#include "test-common.h"
#include "fts-tokenizer.h"
#include "fts-tokenizer-private.h"
/* TODO: fix the inclusion and linking of the private header below. */
/* #include "fts-tokenizer-generic-private.h" */
#include <stdlib.h>
/* Test case "fts tokenizer generic simple": declares a mixed ASCII input
   (CRLF, tab, punctuation, quotes and one over-long word) together with the
   tokens expected from it.
   NOTE(review): in this copy of the file the tokenizer creation, the
   tokenizing loops and the assertions are not visible; only the loop tails
   (eopp++ and closing braces) remain, so the braces below do not balance.
   The missing statements must be restored before this test can build. */
static void test_fts_tokenizer_generic_only(void)
{
static const unsigned char input[] =
"hello world\r\nAnd there\twas: text "
"galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
/* Expected tokens in input order. The last visible entry is the first 30
   characters of the long input word (presumably the tokenizer's maximum
   token length -- TODO confirm). */
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
"and", "longlonglongabcdefghijklmnopqr",
};
/* NOTE(review): tok_class and tok are declared but never assigned in the
   visible code. */
const struct fts_tokenizer *tok_class;
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced by the eopp++ statements below. */
const char * const *eopp = expected_output;
test_begin("fts tokenizer generic simple");
/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);*/
/* NOTE(review): the two "eopp++; }" pairs below have no visible opening
   loop header in this view. */
eopp++;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer generic simple with Unicode whitespace": like the
   plain generic test, but some separators in the input are multi-byte UTF-8
   sequences instead of ASCII whitespace/punctuation.
   NOTE(review): the tokenizer creation, tokenizing loops and assertions are
   not visible in this copy; only the loop tails remain and the braces do not
   balance. */
static void test_fts_tokenizer_generic_unicode_whitespace(void)
{
/* The input contains these code points as UTF-8 byte sequences:
   U+FF01 (ef bc 81), U+2000 (e2 80 80), U+205A (e2 81 9a) and
   U+205F (e2 81 9f). */
static const unsigned char input[] =
"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
/* Expected tokens in input order (as far as listed in this copy). */
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
};
/* NOTE(review): tok_class and tok are declared but never assigned in the
   visible code. */
const struct fts_tokenizer *tok_class;
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced by the eopp++ statements below. */
const char * const *eopp = expected_output;
test_begin("fts tokenizer generic simple with Unicode whitespace");
/* NOTE(review): the loop headers belonging to these loop tails are not
   visible in this view. */
eopp++;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer generic simple input one character at a time":
   walks over an address-list style input by index and steps through the
   expected tokens.
   NOTE(review): the tokenizer creation, the call that produces ret and the
   assertions are not visible in this copy of the file. */
static void test_fts_tokenizer_char_generic_only(void)
{
static const unsigned char input[] =
"abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* Expected tokens in input order (as far as listed in this copy). */
static const char *const expected_output[] = {
"abc", "example", "com", "Bar", "Baz",
};
/* NOTE(review): tok is declared but never assigned in the visible code. */
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer generic simple input one character at a time");
/* The bound is inclusive, so the loop also runs with i == sizeof(input)-1,
   the index of the terminating NUL (presumably the pass that signals end of
   input to the tokenizer -- TODO confirm). i only advances when ret == 0. */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer generic TR29": declares an ASCII input containing
   words, punctuation, quoted text and numbers written with '.' and ','
   separators, together with the tokens expected from it.
   NOTE(review): the tokenizer creation, tokenizing loops and assertions are
   not visible in this copy; only the loop tails remain and the braces do not
   balance. */
static void test_fts_tokenizer_generic_tr29_only(void)
{
static const unsigned char input[] =
"hello world\r\n\nAnd there\twas: text "
"galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
" longlonglongabcdefghijklmnopqrstuvwxyz 1.";
/* Expected tokens in input order (as far as listed in this copy). Note that
   "3.14" is expected as a single token. */
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
"and", "more", "Hello", "world", "3.14",
};
/* NOTE(review): tok_class and tok are declared but never assigned in the
   visible code. */
const struct fts_tokenizer *tok_class;
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced by the eopp++ statements below. */
const char * const *eopp = expected_output;
test_begin("fts tokenizer generic TR29");
/* NOTE(review): the loop headers belonging to these loop tails are not
   visible in this view. */
eopp++;
}
eopp++;
}
test_end();
}
/* TODO: U+206F is in the "Format" category and is therefore currently not
   treated as a word break. This definitely needs to be remapped. */
/* Test case "fts tokenizer generic TR29 with Unicode whitespace": like the
   TR29 test, but some separators in the input are multi-byte UTF-8 sequences.
   NOTE(review): the tokenizer creation, tokenizing loops and assertions are
   not visible in this copy; only the loop tails remain and the braces do not
   balance. */
static void test_fts_tokenizer_generic_tr29_unicode_whitespace(void)
{
/* The input contains these code points as UTF-8 byte sequences:
   U+2000 (e2 80 80), U+205A (e2 81 9a) and U+205F (e2 81 9f). */
static const unsigned char input[] =
"hello world\r\nAnd\xE2\x80\x80there\twas: text "
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
/* Expected tokens in input order (as far as listed in this copy). */
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
};
/* NOTE(review): tok_class and tok are declared but never assigned in the
   visible code. */
const struct fts_tokenizer *tok_class;
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced by the eopp++ statements below. */
const char * const *eopp = expected_output;
test_begin("fts tokenizer generic TR29 with Unicode whitespace");
/* NOTE(review): the loop headers belonging to these loop tails are not
   visible in this view. */
eopp++;
}
eopp++;
}
test_end();
}
/* Test case for TR29 tokenizing of an input whose last character is U+FF0E.
   NOTE(review): the tokenizer creation, tokenizing loops and assertions are
   not visible in this copy; only the loop tails remain and the braces do not
   balance. */
static void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
{
/* "hello world" followed by U+FF0E, which is EF BC 8E in UTF-8. */
static const unsigned char input[] =
"hello world\xEF\xBC\x8E";
/* NOTE(review): the expected token list is empty in this copy; an empty
   initializer list is not valid before C23, so the entries are presumably
   missing -- restore them. */
static const char *const expected_output[] = {
};
/* NOTE(review): tok_class and tok are declared but never assigned in the
   visible code. */
const struct fts_tokenizer *tok_class;
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced by the eopp++ statements below. */
const char * const *eopp = expected_output;
/* NOTE(review): the test name says "MinNumLet" while the function name says
   midnumlet; the string is left unchanged because it is runtime output. */
test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
/* NOTE(review): the loop headers belonging to these loop tails are not
   visible in this view. */
eopp++;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer generic TR29 input one character at a time":
   walks over an address-list style input by index and steps through the
   expected tokens.
   NOTE(review): the tokenizer creation, the call that produces ret and the
   assertions are not visible in this copy of the file. */
static void test_fts_tokenizer_char_generic_tr29_only(void)
{
static const unsigned char input[] =
"abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* NOTE(review): the expected token list is empty in this copy; an empty
   initializer list is not valid before C23, so the entries are presumably
   missing -- restore them. */
static const char *const expected_output[] = {
};
/* NOTE(review): tok is declared but never assigned in the visible code. */
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer generic TR29 input one character at a time");
/* The bound is inclusive, so the loop also runs with i == sizeof(input)-1,
   the index of the terminating NUL (presumably the pass that signals end of
   input to the tokenizer -- TODO confirm). i only advances when ret == 0. */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address only, input one line at a time":
   walks over an array of input lines by index and steps through the expected
   address tokens.
   NOTE(review): the tokenizer creation, the call that produces ret and the
   assertions are not visible in this copy of the file. */
static void test_fts_tokenizer_line_address_only(void)
{
/* Input lines; each array element is one chunk of input. */
static const char *const input[] = {
"abc@example.com",
" Bar Baz <bar@example.org>",
"foo@domain",
" moro foo@domain Bar Baz <bar@example.org>"
};
/* Expected tokens in input order (as far as listed in this copy). */
static const char *const expected_output[] = {
"abc@example.com", "bar@example.org",
};
/* NOTE(review): tok is declared but never assigned in the visible code. */
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer email address only, input one line at a time");
/* The bound is inclusive, so the loop runs one pass with
   i == N_ELEMENTS(input), i.e. past the last line (presumably the pass that
   signals end of input to the tokenizer -- TODO confirm that input[i] is not
   dereferenced on that pass). i only advances when ret == 0. */
for (i = 0; i <= N_ELEMENTS(input);) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address only, input one character at a
   time": walks over the input by index and steps through the expected
   address tokens.
   NOTE(review): the tokenizer creation, the call that produces ret and the
   assertions are not visible in this copy of the file. */
static void test_fts_tokenizer_char_address_only(void)
{
/* Starts with two strings that have an '@' but no local part or no domain
   ("@invalid", "invalid@"); neither appears in expected_output. */
static const unsigned char input[] =
"@invalid invalid@ abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* Expected tokens in input order, terminated by NULL. */
static const char *const expected_output[] = {
"abc@example.com", "bar@example.org",
"foo@domain", NULL
};
/* NOTE(review): tok is declared but never assigned in the visible code. */
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer email address only, input one character at a time");
/* The bound is inclusive, so the loop also runs with i == sizeof(input)-1,
   the index of the terminating NUL (presumably the pass that signals end of
   input to the tokenizer -- TODO confirm). i only advances when ret == 0. */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address, input random length": walks over
   the input in steps of `step` and steps through the expected address tokens.
   NOTE(review): the tokenizer creation, the declarations of i and step, the
   code that chooses step, the call that produces ret and the assertions are
   not visible in this copy of the file. */
static void test_fts_tokenizer_rand_address_only(void)
{
/* Starts with two strings that have an '@' but no local part or no domain
   ("@invalid", "invalid@"); neither appears in expected_output. */
static const unsigned char input[] =
"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, "
"Foo Bar (comment)foo.bar@host.example.org foo ";
/* Expected tokens in input order (as far as listed in this copy). */
static const char *const expected_output[] = {
"abc.dfg@example.com",
"foo.bar@host.example.org",
};
/* NOTE(review): tok is declared but never assigned in the visible code. */
struct fts_tokenizer *tok;
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
int ret;
test_begin("fts tokenizer email address, input random length");
/* NOTE(review): i and step are used below without a visible declaration.
   The bound is inclusive, so the loop can run with i == sizeof(input)-1
   (presumably the end-of-input pass -- TODO confirm). */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i += step;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address + parent, input one character at a
   time": the expected output lists both the word parts of each address and
   the complete address.
   NOTE(review): no tokenizer variable, tokenizer creation, call that
   produces ret, or assertion is visible in this copy of the file. */
static void test_fts_tokenizer_address_char(void)
{
static const unsigned char input[] =
"@invalid invalid@ abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* Expected tokens in input order (as far as listed in this copy): for each
   address its parts come first, followed by the full address. */
static const char *const expected_output[] = {
"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
"bar", "example", "org", "bar@example.org",
};
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer email address + parent, input one character at a time");
/* The bound is inclusive, so the loop also runs with i == sizeof(input)-1,
   the index of the terminating NUL (presumably the pass that signals end of
   input to the tokenizer -- TODO confirm). i only advances when ret == 0. */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address + parent, input one line at a
   time": walks over an array of input lines by index; the expected output
   lists both the word parts of each address and the complete address.
   NOTE(review): no tokenizer variable, tokenizer creation, call that
   produces ret, or assertion is visible in this copy of the file. */
static void test_fts_tokenizer_address_line(void)
{
/* Input lines; each array element is one chunk of input. */
static const char *const input[] = {
"@invalid invalid@ abc@example.com, ",
"Bar Baz <bar@example.org>, ",
"foo@domain, ",
"foo@domain Bar Baz <bar@example.org>, "
};
/* Expected tokens in input order (as far as listed in this copy): for each
   address its parts come first, followed by the full address. */
static const char *const expected_output[] = {
"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
"bar", "example", "org", "bar@example.org",
"foo", "domain", "foo@domain",
"foo", "domain", "foo@domain", "Bar", "Baz",
};
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer email address + parent, input one line at a time");
/* The bound is inclusive, so the loop runs one pass with
   i == N_ELEMENTS(input), i.e. past the last line (presumably the pass that
   signals end of input to the tokenizer -- TODO confirm that input[i] is not
   dereferenced on that pass). i only advances when ret == 0. */
for (i = 0; i <= N_ELEMENTS(input);) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer email address + parent, input random length":
   walks over the input in steps of `step`; the expected output lists both
   the word parts of each address and the complete address.
   NOTE(review): no tokenizer variable, tokenizer creation, declaration of i
   or step, code that chooses step, call that produces ret, or assertion is
   visible in this copy of the file. */
static void test_fts_tokenizer_address_rand(void)
{
static const unsigned char input[] =
"@invalid invalid@ abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* Expected tokens in input order (as far as listed in this copy): for each
   address its parts come first, followed by the full address. */
static const char *const expected_output[] = {
"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
"bar", "example", "org", "bar@example.org",
};
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
int ret;
test_begin("fts tokenizer email address + parent, input random length");
/* Disabled fixed seed, kept as a record of a seed that once exposed a bug. */
//srand(1424142100); /* had a bug */
/* NOTE(review): i and step are used below without a visible declaration.
   The bound is inclusive, so the loop can run with i == sizeof(input)-1
   (presumably the end-of-input pass -- TODO confirm). */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i += step;
continue;
}
eopp++;
}
test_end();
}
/* Test case "fts tokenizer search email address + parent, input one
   character at a time": creates an email-address tokenizer on top of a
   parent tokenizer (gen_tok) and steps through the expected tokens.
   NOTE(review): the declarations of gen_tok, settings, tok and error, the
   creation of gen_tok, the call that produces ret and the assertions are not
   visible in this copy of the file. */
static void test_fts_tokenizer_address_search(void)
{
static const unsigned char input[] =
"@invalid invalid@ abc@example.com, "
"Bar Baz <bar@example.org>, "
"foo@domain";
/* Expected tokens in input order (as far as listed in this copy). Unlike
   the non-search address tests in this file, "abc@example.com" is listed
   without its separate parts "abc", "example", "com". */
static const char *const expected_output[] = {
"invalid", "invalid", "abc@example.com", "Bar", "Baz",
};
/* Cursor into expected_output, advanced once per loop pass that does not
   take the ret == 0 branch. */
const char * const *eopp = expected_output;
unsigned int i;
int ret;
test_begin("fts tokenizer search email address + parent, input one character at a time");
/* Creation must succeed (return 0). gen_tok is passed as the parent
   tokenizer and settings as the tokenizer settings. */
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
/* The bound is inclusive, so the loop also runs with i == sizeof(input)-1,
   the index of the terminating NUL (presumably the pass that signals end of
   input to the tokenizer -- TODO confirm). i only advances when ret == 0. */
for (i = 0; i <= sizeof(input)-1; ) {
/* NOTE(review): ret is read here but no assignment to it is visible in this
   view; the tokenizer call that sets it appears to be missing. */
if (ret == 0) {
i++;
continue;
}
eopp++;
}
/* NOTE(review): the two comments below describe checks whose code is not
   visible in this view. */
/* make sure state is forgotten at EOF */
/* test reset explicitly */
test_end();
}
/* Test program entry point; returns ret as the process exit status.
   NOTE(review): in this copy of the file the list of test functions is empty
   and no statement assigns ret, so ret is returned uninitialized as shown.
   The table entries and the call that runs them appear to be missing --
   restore them. */
int main(void)
{
/* Table of test functions to run (empty in this view). */
static void (*test_functions[])(void) = {
};
int ret;
return ret;
}