test-fts-tokenizer.c revision c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3
2e37d45867d081db150ab78dad303b9077aea24fTimo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen static const unsigned char input[] =
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainen "hello world\r\nAnd there\twas: text "
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen "galore, and more.\n\n (\"Hello world\")last ";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
d1e7425048c61d71f41f737ba947687198842dc2Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_unicode_whitespace(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen U+205A (e2 81 9a) and U+205F (e2 81 9f) */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic simple with Unicode whitespace");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_char_generic_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example", "org", "foo", "domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen test_begin("fts tokenizer generic simple input one character at a time");
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
2e2a1d720ed53490e8e5c5031e773d395bd5683dTimo Sirainenconst char *const tr29_settings[] = {"algorithm", "tr29", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\r\n\nAnd there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore, and more.\n\n (\"Hello world\")3.14 3,14 last 1.";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen/* TODO: U+206F is classified as "Format" and is therefore currently not
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen treated as a word break. This definitely needs to be remapped. */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_unicode_whitespace(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* with Unicode (UTF-8) U+2000 (e2 80 80), U+205A (e2 81 9a) and U+205F (e2 81 9f) */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\r\nAnd\xE2\x80\x80there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic TR29 with Unicode whitespace");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* U+FF0E (FULLWIDTH FULL STOP) is EF BC 8E in UTF-8 */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\xEF\xBC\x8E";
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen static const char *const expected_output[] = {
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_char_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen test_begin("fts tokenizer generic TR29 input one character at a time");
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_line_address_only(void)
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen static const char *const input[] = {
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "abc@example.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen " Bar Baz <bar@example.org>",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen " moro foo@domain Bar Baz <bar@example.org>"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain", "foo@domain", "bar@example.org", NULL
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen unsigned int i;
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen test_begin("fts tokenizer email address only, input one line at a time");
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, (unsigned char *)input[i], strlen(input[i])) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainenstatic void test_fts_tokenizer_char_address_only(void)
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen static const unsigned char input[] =
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "abc@example.com, "
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "Bar Baz <bar@example.org>, "
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address only, input one character at a time");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_rand_address_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen "Abc Dfg <abc.dfg@example.com>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org foo ";
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen static const char *const expected_output[] = {
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen "abc.dfg@example.com",
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen "foo.bar@host.example.org",
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_begin("fts tokenizer email address, input random length");
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL,
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainenstatic void test_fts_tokenizer_address_char(void)
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen static const char *const expected_output[] = {
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
6998ca95b4947c90647ac5d4794ebd6311acada2Timo Sirainen unsigned int i;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_begin("fts tokenizer email address + parent, input one character at a time");
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_line(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const input[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example", "org", "bar@example.org", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address + parent, input one line at a time");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, (unsigned char *)input[i], strlen(input[i])) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainenstatic void test_fts_tokenizer_address_rand(void)
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen static const unsigned char input[] =
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "abc@example.com, "
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "Bar Baz <bar@example.org>, "
cb2c44f33d9d48f58e4c5e42ba2526a0c100218aTimo Sirainen "foo@domain";
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen static const char *const expected_output[] = {
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address + parent, input random length");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
6998ca95b4947c90647ac5d4794ebd6311acada2Timo Sirainen //srand(1424142100); /* had a bug */
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static void (*test_functions[])(void) = {
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen test_fts_tokenizer_generic_unicode_whitespace,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_tr29_unicode_whitespace,