test-fts-tokenizer.c revision c4b772bfbdafe68ac1a0076eab26cd681f8e5046
02c335c23bf5fa225a467c19f2c063fb0dc7b8c3Timo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen#include "lib.h"
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen#include "sha2.h"
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi#include "hex-binary.h"
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen#include "test-common.h"
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen#include "fts-tokenizer.h"
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen#include "fts-tokenizer-private.h"
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen/* TODO: fix including and linking of this. */
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen/* #include "fts-tokenizer-generic-private.h" */
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen#include <stdlib.h>
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainenstatic void test_fts_tokenizer_generic_only(void)
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen{
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen static const unsigned char input[] =
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen "hello world\r\nAnd there\twas: text "
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen "galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen static const char *const expected_output[] = {
daa7e7459749ae8f82cd3eed9c44522d81c609a3Timo Sirainen "hello", "world", "And",
46ec5983bf4519ea42dbfcae3d7c62be0d8ef95fTimo Sirainen "there", "was", "text", "galore",
bbadd5331f534017cf62d5183003b3d9fdad079eTimo Sirainen "and", "longlonglongabcdefghijklmnopqr",
6523f54d1521edf894880f2d45e75cef5dd31c3dTimo Sirainen "more", "Hello", "world", "last", NULL
72f5f2c5c6905b5d3f389b424313e2c450dfad96Timo Sirainen };
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen const struct fts_tokenizer *tok_class;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen struct fts_tokenizer *tok;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen const char * const *eopp = expected_output;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen const char *token, *error;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen
373492be949e159fda651807b3acda2c5c077027Timo Sirainen test_begin("fts tokenizer generic simple");
bbadd5331f534017cf62d5183003b3d9fdad079eTimo Sirainen fts_tokenizers_init();
ab90f702ceedb7ba445a9a592be0b213b27cbafaStephan Bosch tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
ab90f702ceedb7ba445a9a592be0b213b27cbafaStephan Bosch test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);*/
da7f1a07f583df8905684a7b78469960afd7c78dPhil Carmody while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen eopp++;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen }
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_assert(strcmp(token, *eopp) == 0);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen eopp++;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen }
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_assert(*eopp == NULL);
635df5b4cbcd7b24c825e01d9dd66d3a4274c4c7Timo Sirainen fts_tokenizer_unref(&tok);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizers_deinit();
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_end();
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen}
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainenstatic void test_fts_tokenizer_generic_unicode_whitespace(void)
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen{
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen /* with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen U+205A(e2 81 9a) and U+205F(e2 81 9f )*/
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen static const unsigned char input[] =
51fb710488efa419a2964335c30451c62b9633b1Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
93a7d1ee4b518b5c85f9721dc6539e4dab6aae00Timo Sirainen static const char *const expected_output[] = {
f7f25f9e1a38678d0e97d2e609beac16285fac6bTimo Sirainen "hello", "world", "And",
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "there", "was", "text", "galore",
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "and", "more", NULL
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen };
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen const struct fts_tokenizer *tok_class;
35fcdde46a71ac151c2518d48c841019f1181bb2Timo Sirainen struct fts_tokenizer *tok;
35fcdde46a71ac151c2518d48c841019f1181bb2Timo Sirainen const char * const *eopp = expected_output;
35fcdde46a71ac151c2518d48c841019f1181bb2Timo Sirainen const char *token, *error;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen test_begin("fts tokenizer generic simple with Unicode whitespace");
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
635df5b4cbcd7b24c825e01d9dd66d3a4274c4c7Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen eopp++;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen }
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
da7f1a07f583df8905684a7b78469960afd7c78dPhil Carmody test_assert(strcmp(token, *eopp) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen eopp++;
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen }
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen test_assert(*eopp == NULL);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen fts_tokenizer_unref(&tok);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_end();
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen}
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainenstatic void test_fts_tokenizer_char_generic_only(void)
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen{
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen static const unsigned char input[] =
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "abc@example.com, "
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "Bar Baz <bar@example.org>, "
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "foo@domain";
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen static const char *const expected_output[] = {
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "abc", "example", "com", "Bar", "Baz",
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "bar", "example", "org", "foo", "domain", NULL
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen };
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen struct fts_tokenizer *tok;
ba8498efbf886ca8b69fdb20c0ba2f5dba9416e3Timo Sirainen const char * const *eopp = expected_output;
93a7d1ee4b518b5c85f9721dc6539e4dab6aae00Timo Sirainen const char *token, *error;
f7f25f9e1a38678d0e97d2e609beac16285fac6bTimo Sirainen unsigned int i;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen int ret;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_begin("fts tokenizer generic simple input one character at a time");
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
9f0f2de10e4ea0c99052bf4b2bef8179f2536228Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen ret = i < sizeof(input)-1 ?
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, &input[i], 1, &token) :
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen if (ret == 0) {
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen i++;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen continue;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen eopp++;
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen }
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen test_assert(*eopp == NULL);
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen fts_tokenizer_unref(&tok);
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen test_end();
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen}
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
/* NULL-terminated settings list passed to fts_tokenizer_create() by the
   TR29 tests below: the strings "algorithm" and "tr29". */
const char *const tr29_settings[] = {
	"algorithm", "tr29",
	NULL
};
380dbb60ae291cbe39d1f710284562ca9167150bTimo Sirainen
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen{
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen static const unsigned char input[] =
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "hello world\r\n\nAnd there\twas: text "
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
1b823b2b7790a1e1b7974fcf11a4c48a28e70f37Timo Sirainen " longlonglongabcdefghijklmnopqrstuvwxyz 1.";
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen static const char *const expected_output[] = {
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen "hello", "world", "And",
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen "there", "was", "text", "galore",
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen "and", "more", "Hello", "world", "3.14",
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen "3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen };
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen const struct fts_tokenizer *tok_class;
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen struct fts_tokenizer *tok;
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen const char * const *eopp = expected_output;
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen const char *token, *error;
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen test_begin("fts tokenizer generic TR29");
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen test_assert(strcmp(token, *eopp) == 0);
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen eopp++;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen }
1b823b2b7790a1e1b7974fcf11a4c48a28e70f37Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen eopp++;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen }
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(*eopp == NULL);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_unref(&tok);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen test_end();
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen}
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen This definitely needs to be remapped. */
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_unicode_whitespace(void)
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen{
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen /* with Unicode(utf8) U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen 81 9f)*/
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen static const unsigned char input[] =
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "hello world\r\nAnd\xE2\x80\x80there\twas: text "
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen static const char *const expected_output[] = {
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen "hello", "world", "And",
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "there", "was", "text", "galore",
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen "and", "more", NULL
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen };
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen const struct fts_tokenizer *tok_class;
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen struct fts_tokenizer *tok;
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen const char * const *eopp = expected_output;
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen const char *token, *error;
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_begin("fts tokenizer generic TR29 with Unicode whitespace");
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
1b823b2b7790a1e1b7974fcf11a4c48a28e70f37Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
74fb6b5a156c5a61bb6ec827089bb142a10547ddTimo Sirainen while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen eopp++;
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen }
04052d7cacaa866a3f00afb4e104fa46c04c1dd7Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen eopp++;
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen }
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen test_assert(*eopp == NULL);
cd75c360f244c96b9ee10e01ee3a66fad13183c8Timo Sirainen fts_tokenizer_unref(&tok);
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen test_end();
b484ab4b55b0d5341f2f4dd98a655a75f0bf1275Timo Sirainen}
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainen
3cf67672fdc87583cb23ce088c95bb5dee60e74dTimo Sirainenstatic void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen{
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen /* u+FF0E is EF BC 8E */
1b823b2b7790a1e1b7974fcf11a4c48a28e70f37Timo Sirainen static const unsigned char input[] =
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "hello world\xEF\xBC\x8E";
544a727de8ab0e6c55cab18a7ee475fffdf5eff3Timo Sirainen static const char *const expected_output[] = {
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "hello", "world", NULL
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen };
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen const struct fts_tokenizer *tok_class;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen struct fts_tokenizer *tok;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen const char * const *eopp = expected_output;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen const char *token, *error;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
57d2429fae575e96ca276355af675deb66b76d00Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
57d2429fae575e96ca276355af675deb66b76d00Timo Sirainen eopp++;
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen }
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen eopp++;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen test_assert(*eopp == NULL);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_unref(&tok);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_end();
61618d4c58080570f689614fec204ae14e90cef2Timo Sirainen}
50e20db49f29917fe9adcf1b56b11badf28bd0e4Timo Sirainen
50e20db49f29917fe9adcf1b56b11badf28bd0e4Timo Sirainenstatic void test_fts_tokenizer_char_generic_tr29_only(void)
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen{
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen static const unsigned char input[] =
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "abc@example.com, "
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "Bar Baz <bar@example.org>, "
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "foo@domain";
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen static const char *const expected_output[] = {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "abc", "example.com", "Bar", "Baz",
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "bar", "example.org", "foo", "domain", NULL
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen };
50e20db49f29917fe9adcf1b56b11badf28bd0e4Timo Sirainen struct fts_tokenizer *tok;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen const char * const *eopp = expected_output;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi const char *token, *error;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi unsigned int i;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi int ret;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi test_begin("fts tokenizer generic TR29 input one character at a time");
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizer_register(fts_tokenizer_generic);
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
44cf91b7a701a9b4d9f59a990552eab4f7f64fbcTimo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen ret = i < sizeof(input)-1 ?
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, &input[i], 1, &token) :
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen if (ret == 0) {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen i++;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen continue;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen eopp++;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen test_assert(*eopp == NULL);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_unref(&tok);
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
1da01eaa962be13cee75771064e2256b1a82d90aTimo Sirainen test_end();
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen}
f01eb1f51d618633c0189be9ab60a774f47fb7dfTimo Sirainen
f01eb1f51d618633c0189be9ab60a774f47fb7dfTimo Sirainenstatic void test_fts_tokenizer_line_address_only(void)
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen{
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen static const char *const input[] = {
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen "abc@example.com",
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen " Bar Baz <bar@example.org>",
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen "foo@domain",
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen " moro foo@domain Bar Baz <bar@example.org>"
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen };
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen static const char *const expected_output[] = {
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "abc@example.com", "bar@example.org",
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen "foo@domain", "foo@domain", "bar@example.org", NULL
f2df3069766c747cbf020fea5d3a4261949064b0Timo Sirainen };
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen struct fts_tokenizer *tok;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen const char * const *eopp = expected_output;
062ea54b7775d0c92ed67b9b1f4d93fa8ec80c84Timo Sirainen const char *token, *error;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen unsigned int i;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen int ret;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_begin("fts tokenizer email address only, input one line at a time");
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi for (i = 0; i <= N_ELEMENTS(input);) {
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi ret = i < N_ELEMENTS(input) ?
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizer_next(tok, (unsigned char *)input[i],
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi strlen(input[i]), &token) :
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizer_next(tok, NULL, 0, &token);
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi if (ret == 0) {
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi i++;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi continue;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi }
44cf91b7a701a9b4d9f59a990552eab4f7f64fbcTimo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen eopp++;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen }
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_assert(*eopp == NULL);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_unref(&tok);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_end();
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen}
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainenstatic void test_fts_tokenizer_char_address_only(void)
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen{
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen static const unsigned char input[] =
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen "@invalid invalid@ abc@example.com, "
401b0787fff2dc986a5321ddb32acb1947ff66b0Timo Sirainen "Bar Baz <bar@example.org>, "
1da01eaa962be13cee75771064e2256b1a82d90aTimo Sirainen "foo@domain";
f01eb1f51d618633c0189be9ab60a774f47fb7dfTimo Sirainen static const char *const expected_output[] = {
f01eb1f51d618633c0189be9ab60a774f47fb7dfTimo Sirainen "abc@example.com", "bar@example.org",
f01eb1f51d618633c0189be9ab60a774f47fb7dfTimo Sirainen "foo@domain", NULL
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen };
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen struct fts_tokenizer *tok;
7744586e3e0fd60158abfbb03a233d3bd8d6c48bTimo Sirainen const char * const *eopp = expected_output;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen const char *token, *error;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen unsigned int i;
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen int ret;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen test_begin("fts tokenizer email address only, input one character at a time");
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
383d0e8c24451468d6bea17e4b55d74de744abe6Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen
383d0e8c24451468d6bea17e4b55d74de744abe6Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
7bafda1813454621e03615e83d55bccfa7cc56bdTimo Sirainen ret = i < sizeof(input)-1 ?
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, &input[i], 1, &token) :
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen if (ret == 0) {
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen i++;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen continue;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen }
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen eopp++;
4ee00532a265bdfb38539d811fcd12d51210ac35Timo Sirainen }
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen test_assert(*eopp == NULL);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen fts_tokenizer_unref(&tok);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen test_end();
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen}
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainenstatic void test_fts_tokenizer_rand_address_only(void)
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen{
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen static const unsigned char input[] =
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, "
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen "Foo Bar (comment)foo.bar@host.example.org foo ";
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen static const char *const expected_output[] = {
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen "abc.dfg@example.com",
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen "foo.bar@host.example.org",
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen NULL
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen };
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen struct fts_tokenizer *tok;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen const char * const *eopp = expected_output;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen const char *token, *error;
31a574fda352ef4f71dbff9c30e15e4744e132c0Timo Sirainen unsigned int i, step, step_max = 10;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen int ret;
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen test_begin("fts tokenizer email address, input random length");
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL,
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainen NULL, &tok, &error) == 0);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen step = rand() % step_max + 1;
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen ret = i < sizeof(input)-1 ?
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_next(tok, &input[i], step, &token) :
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen if (ret == 0) {
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen i += step;
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen step = rand() % step_max + 1;
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen step = I_MIN(step, sizeof(input) - i);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen continue;
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen }
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen eopp++;
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen }
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen test_assert(*eopp == NULL);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_unref(&tok);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen test_end();
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen}
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainenstatic void test_fts_tokenizer_address_char(void)
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainen{
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainen static const unsigned char input[] =
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainen "@invalid invalid@ abc@example.com, "
3190f12fb96daf61f7c880390472e18184cbb2d8Timo Sirainen "Bar Baz <bar@example.org>, "
a5bcc9f96bf56121a0704433c12137a43cd093beTimo Sirainen "foo@domain";
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen static const char *const expected_output[] = {
a5bcc9f96bf56121a0704433c12137a43cd093beTimo Sirainen "invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen "bar", "example", "org", "bar@example.org",
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen "foo", "domain", "foo@domain", NULL
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen };
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen const char * const *eopp = expected_output;
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen const char *token, *error;
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen unsigned int i;
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen int ret;
15f526e5ac611b4532568d131fcd0abf664abe41Timo Sirainen
15f526e5ac611b4532568d131fcd0abf664abe41Timo Sirainen test_begin("fts tokenizer email address + parent, input one character at a time");
15f526e5ac611b4532568d131fcd0abf664abe41Timo Sirainen fts_tokenizers_init();
15f526e5ac611b4532568d131fcd0abf664abe41Timo Sirainen
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
f93c833d644ecff0b0f80bee4f1cdde3e697b5c8Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
f93c833d644ecff0b0f80bee4f1cdde3e697b5c8Timo Sirainen
f93c833d644ecff0b0f80bee4f1cdde3e697b5c8Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
f93c833d644ecff0b0f80bee4f1cdde3e697b5c8Timo Sirainen ret = i < sizeof(input)-1 ?
f93c833d644ecff0b0f80bee4f1cdde3e697b5c8Timo Sirainen fts_tokenizer_next(tok, &input[i], 1, &token) :
6bd263caf006edc75205f446fa0283c6f364941bTimo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen if (ret == 0) {
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen i++;
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen continue;
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen }
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen test_assert(*eopp != NULL);
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen eopp++;
4fc74bba3548987b7e8597491cd9fafc1f701be6Timo Sirainen }
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen test_assert(*eopp == NULL);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_unref(&tok);
4addfd26372c6ae32ec93252696d86fd32081327Timo Sirainen fts_tokenizer_unref(&gen_tok);
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen fts_tokenizers_deinit();
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen test_end();
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen}
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainenstatic void test_fts_tokenizer_address_line(void)
0f62889d833767acf9c2ad010c3269806b4cfae3Timo Sirainen{
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen static const char *const input[] = {
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "@invalid invalid@ abc@example.com, ",
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "Bar Baz <bar@example.org>, ",
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "foo@domain, ",
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "foo@domain Bar Baz <bar@example.org>, "
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen };
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen static const char *const expected_output[] = {
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "bar", "example", "org", "bar@example.org",
7289c5600711b45f30fe289ab5b0293b51d87041Timo Sirainen "foo", "domain", "foo@domain",
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen "foo", "domain", "foo@domain", "Bar", "Baz",
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi "bar", "example", "org", "bar@example.org", NULL
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi };
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi struct fts_tokenizer *tok, *gen_tok;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi const char * const *eopp = expected_output;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi const char *token, *error;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi unsigned int i;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi int ret;
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi test_begin("fts tokenizer email address + parent, input one line at a time");
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizers_init();
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi for (i = 0; i <= N_ELEMENTS(input);) {
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi ret = i < N_ELEMENTS(input) ?
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizer_next(tok, (unsigned char *)input[i],
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi strlen(input[i]), &token) :
203bb272804e4394ae07103cdc8ce67041ba21a1Aki Tuomi fts_tokenizer_next(tok, NULL, 0, &token);
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen if (ret == 0) {
738cfeb96c4b9cd92aa3c791d77734c2745cdd1aTimo Sirainen i++;
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen continue;
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen }
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen eopp++;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen }
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen test_assert(*eopp == NULL);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen fts_tokenizer_unref(&tok);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen fts_tokenizer_unref(&gen_tok);
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen fts_tokenizers_deinit();
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen test_end();
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen}
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainenstatic void test_fts_tokenizer_address_rand(void)
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen{
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen static const unsigned char input[] =
559f278a4c54d9fa7e0f2e96ebceda30562f9009Timo Sirainen "@invalid invalid@ abc@example.com, "
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen "Bar Baz <bar@example.org>, "
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen "foo@domain";
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen static const char *const expected_output[] = {
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen "invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen "bar", "example", "org", "bar@example.org",
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen "foo", "domain", "foo@domain", NULL
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen };
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen const char * const *eopp = expected_output;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen const char *token, *error;
272aca0a772140d3a45a425a3fd67854ae2ccec2Timo Sirainen unsigned int i, step, step_max = 10;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen int ret;
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen test_begin("fts tokenizer email address + parent, input random length");
9dd1c256910f1fb42823116a641e7edb3ad11970Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
d477acb83e14a776ece4ca94dcd1869e75d0c6eeTimo Sirainen
1d22eaac93de41319918a1fc6de42bb302e25c1aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
1d22eaac93de41319918a1fc6de42bb302e25c1aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen //srand(1424142100); /* had a bug */
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen step = rand() % step_max + 1;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen ret = i < sizeof(input)-1 ?
2024157e8de36edd31f5fd72f5ea7364a0955fa7Timo Sirainen fts_tokenizer_next(tok, &input[i], step, &token) :
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen fts_tokenizer_next(tok, NULL, 0, &token);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen if (ret == 0) {
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen i += step;
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen step = rand() % step_max + 1;
b215a8a123623782554a83f3025ef4e771bd8f01Timo Sirainen step = I_MIN(step, sizeof(input) - i);
32b78da5dfbbf6a06b3dbdc9278c60b55714f9bcTimo Sirainen continue;
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen }
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen eopp++;
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen }
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen test_assert(*eopp == NULL);
9ed2951bd0bb1878a27437d7c00611b2baadd614Timo Sirainen fts_tokenizer_unref(&tok);
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen fts_tokenizer_unref(&gen_tok);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
b215a8a123623782554a83f3025ef4e771bd8f01Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
e9371f899a3d4207a0ffd3923ea5ec7250cf5e75Timo Sirainen test_end();
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen}
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainenstatic void test_fts_tokenizer_address_search(void)
5d60e31c7b701b606067a20bc88dcc8a6de7bbd6Timo Sirainen{
32b78da5dfbbf6a06b3dbdc9278c60b55714f9bcTimo Sirainen static const unsigned char input[] =
32b78da5dfbbf6a06b3dbdc9278c60b55714f9bcTimo Sirainen "@invalid invalid@ abc@example.com, "
e9371f899a3d4207a0ffd3923ea5ec7250cf5e75Timo Sirainen "Bar Baz <bar@example.org>, "
32b78da5dfbbf6a06b3dbdc9278c60b55714f9bcTimo Sirainen "foo@domain";
a64adf62fa33f2463a86f990217b0c9078531a40Timo Sirainen static const char *const expected_output[] = {
"invalid", "invalid", "abc@example.com", "Bar", "Baz",
"bar@example.org", "foo@domain", NULL
};
static const char *const settings[] = { "search", "" };
struct fts_tokenizer *tok, *gen_tok;
const char * const *eopp = expected_output;
const char *token, *error;
unsigned int i;
int ret;
test_begin("fts tokenizer search email address + parent, input one character at a time");
fts_tokenizers_init();
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
for (i = 0; i <= sizeof(input)-1; ) {
ret = i < sizeof(input)-1 ?
fts_tokenizer_next(tok, &input[i], 1, &token) :
fts_tokenizer_next(tok, NULL, 0, &token);
if (ret == 0) {
i++;
continue;
}
test_assert(*eopp != NULL);
test_assert(null_strcmp(token, *eopp) == 0);
eopp++;
}
test_assert(*eopp == NULL);
test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token) == 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token) == 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token) == 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0);
test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
fts_tokenizer_unref(&tok);
fts_tokenizer_unref(&gen_tok);
fts_tokenizers_deinit();
test_end();
}
/* Test program entry point: hands every tokenizer test case in this file to
   test_run() and returns its result as the exit status. */
int main(void)
{
	/* NULL-terminated list of test cases passed to test_run(). */
	static void (*tests[])(void) = {
		/* generic tokenizer */
		test_fts_tokenizer_generic_only,
		test_fts_tokenizer_generic_unicode_whitespace,
		test_fts_tokenizer_char_generic_only,
		/* generic tokenizer, tr29 variants */
		test_fts_tokenizer_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_unicode_whitespace,
		test_fts_tokenizer_char_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_midnumlet_end,
		/* email-address tokenizer on its own */
		test_fts_tokenizer_char_address_only,
		test_fts_tokenizer_line_address_only,
		test_fts_tokenizer_rand_address_only,
		/* email-address tokenizer with a parent tokenizer */
		test_fts_tokenizer_address_char,
		test_fts_tokenizer_address_line,
		test_fts_tokenizer_address_rand,
		test_fts_tokenizer_address_search,
		NULL
	};

	return test_run(tests);
}