test-fts-tokenizer.c revision c865b0e9c65fd77f7b2ab6f8616d3def5501ecb3
2e37d45867d081db150ab78dad303b9077aea24fTimo Sirainen/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
46552a931924c2d743f045e95b08c3ce6beda91aTimo Sirainen#include "lib.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "sha2.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "hex-binary.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "test-common.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "fts-tokenizer.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include "fts-tokenizer-private.h"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen#include <stdlib.h>
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen static const unsigned char input[] =
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainen "hello world\r\nAnd there\twas: text "
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen "galore, and more.\n\n (\"Hello world\")last ";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", "And",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "there", "was", "text", "galore",
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen "and", "more", "Hello", "world", "last", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen const struct fts_tokenizer *tok_class;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
2e78f05b11df23ec2731afaf8f19d5b5240cb29fTimo Sirainen const char *token, *error;
2e78f05b11df23ec2731afaf8f19d5b5240cb29fTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic simple");
d1e7425048c61d71f41f737ba947687198842dc2Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen }
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen fts_tokenizer_unref(&tok);
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen test_end();
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_unicode_whitespace(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen U+205A(e2 81 9a) and U+205F(e2 81 9f )*/
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", "And",
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen "there", "was", "text", "galore",
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen "and", "more", NULL
b9c76fe9d9ca194816606342da1ddbd9be6bc8abTimo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const struct fts_tokenizer *tok_class;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic simple with Unicode whitespace");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_char_generic_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc", "example", "com", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example", "org", "foo", "domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen test_begin("fts tokenizer generic simple input one character at a time");
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen token = i < sizeof(input)-1 ?
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen fts_tokenizer_next(tok, &input[i], 1) :
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen if (token == NULL) {
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen i++;
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen eopp++;
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen }
1bf5c6c20f3d51f13d3240cfb46e471074c86276Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e5acc283bf030b0b5c79ca4e52d315c516a299faPascal Volk}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
/* NULL-terminated settings array ("algorithm", "tr29") that the TR29 tests
   in this file pass to fts_tokenizer_create(). */
const char *const tr29_settings[] = {
	"algorithm", "tr29",
	NULL
};
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\r\n\nAnd there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore, and more.\n\n (\"Hello world\")3.14 3,14 last 1.";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", "And",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "there", "was", "text", "galore",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "and", "more", "Hello", "world", "3.14",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "3,14", "last", "1", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const struct fts_tokenizer *tok_class;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic TR29");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen eopp++;
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen }
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
b8afdaa1bffe2f27cd4b02bf3bfbd2d297c8e648Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen/* TODO: U+206F is in "Format" and therefore currently not word break.
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen This definitely needs to be remapped. */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_unicode_whitespace(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* with Unicode(utf8) U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen 81 9f)*/
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\r\nAnd\xE2\x80\x80there\twas: text "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", "And",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "there", "was", "text", "galore",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "and", "more", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const struct fts_tokenizer *tok_class;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer generic TR29 with Unicode whitespace");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen /* u+FF0E is EF BC 8E */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello world\xEF\xBC\x8E";
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "hello", "world", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen const struct fts_tokenizer *tok_class;
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen struct fts_tokenizer *tok;
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen const char * const *eopp = expected_output;
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen const char *token, *error;
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen while ((token = fts_tokenizer_next(tok, NULL, 0)) != NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_char_generic_tr29_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc", "example.com", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example.org", "foo", "domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen test_begin("fts tokenizer generic TR29 input one character at a time");
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
75e46142d8fbac811df8f2ca58d9a2f48a75d65fTimo Sirainen token = i < sizeof(input)-1 ?
77f1da4b5e2b800197d8db548235497d5e9d6a4fTimo Sirainen fts_tokenizer_next(tok, &input[i], 1) :
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen if (token == NULL) {
18f1bbf05980d3c53ecae81b62574212f0891522Timo Sirainen i++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_line_address_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen static const char *const input[] = {
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "abc@example.com",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen " Bar Baz <bar@example.org>",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen " moro foo@domain Bar Baz <bar@example.org>"
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com", "bar@example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain", "foo@domain", "bar@example.org", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen const char *token, *error;
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen unsigned int i;
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen test_begin("fts tokenizer email address only, input one line at a time");
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
04b7dc631f33bf61f273138c679da9bd0910fb6dTimo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= N_ELEMENTS(input);) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen token = i < N_ELEMENTS(input) ?
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, (unsigned char *)input[i], strlen(input[i])) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen if (token == NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen i++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen test_end();
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen}
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainenstatic void test_fts_tokenizer_char_address_only(void)
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen{
6303191abcb37164f435ccdc56e9dbddf1288851Timo Sirainen static const unsigned char input[] =
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "abc@example.com, "
2598b2f36365b52d9754b9348a5be29569293e46Timo Sirainen "Bar Baz <bar@example.org>, "
2ef0e8ee48c9683f7bd6698798efa3328e4322d1Timo Sirainen "foo@domain";
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
53dfcefa9440a49d703e49193819a79be99c9ba6Timo Sirainen "abc@example.com", "bar@example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address only, input one character at a time");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen token = i < sizeof(input)-1 ?
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, &input[i], 1) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen if (token == NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen i++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_rand_address_only(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const unsigned char input[] =
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen "Abc Dfg <abc.dfg@example.com>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Foo Bar (comment)foo.bar@host.example.org foo ";
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen static const char *const expected_output[] = {
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen "abc.dfg@example.com",
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen "foo.bar@host.example.org",
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen NULL
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen };
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen struct fts_tokenizer *tok;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen const char * const *eopp = expected_output;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen const char *token, *error;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen const char *const settings[] = {"have_parent", "0", NULL};
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen unsigned int i, step, step_max = 10;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_begin("fts tokenizer email address, input random length");
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL,
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen settings, &tok, &error) == 0);
7fa573e6ea36024f618492e7d3649a69c1b41028Timo Sirainen step = rand() % step_max + 1;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen token = i < sizeof(input)-1 ?
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_next(tok, &input[i], step) :
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_next(tok, NULL, 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen if (token == NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen i += step;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen step = rand() % step_max + 1;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen step = I_MIN(step, sizeof(input) - i);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen continue;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen eopp++;
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen }
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(*eopp == NULL);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_unref(&tok);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_end();
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen}
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainenstatic void test_fts_tokenizer_address_char(void)
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen{
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen static const unsigned char input[] =
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, "
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain";
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen static const char *const expected_output[] = {
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "bar", "example", "org", "bar@example.org",
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen "foo", "domain", "foo@domain", NULL
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen };
8bb360f9e5de1c25e4f875205bb06e8bf15dae14Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen const char * const *eopp = expected_output;
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen const char *token, *error;
6998ca95b4947c90647ac5d4794ebd6311acada2Timo Sirainen unsigned int i;
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_begin("fts tokenizer email address + parent, input one character at a time");
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen token = i < sizeof(input)-1 ?
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen fts_tokenizer_next(tok, &input[i], 1) :
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen if (token == NULL) {
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen i++;
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen continue;
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen }
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen test_assert(*eopp != NULL);
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen eopp++;
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen }
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen test_assert(*eopp == NULL);
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen fts_tokenizer_unref(&tok);
4c6ddf2491104f917d00e6900e833e80ea02c7b6Timo Sirainen fts_tokenizer_unref(&gen_tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainenstatic void test_fts_tokenizer_address_line(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const input[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc@example.com, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "Bar Baz <bar@example.org>, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain, ",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo@domain Bar Baz <bar@example.org>, "
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static const char *const expected_output[] = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example", "org", "bar@example.org",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo", "domain", "foo@domain",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "foo", "domain", "foo@domain", "Bar", "Baz",
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen "bar", "example", "org", "bar@example.org", NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen struct fts_tokenizer *tok, *gen_tok;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen unsigned int i;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address + parent, input one line at a time");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= N_ELEMENTS(input);) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen token = i < N_ELEMENTS(input) ?
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, (unsigned char *)input[i], strlen(input[i])) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen if (token == NULL) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen i++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen eopp++;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(*eopp == NULL);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unref(&tok);
660b99a7059824676b2b8d6f79b8e15d47df25a2Timo Sirainen fts_tokenizer_unref(&gen_tok);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen test_end();
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainenstatic void test_fts_tokenizer_address_rand(void)
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen{
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen static const unsigned char input[] =
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "abc@example.com, "
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "Bar Baz <bar@example.org>, "
cb2c44f33d9d48f58e4c5e42ba2526a0c100218aTimo Sirainen "foo@domain";
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen static const char *const expected_output[] = {
0348172a5278d1f5aa2440f30346c390ddc17318Timo Sirainen "abc", "example", "com", "abc@example.com", "Bar", "Baz",
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen "bar", "example", "org", "bar@example.org",
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen "foo", "domain", "foo@domain", NULL
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen };
7a23d586f07ec376e28e8f6f3f3392a4ac8b83bbTimo Sirainen struct fts_tokenizer *tok, *gen_tok;
3b22894b8805b186c73d8b754001e8d7e944be85Timo Sirainen const char * const *eopp = expected_output;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen const char *token, *error;
46ec792dd4ccf6c34706c4774228301fafde6aa9Timo Sirainen unsigned int i, step, step_max = 10;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_begin("fts tokenizer email address + parent, input random length");
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_generic);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_register(fts_tokenizer_email_address);
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen
decb23442f9e6cd5c4845a9cb162029b8c6d5f0fTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
a75907609d7c410c9e17beedfafbf28b4439fa8aTimo Sirainen test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
6998ca95b4947c90647ac5d4794ebd6311acada2Timo Sirainen //srand(1424142100); /* had a bug */
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen step = rand() % step_max + 1;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen for (i = 0; i <= sizeof(input)-1; ) {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen token = i < sizeof(input)-1 ?
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, &input[i], step) :
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen fts_tokenizer_next(tok, NULL, 0);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen if (token == NULL) {
ec23e16ed879e289d12c6e1a5f9745dd3979004aTimo Sirainen i += step;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen step = rand() % step_max + 1;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen step = I_MIN(step, sizeof(input) - i);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen continue;
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen }
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_assert(null_strcmp(token, *eopp) == 0);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen eopp++;
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen }
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen test_assert(*eopp == NULL);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unref(&tok);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unref(&gen_tok);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_generic);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen fts_tokenizer_unregister(fts_tokenizer_email_address);
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen test_end();
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen}
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainenint main(void)
d3a7d023b47d2a137f01109e7b38702dca3f11d3Timo Sirainen{
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen static void (*test_functions[])(void) = {
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_only,
5a250816ffc4cc5db203f9410ea99b6601c7b91aTimo Sirainen test_fts_tokenizer_generic_unicode_whitespace,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_char_generic_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_tr29_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_tr29_unicode_whitespace,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_char_generic_tr29_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_generic_tr29_midnumlet_end,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_char_address_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_line_address_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_rand_address_only,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_address_char,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_address_line,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen test_fts_tokenizer_address_rand,
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen NULL
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen };
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen return test_run(test_functions);
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen}
e248fe370c4047cee921a91b48edc37944ab0526Timo Sirainen