squat-test.c revision 97e62b2b36dda0acb3215667042f5c80cdee8155
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher/* Copyright (c) 2006-2011 Dovecot authors, see the included COPYING file */
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include "lib.h"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include "array.h"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include "file-lock.h"
2cb6f28b3a12bb714bf14494d31eb6b6fff64b8bJakub Hrozek#include "istream.h"
2cb6f28b3a12bb714bf14494d31eb6b6fff64b8bJakub Hrozek#include "time-util.h"
2cb6f28b3a12bb714bf14494d31eb6b6fff64b8bJakub Hrozek#include "unichar.h"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include "squat-trie.h"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include "squat-uidlist.h"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include <stdio.h>
a9228ebcce14888b3123bdf46e610e0900bcd2ccJakub Hrozek#include <unistd.h>
a9228ebcce14888b3123bdf46e610e0900bcd2ccJakub Hrozek#include <fcntl.h>
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek#include <time.h>
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher#include <sys/time.h>
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagherstatic void result_print(ARRAY_TYPE(seq_range) *result)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher{
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher const struct seq_range *range;
7a14e8f66c0e932fe2954d792614a3b61d444bd1Jakub Hrozek unsigned int i, count;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher range = array_get(result, &count);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher for (i = 0; i < count; i++) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (i != 0)
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek printf(",");
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek printf("%u", range[i].seq1);
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek if (range[i].seq1 != range[i].seq2)
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek printf("-%u", range[i].seq2);
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek }
2ea6196484055397cc4bc011c5960f790431fa9dStephen Gallagher printf("\n");
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher}
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagherint main(int argc ATTR_UNUSED, char *argv[])
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher{
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher const char *trie_path = "/tmp/squat-test-index.search";
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher const char *uidlist_path = "/tmp/squat-test-index.search.uids";
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher struct squat_trie *trie;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher struct squat_trie_build_context *build_ctx;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher struct istream *input;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher struct stat trie_st, uidlist_st;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ARRAY_TYPE(seq_range) definite_uids, maybe_uids;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher char *line, *str, buf[4096];
2ea6196484055397cc4bc011c5960f790431fa9dStephen Gallagher buffer_t *valid;
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek int ret, fd;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher unsigned int len, last = 0, seq = 1, node_count, uidlist_count;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher enum squat_index_type index_type;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher bool data_header = TRUE, first = TRUE, skip_body = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher bool mime_header = TRUE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher size_t trie_mem, uidlist_mem;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher clock_t clock_start, clock_end;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher struct timeval tv_start, tv_end;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher double cputime;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher lib_init();
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (void)unlink(trie_path);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (void)unlink(uidlist_path);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher trie = squat_trie_init(trie_path, time(NULL),
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher FILE_LOCK_METHOD_FCNTL, FALSE, 0600, (gid_t)-1);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher clock_start = clock();
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher gettimeofday(&tv_start, NULL);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fd = open(argv[1], O_RDONLY);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (fd == -1)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher return 1;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek if (squat_trie_build_init(trie, &build_ctx) < 0)
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek return 1;
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek valid = buffer_create_dynamic(default_pool, 4096);
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek input = i_stream_create_fd(fd, (size_t)-1, FALSE);
2ea6196484055397cc4bc011c5960f790431fa9dStephen Gallagher ret = 0;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher while (ret == 0 && (line = i_stream_read_next_line(input)) != NULL) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (last != input->v_offset/(1024*100)) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, "\r%ukB", (unsigned)(input->v_offset/1024));
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fflush(stderr);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher last = input->v_offset/(1024*100);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (strncmp(line, "From ", 5) == 0) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (!first)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher seq++;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher data_header = TRUE;
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek skip_body = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher mime_header = TRUE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher continue;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher first = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek if (strncmp(line, "--", 2) == 0) {
ea929f1b022fc2cb77dec89b0e12accef983ec85Jakub Hrozek skip_body = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher mime_header = TRUE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (mime_header) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (*line == '\0') {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher data_header = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher mime_header = FALSE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher continue;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (strncasecmp(line, "Content-Type:", 13) == 0 &&
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher strncasecmp(line, "Content-Type: text/", 19) != 0 &&
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher strncasecmp(line, "Content-Type: message/", 22) != 0)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher skip_body = TRUE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher else if (strncasecmp(line, "Content-Transfer-Encoding: base64", 33) == 0)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher skip_body = TRUE;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher } else if (skip_body)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher continue;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (*line == '\0')
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher continue;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher /* we're actually indexing here headers as bodies and bodies
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher as headers. it doesn't really matter in this test, and
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fixing it would require storing headers temporarily
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher elsewhere and index them only after the body */
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher index_type = !data_header ? SQUAT_INDEX_TYPE_HEADER :
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher SQUAT_INDEX_TYPE_BODY;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher buffer_set_used_size(valid, 0);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher len = strlen(line);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (uni_utf8_get_valid_data((const unsigned char *)line,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher len, valid)) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ret = squat_trie_build_more(build_ctx, seq, index_type,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (const void *)line, len);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher } else if (valid->used > 0) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ret = squat_trie_build_more(build_ctx, seq, index_type,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher valid->data, valid->used);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher buffer_free(&valid);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (squat_trie_build_deinit(&build_ctx, NULL) < 0)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ret = -1;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (ret < 0) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher printf("build broken\n");
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher return 1;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher clock_end = clock();
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek gettimeofday(&tv_end, NULL);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher cputime = (double)(clock_end - clock_start) / CLOCKS_PER_SEC;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, "\n - Index time: %.2f CPU seconds, "
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher "%.2f real seconds (%.02fMB/CPUs)\n", cputime,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher timeval_diff_msecs(&tv_end, &tv_start)/1000.0,
65a9065538fd85e6ead925d344e6b421900eb8c2Jakub Hrozek input->v_offset / cputime / (1024*1024));
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (stat(trie_path, &trie_st) < 0)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher i_error("stat(%s) failed: %m", trie_path);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher if (stat(uidlist_path, &uidlist_st) < 0)
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher i_error("stat(%s) failed: %m", uidlist_path);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher trie_mem = squat_trie_mem_used(trie, &node_count);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek uidlist_mem = squat_uidlist_mem_used(squat_trie_get_uidlist(trie),
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher &uidlist_count);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, " - memory: %uk for trie, %uk for uidlist\n",
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (unsigned)(trie_mem/1024), (unsigned)(uidlist_mem/1024));
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u nodes (%.02f%%)\n",
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher trie_st.st_size, node_count,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher trie_st.st_size / (float)input->v_offset * 100.0);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u UID lists (%.02f%%)\n",
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher uidlist_st.st_size, uidlist_count,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher uidlist_st.st_size / (float)input->v_offset * 100.0);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher fprintf(stderr, " - %"PRIuUOFF_T" bytes total of %"
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher PRIuUOFF_T" (%.02f%%)\n",
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (trie_st.st_size + uidlist_st.st_size), input->v_offset,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (trie_st.st_size + uidlist_st.st_size) /
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher (float)input->v_offset * 100.0);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher i_stream_unref(&input);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek close(fd);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher i_array_init(&definite_uids, 128);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher i_array_init(&maybe_uids, 128);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher while ((str = fgets(buf, sizeof(buf), stdin)) != NULL) {
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ret = strlen(str)-1;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher str[ret] = 0;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher gettimeofday(&tv_start, NULL);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher ret = squat_trie_lookup(trie, str, SQUAT_INDEX_TYPE_HEADER |
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher SQUAT_INDEX_TYPE_BODY,
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher &definite_uids, &maybe_uids);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek if (ret < 0)
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek printf("error\n");
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek else {
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek gettimeofday(&tv_end, NULL);
6463ed1dcdd45416468b3fa178bd856b5a9ed2c3Jakub Hrozek printf(" - Search took %.05f CPU seconds\n",
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher timeval_diff_usecs(&tv_end, &tv_start)/1000000.0);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher printf(" - definite uids: ");
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher result_print(&definite_uids);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher printf(" - maybe uids: ");
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher result_print(&maybe_uids);
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher }
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher return 0;
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher}
52261fe16203dec6e6f69177c6d0a810b47d073fStephen Gallagher