/* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */
#include "lib.h"
#include "istream.h"
#include "buffer.h"
#include "str.h"
#include "rfc822-parser.h"
#include "message-address.h"
#include "message-parser.h"
#include "message-decoder.h"
#include "mail-storage.h"
#include "index-mail.h"
#include "fts-parser.h"
#include "fts-user.h"
#include "fts-language.h"
#include "fts-tokenizer.h"
#include "fts-filter.h"
#include "fts-api-private.h"
#include "fts-build-mail.h"
/* Approximate test for a word-separating byte: true for a space, a tab
   or a newline. Other whitespace characters exist, but this check does
   not have to be exact. The argument may be evaluated up to three times,
   so it should not have side effects. */
#define IS_WORD_WHITESPACE(c) \
	((c) == ' ' || \
	 (c) == '\t' || \
	 (c) == '\n')
/* if we see a word larger than this, just go ahead and split it from
wherever */
/* Presumably the per-mail state shared by the build helpers in this file.
   NOTE(review): no members are visible in this view, so the fields are
   not documented here - confirm against the full definition. */
struct fts_mail_build_context {
};
/* NOTE(review): only the tail of this function's parameter list is
   visible; the last parameter is a const message header line (hdr). */
const struct message_header_line *hdr)
{
/* NOTE(review): as visible here this return is unconditional, which
   leaves the T_BEGIN/T_END block below unreachable - confirm whether a
   guarding condition belongs in front of it. */
return;
T_BEGIN {
} T_END;
}
/* File-local helper that returns nothing; its last parameter is a const
   message header line (hdr).
   NOTE(review): the function name and any leading parameters are not
   visible here, and the body contains no statements in this view. */
static void
const struct message_header_line *hdr)
{
/* Just pass it as-is to the backend. */
}
/* NOTE(review): only the last parameter (a const message block named
   raw_block) and an empty body are visible here; the purpose of this
   function cannot be stated from this view. */
const struct message_block *raw_block)
{
}
/* File-local helper returning an int status. Its last parameter is a
   const message header line, and the loop below visits
   hdr->full_value_len bytes looking for NUL bytes.
   NOTE(review): the function name, leading parameters and the
   declarations of data and buf are not visible here. */
static int
const struct message_header_line *hdr)
{
unsigned int i;
int ret;
/* @UNSAFE: if there are any NULs, replace them with spaces */
for (i = 0; i < hdr->full_value_len; i++) {
if (data[i] == '\0') {
}
/* NOTE(review): as written this assignment sits outside the NUL check
   above - confirm the intended nesting. */
buf[i] = ' ';
}
}
/* NOTE(review): no assignment to ret is visible before this return. */
return ret;
}
/* Scans size bytes of data and reports whether any byte has its high bit
   (0x80) set. Returns TRUE at the first such byte and FALSE when none is
   found, which includes the case where size is 0.
   NOTE(review): the function header is not visible here. */
{
size_t i;
for (i = 0; i < size; i++) {
if ((data[i] & 0x80) != 0)
return TRUE;
}
return FALSE;
}
/* NOTE(review): only the last parameter (a user language, user_lang) is
   visible. The comment in the body describes resetting the tokenizer,
   but the statement doing so is not visible in this view. */
struct fts_user_language *user_lang)
{
/* reset tokenizer between fields - just to be sure no state
leaks between fields (especially if previous indexing had
failed) */
}
/* File-local helper that returns nothing; its last parameter is a const
   message header line (hdr). Per the comment in the body, it deals with
   deciding whether a header is treated as human language.
   NOTE(review): the function name, leading parameters and the condition
   preceding the else branch are not visible here. */
static void
const struct message_header_line *hdr)
{
/* Headers that don't contain any human language will only be
translated to lowercase - no stemming or other filtering. There's
unfortunately no perfect way of detecting which headers contain
human languages, so we have a list of some hardcoded header names
and we'll also assume that if there's any 8bit content it's a human
language. */
else {
}
}
/* Handles one header from a message block. The comments below
   distinguish regular unstructured headers from message address headers,
   which are normalized, and mention indexing the header name itself when
   the backend takes tokenized input. Returns ret; the visible early
   returns yield 0 and one visible path sets ret to -1.
   NOTE(review): the function name, leading parameters and most
   conditions are not visible here. */
const struct message_block *block)
{
int ret;
return 0;
/* hdr->full_value is always set because we get the block from
message_decoder */
return 0;
/* regular unstructured header */
} else T_BEGIN {
/* message address. normalize it to give better
search results. */
} T_END;
FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
/* index the header name itself using data-language. */
ret = -1;
}
}
return ret;
}
/* File-local helper returning a bool and writing *binary_body_r. The
   output flag is initialized to FALSE and set to TRUE on two of the
   visible paths. Per the comments, multiparts are never indexed
   themselves and the visible return for that case is FALSE; the final
   visible return is TRUE.
   NOTE(review): the parameter list and the conditions selecting each
   path are not visible here. */
static bool
{
*binary_body_r = FALSE;
/* multiparts are never indexed, only their contents */
return FALSE;
}
/* extract text using the returned parser */
*binary_body_r = TRUE;
/* text body parts */
} else {
/* possibly binary */
return FALSE;
*binary_body_r = TRUE;
}
return FALSE;
}
return TRUE;
}
/* File-local helper returning an int status (ret). The visible code
   branches on the sign of ret2, passes token onward cast to a const void
   pointer, and sets ret to -1 on one path.
   NOTE(review): the parameter list, the loop and the calls producing
   ret2 are not visible here. */
static int
{
if (ret2 < 0)
if (ret2 > 0) {
(const void *)token,
ret = -1;
}
} T_END;
return ret;
}
/* File-local helper whose last parameter is an output language pointer
   (lang_r). Return values visible below:
     1  - a language was settled on (either the default language or an
          FTS_LANGUAGE_RESULT_OK detection result),
     0  - presumably "not decided yet", following the "save the input so
          far and try again later" comment,
    -1  - the language detection library failed; nothing is indexed.
   NOTE(review): the function name, leading parameters, the switch
   statement and most case labels are not visible here. */
static int
const struct fts_language **lang_r)
{
/* save the input so far and try again later */
if (last) {
/* we've run out of data. use the default language. */
return 1;
}
return 0;
/* use the default language */
return 1;
case FTS_LANGUAGE_RESULT_OK:
return 1;
/* internal language detection library failure
(e.g. invalid config). don't index anything. */
return -1;
/* any other result value is treated as impossible */
default:
i_unreached();
}
}
/* File-local helper returning an int. The comments indicate it deals
   with the case where a language is already known and the case where
   more data is needed (visible return 0); the other visible returns are
   -1, including one inside the if (last) branch.
   NOTE(review): the parameter list and the conditions and calls around
   each return are not visible here. */
static int
{
int ret;
/* we already have a language */
return -1;
} else if (ret == 0) {
/* wait for more data */
return 0;
} else {
return -1;
}
}
return -1;
if (last) {
return -1;
}
return 0;
}
/* File-local helper returning an int. Per the comments, only full words
   are sent to the backend. The first loop advances past the bytes up to
   the next IS_WORD_WHITESPACE byte (continuing a previous word). The
   later section picks the boundary of the last word, taking the whole
   input (i = size) when last is set.
   NOTE(review): the parameter list, the buffering of partial words and
   the backend calls are not visible here. */
static int
{
size_t i;
/* we'll need to send only full words to the backend */
/* continuing previous word */
for (i = 0; i < size; i++) {
if (IS_WORD_WHITESPACE(data[i]))
break;
}
/* skip the i bytes consumed above */
data += i;
size -= i;
/* word is still not finished */
return 0;
}
/* we have a full word, index it */
return -1;
}
/* find the boundary for last word */
if (last)
i = size;
else {
/* NOTE(review): the condition guarding this break is not visible. */
for (i = size; i > 0; i--) {
break;
}
}
return -1;
if (i < size) {
}
}
return 0;
}
/* Chooses between code paths based on the backend flags
   FTS_BACKEND_FLAG_TOKENIZED_INPUT and FTS_BACKEND_FLAG_BUILD_FULL_WORDS,
   with a final else fallback.
   NOTE(review): the function header and the statements inside each
   branch are not visible here. */
{
FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
FTS_BACKEND_FLAG_BUILD_FULL_WORDS) != 0) {
} else {
}
}
/* NOTE(review): only an empty body is visible here; the function header
   and its statements are not shown, so its purpose cannot be stated from
   this view. */
{
}
/* Finishes a parsing step and reports whether a retry may be worthwhile.
   Visible outcomes:
     - returns -1 immediately if ret is already negative (indexing has
       failed, so no retry is wanted),
     - sets *may_need_retry_r to TRUE and returns -1 when deinit_ret is 0,
     - otherwise returns -1 if deinit_ret is negative and 0 if not.
   NOTE(review): the function name, leading parameters, the loop
   condition and the assignments to deinit_ret and *retriable_err_msg_r
   are not visible here. */
const char **retriable_err_msg_r,
bool *may_need_retry_r)
{
const char *retriable_error;
int ret = 0;
int deinit_ret;
do {
ret = -1;
break;
}
if (ret < 0) {
/* indexing already failed - we don't want to retry
in any case */
return -1;
}
if (deinit_ret == 0) {
/* retry the parsing */
*may_need_retry_r = TRUE;
return -1;
}
return deinit_ret < 0 ? -1 : 0;
}
/* File-local helper returning an int; its last two parameters are the
   retriable error message and may-need-retry outputs. The visible code
   logs an error and returns -1 when the mail stream cannot be read, then
   runs an endless loop over parsed blocks. In that loop a negative ret
   becomes 0 when input->stream_errno is 0 and is otherwise logged as a
   read failure; other visible failure paths set ret to -1 and break.
   Per the comments, headers are always handled, multipart bodies are
   skipped up to the next part's headers, and a body is added even when
   the mail has none.
   NOTE(review): the function name, leading parameters, most
   declarations, conditions and calls are not visible here. */
static int
const char **retriable_err_msg_r,
bool *may_need_retry_r)
{
bool binary_body;
const char *error;
int ret;
return 0;
i_error("Failed to read mailbox %s mail UID=%u stream: %s",
return -1;
}
0);
for (;;) {
if (ret < 0) {
if (input->stream_errno == 0)
ret = 0;
else {
i_error("read(%s) failed: %s",
}
break;
}
/* body part changed. we're now parsing the end of
boundary, possibly followed by message epilogue */
may_need_retry_r) < 0) {
ret = -1;
break;
}
}
/* multipart. skip until beginning of next
part's headers */
}
}
/* always handle headers */
/* end of headers */
&binary_body);
if (binary_body)
} else {
if (skip_body)
continue;
}
&block))
continue;
ret = -1;
break;
}
/* end of headers */
} else {
ret = -1;
break;
}
body_added = TRUE;
}
}
if (ret == 0)
else
}
/* make sure body is added even when it doesn't exist */
}
}
/* Wrapper that runs its work inside a T_BEGIN/T_END block and returns
   ret. The visible code counts down attempts and breaks out once the
   count reaches zero; per the comment, that final failure is logged as
   info rather than as an error.
   NOTE(review): the function header, the declaration of attempts, the
   loop construct and the logging call are not visible here. */
{
int ret;
/* Number of attempts to be taken if retry is needed */
const char *retriable_err_msg;
bool may_need_retry;
T_BEGIN {
&may_need_retry)) < 0 &&
if (--attempts == 0) {
/* Log this as info instead of as error,
because e.g. Tika doesn't differentiate
between temporary errors and invalid
document input. */
break;
}
}
} T_END;
return ret;
}