service-monitor.c revision 17706107e6efc2f15973a7a63a834cb7c0a6dc68
5f5870385cff47efd2f58e7892f251cf13761528Timo Sirainen/* Copyright (c) 2005-2012 Dovecot authors, see the included COPYING file */
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainenstatic void service_monitor_start_extra_avail(struct service *service);
f6edc54aa72596af8da681c07223108c322712d5Timo Sirainenstatic void service_status_more(struct service_process *process,
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainenstatic void service_monitor_listen_start_force(struct service *service);
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainenstatic void service_process_kill_idle(struct service_process *process)
f6edc54aa72596af8da681c07223108c322712d5Timo Sirainen i_assert(process->available_count == service->client_limit);
c0e5c6a86e1de5d4f5591d39b4aa921a23c807d7Timo Sirainen if (service->process_avail <= service->set->process_min_avail) {
c0e5c6a86e1de5d4f5591d39b4aa921a23c807d7Timo Sirainen /* we don't have any extra idling processes anymore. */
f6edc54aa72596af8da681c07223108c322712d5Timo Sirainen } else if (process->last_kill_sent > process->last_status_update+1) {
f6edc54aa72596af8da681c07223108c322712d5Timo Sirainen service_error(service, "Process %s is ignoring idle SIGINT",
f6edc54aa72596af8da681c07223108c322712d5Timo Sirainen /* assume this process is busy */
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen if (kill(process->pid, SIGINT) < 0 && errno != ESRCH) {
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen service_error(service, "kill(%s, SIGINT) failed: %m",
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainenstatic void service_status_more(struct service_process *process,
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen process->available_count - status->available_count;
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* process used up all of its clients */
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen service->process_count == service->process_limit)
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* we may need to start more */
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainenstatic void service_status_less(struct service_process *process,
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* process can accept more clients again */
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen i_assert(service->process_avail <= service->process_count);
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen if (status->available_count == service->client_limit) {
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen if (service->process_avail > service->set->process_min_avail &&
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* we have more processes than we really need.
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen add a bit of randomness so that we don't send the
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen signal to all of them at once */
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainenservice_status_input_one(struct service *service,
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen process = hash_table_lookup(service_pids, &status->pid);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* we've probably wait()ed it away already. ignore */
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen if (process->uid != status->uid || process->service != service) {
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* a) Process was closed and another process was created with
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen the same PID, but we're still receiving status update from
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen the old process.
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen b) Some process is trying to corrupt our internal state by
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen trying to pretend to be someone else. We could use stronger
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen randomness here, but the worst they can do is DoS and there
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen are already more serious problems if someone is able to do
55bc6a7a0940ec48a68558ef70838991c5d301d2Timo Sirainen service_error(service, "Ignoring invalid update from child %s "
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen "(UID=%u)", dec2str(status->pid), status->uid);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* first status notification */
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen if (process->available_count == status->available_count)
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen if (process->available_count > status->available_count) {
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* process started servicing some more clients */
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* process finished servicing some clients */
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen process->available_count = status->available_count;
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainenstatic void service_status_input(struct service *service)
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen struct master_status status[1024/sizeof(struct master_status)];
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen unsigned int i, count;
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen ret = read(service->status_fd[0], &status, sizeof(status));
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen service_error(service, "read(status) failed: EOF");
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen service_error(service, "read(status) failed: %m");
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen if ((ret % sizeof(struct master_status)) != 0) {
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen service_error(service, "service sent partial status update "
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen for (i = 0; i < count; i++)
9cd232cda7563ad81c01776e5ebc5ed2b3cef898Timo Sirainen service_status_input_one(service, &status[i]);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainenstatic void service_monitor_throttle(struct service *service)
cb211cc64c1c6ac8343d60e7a058be42fdba2f71Timo Sirainen service_error(service, "command startup failed, throttling for %u secs",
cb211cc64c1c6ac8343d60e7a058be42fdba2f71Timo Sirainen service_throttle(service, service->throttle_secs);
cb211cc64c1c6ac8343d60e7a058be42fdba2f71Timo Sirainen if (service->throttle_secs > SERVICE_STARTUP_FAILURE_THROTTLE_MAX_SECS)
cb211cc64c1c6ac8343d60e7a058be42fdba2f71Timo Sirainen service->throttle_secs = SERVICE_STARTUP_FAILURE_THROTTLE_MAX_SECS;
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainenstatic void service_drop_timeout(struct service *service)
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen /* drop all pending connections */
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen while ((fd = net_accept((*lp)->fd, NULL, NULL)) > 0)
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainenstatic void service_monitor_listen_pending(struct service *service)
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen service->to_drop = timeout_add(SERVICE_DROP_TIMEOUT_MSECS,
f7f13e206c9839f6e868088034b0b59d1d9be13aTimo Sirainenstatic void service_drop_connections(struct service_listener *l)
3a79fdaf3253dae045dfa14d2a88b94086327da4Timo Sirainen unsigned int limit;
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen SERVICE_DROP_WARN_INTERVAL_SECS < ioloop_time) {
3a79fdaf3253dae045dfa14d2a88b94086327da4Timo Sirainen service->process_limit : service->client_limit;
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen "client connections are being dropped",
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen /* reached process limit, notify processes that they
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen need to start killing existing connections if they
6fdfa4d4cf14d1d7764d7faa8258f112e39c8dbeTimo Sirainen reach connection limit */
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen /* maybe this is a temporary peak, stop for a while and
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen see if it goes away */
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen /* this has been happening for a while now. just accept and
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen close the connection, so it's clear that this is happening
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen because of the limit, rather than because the service
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen processes aren't answering fast enough */
f7f13e206c9839f6e868088034b0b59d1d9be13aTimo Sirainenstatic void service_accept(struct service_listener *l)
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen if (service->process_count == service->process_limit) {
d8552f9f65e5ff64be5de9faf9a8171799a0bbecTimo Sirainen /* we've reached our limits, new clients will have to
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen wait until there are more processes available */
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* create a child process and let it accept() this connection */
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainenstatic void service_monitor_start_extra_avail(struct service *service)
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen unsigned int i, count;
2806f15ceb68023baf65a9daad9dfdf54c622708Timo Sirainen if (service->process_avail >= service->set->process_min_avail ||
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen count = service->set->process_min_avail - service->process_avail;
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen if (service->process_count + count > service->process_limit)
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen count = service->process_limit - service->process_count;
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen for (i = 0; i < count; i++) {
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen if (service_process_create(service) == NULL) {
cdc8485491045d82bb98405d4b995f277d12838eTimo Sirainen /* we created some processes, they'll do the listening now */
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainenstatic void service_monitor_listen_start_force(struct service *service)
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen array_foreach(&service->listeners, listeners) {
f7f13e206c9839f6e868088034b0b59d1d9be13aTimo Sirainen l->io = io_add(l->fd, IO_READ, service_accept, l);
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainenvoid service_monitor_listen_start(struct service *service)
acef354e742a39416b0697e1554f5d49b0369850Timo Sirainen (service->process_count == service->process_limit &&
b2ed2b25c4c457ec1c99ebe5e9bd66a2e2f89cfdTimo Sirainenvoid service_monitor_listen_stop(struct service *service)
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen array_foreach(&service->listeners, listeners) {
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainenstatic int service_login_create_notify_fd(struct service *service)
3005627bf2ed223194c2d08a8c1630769d048f69Timo Sirainen str_append(prefix, service->set->master_set->base_dir);
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen fd = safe_mkstemp(prefix, 0600, (uid_t)-1, (gid_t)-1);
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen service_error(service, "safe_mkstemp(%s) failed: %m",
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen service_error(service, "unlink(%s) failed: %m", path);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainenvoid services_monitor_start(struct service_list *service_list)
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen if (pipe(service_list->master_dead_pipe_fd) < 0)
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen fd_close_on_exec(service_list->master_dead_pipe_fd[0], TRUE);
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen fd_close_on_exec(service_list->master_dead_pipe_fd[1], TRUE);
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen array_foreach(&service_list->services, services) {
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen if (service_login_create_notify_fd(service) < 0)
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* we haven't yet created status pipe */
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen net_set_nonblock(service->status_fd[0], TRUE);
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen fd_close_on_exec(service->status_fd[0], TRUE);
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen net_set_nonblock(service->status_fd[1], TRUE);
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen fd_close_on_exec(service->status_fd[1], TRUE);
17706107e6efc2f15973a7a63a834cb7c0a6dc68Timo Sirainen if (service_process_create(service_list->log) != NULL)
17706107e6efc2f15973a7a63a834cb7c0a6dc68Timo Sirainen service_monitor_listen_stop(service_list->log);
dc07b75b7ea83ff5f447970a20419032725271a7Timo Sirainen /* start up a process for startup-services */
dc07b75b7ea83ff5f447970a20419032725271a7Timo Sirainen array_foreach(&service_list->services, services) {
b2ed2b25c4c457ec1c99ebe5e9bd66a2e2f89cfdTimo Sirainenvoid service_monitor_stop(struct service *service)
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen for (i = 0; i < 2; i++) {
55bc6a7a0940ec48a68558ef70838991c5d301d2Timo Sirainen "close(status fd) failed: %m");
cf0ad1a0bddb0787f3d7b408a96d721a8b2a98a3Timo Sirainen "close(login notify fd) failed: %m");
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainenstatic void services_monitor_wait(struct service_list *service_list)
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainen time_t max_wait_time = time(NULL) + MAX_DIE_WAIT_SECS;
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainen array_foreach(&service_list->services, servicep) {
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainenvoid services_monitor_stop(struct service_list *service_list, bool wait)
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen if (service_list->master_dead_pipe_fd[0] != -1) {
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen if (close(service_list->master_dead_pipe_fd[0]) < 0)
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen i_error("close(master dead pipe) failed: %m");
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen if (close(service_list->master_dead_pipe_fd[1]) < 0)
29f32cdcf44cda9688576bfdc7450a8a15e90e86Timo Sirainen i_error("close(master dead pipe) failed: %m");
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainen /* we've notified all children that the master is dead.
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainen now wait for the children to either die or to tell that
1c7b0cbdb08cccbd25c19ae0fb69abe8ed9ee9b4Timo Sirainen they're no longer listening for new connections */
7bd72e4deca3cbf757dd1ea298486d9f3bc24226Timo Sirainen array_foreach(&service_list->services, services)
2b2e5f7a24c24d971351877ad4c5150662856bfbTimo Sirainenservice_process_failure(struct service_process *process, int status)
6c2ce1d5bf17b21e804a079eb0f973b7ab83e0d8Timo Sirainen service_process_log_status_error(process, status);
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen if (!throttle && !service->have_successful_exits) {
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen /* this service has seen no successful exits yet.
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen try to avoid failure storms by throttling the service if it
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen only keeps failing rapidly. this is no longer done after
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen one success to avoid intentional DoSing, in case attacker
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen finds a way to quickly crash his own session. */
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen if (service->exit_failure_last != ioloop_time) {
321f17803ad71171ad2408399b6cc8efd2d1479aTimo Sirainen if (++service->exit_failures_in_sec > SERVICE_MAX_EXIT_FAILURES_IN_SEC)
4f4943f6ef1bc45c23de73eebe83779712b3c8cbTimo Sirainen service_process_notify_add(service_anvil_global->kills, process);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
bad5fa318c6c1384ab83bd72d53ce06593274c18Timo Sirainen process = hash_table_lookup(service_pids, &pid);
d176f84ce5ca2073f4dfbafb457b9c74f6bf0d76Timo Sirainen /* success */
870bcf0d0c07f7d915f1f571f38968426ba575a1Timo Sirainen /* one success resets all failures */
2b2e5f7a24c24d971351877ad4c5150662856bfbTimo Sirainen throttle = service_process_failure(process, status);
57dc3cb5d5e315272353abf55f702eefc084db26Timo Sirainen /* if we're reloading, we may get here with a service list
57dc3cb5d5e315272353abf55f702eefc084db26Timo Sirainen that's going to be destroyed after this process is
57dc3cb5d5e315272353abf55f702eefc084db26Timo Sirainen destroyed. keep the list referenced until we're done. */