sd-event.c revision b7484e2a58038c57591457c1439505607bdcd833
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
This file is part of systemd.
Copyright 2013 Lennart Poettering
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include "macro.h"
#include "prioq.h"
#include "hashmap.h"
#include "util.h"
#include "time-util.h"
#include "sd-id128.h"
#include "sd-event.h"
#define EPOLL_QUEUE_MAX 64
typedef enum EventSourceType {
struct sd_event_source {
unsigned n_ref;
void *userdata;
int enabled:3;
bool pending:1;
int priority;
unsigned pending_index;
unsigned prepare_index;
unsigned pending_iteration;
unsigned prepare_iteration;
union {
struct {
int fd;
bool registered:1;
} io;
struct {
unsigned earliest_index;
unsigned latest_index;
} time;
struct {
struct signalfd_siginfo siginfo;
int sig;
} signal;
struct {
int options;
} child;
struct {
} defer;
struct {
unsigned prioq_index;
} quit;
};
};
struct sd_event {
unsigned n_ref;
int epoll_fd;
int signal_fd;
int realtime_fd;
int monotonic_fd;
/* For both clocks we maintain two priority queues each, one
* ordered for the earliest times the events may be
* dispatched, and one ordered by the latest times they must
* have been dispatched. The range between the top entries in
* the two prioqs is the time window we can freely schedule
* wakeups in */
unsigned n_enabled_child_sources;
unsigned iteration;
int state;
bool quit_requested:1;
bool need_process_child:1;
};
static int pending_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
/* Enabled ones first */
return -1;
return 1;
/* Lower priority values first */
return -1;
return 1;
/* Older entries first */
if (x->pending_iteration < y->pending_iteration)
return -1;
if (x->pending_iteration > y->pending_iteration)
return 1;
/* Stability for the rest */
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static int prepare_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
/* Move most recently prepared ones last, so that we can stop
* preparing as soon as we hit one that has already been
* prepared in the current iteration */
if (x->prepare_iteration < y->prepare_iteration)
return -1;
if (x->prepare_iteration > y->prepare_iteration)
return 1;
/* Enabled ones first */
return -1;
return 1;
/* Lower priority values first */
return -1;
return 1;
/* Stability for the rest */
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static int earliest_time_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
/* Enabled ones first */
return -1;
return 1;
/* Move the pending ones to the end */
return -1;
return 1;
/* Order by time */
return -1;
return -1;
/* Stability for the rest */
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static int latest_time_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
/* Enabled ones first */
return -1;
return 1;
/* Move the pending ones to the end */
return -1;
return 1;
/* Order by time */
return -1;
return -1;
/* Stability for the rest */
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static int quit_prioq_compare(const void *a, const void *b) {
const sd_event_source *x = a, *y = b;
/* Enabled ones first */
return -1;
return 1;
/* Lower priority values first */
return -1;
return 1;
/* Stability for the rest */
if (x < y)
return -1;
if (x > y)
return 1;
return 0;
}
static void event_free(sd_event *e) {
assert(e);
if (e->epoll_fd >= 0)
if (e->signal_fd >= 0)
if (e->realtime_fd >= 0)
if (e->monotonic_fd >= 0)
prioq_free(e->pending);
prioq_free(e->prepare);
prioq_free(e->quit);
free(e->signal_sources);
free(e);
}
sd_event *e;
int r;
if (!e)
return -ENOMEM;
e->n_ref = 1;
e->original_pid = getpid();
if (!e->pending) {
r = -ENOMEM;
goto fail;
}
if (e->epoll_fd < 0) {
r = -errno;
goto fail;
}
*ret = e;
return 0;
fail:
event_free(e);
return r;
}
assert_return(e, NULL);
e->n_ref++;
return e;
}
assert_return(e, NULL);
e->n_ref--;
if (e->n_ref <= 0)
event_free(e);
return NULL;
}
static bool event_pid_changed(sd_event *e) {
assert(e);
/* We don't support people creating am event loop and keeping
* it around over a fork(). Let's complain. */
return e->original_pid != getpid();
}
static int source_io_unregister(sd_event_source *s) {
int r;
assert(s);
if (!s->io.registered)
return 0;
if (r < 0)
return -errno;
s->io.registered = false;
return 0;
}
static int source_io_register(
sd_event_source *s,
int enabled,
struct epoll_event ev = {};
int r;
assert(s);
if (enabled == SD_EVENT_ONESHOT)
if (s->io.registered)
else
if (r < 0)
return -errno;
s->io.registered = true;
return 0;
}
static void source_free(sd_event_source *s) {
assert(s);
if (s->event) {
switch (s->type) {
case SOURCE_IO:
break;
case SOURCE_MONOTONIC:
break;
case SOURCE_REALTIME:
break;
case SOURCE_SIGNAL:
if (s->event->signal_sources)
}
break;
case SOURCE_CHILD:
if (s->enabled != SD_EVENT_OFF) {
s->event->n_enabled_child_sources--;
}
}
break;
case SOURCE_DEFER:
/* nothing */
break;
case SOURCE_QUIT:
break;
}
if (s->pending)
if (s->prepare)
sd_event_unref(s->event);
}
free(s);
}
static int source_set_pending(sd_event_source *s, bool b) {
int r;
assert(s);
if (s->pending == b)
return 0;
s->pending = b;
if (b) {
if (r < 0) {
s->pending = false;
return r;
}
} else
return 0;
}
sd_event_source *s;
assert(e);
if (!s)
return NULL;
s->n_ref = 1;
s->event = sd_event_ref(e);
return s;
}
_public_ int sd_event_add_io(
sd_event *e,
int fd,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
s = source_new(e, SOURCE_IO);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ON;
if (r < 0) {
source_free(s);
return -errno;
}
*ret = s;
return 0;
}
static int event_setup_timer_fd(
sd_event *e,
int *timer_fd,
struct epoll_event ev = {};
int r, fd;
assert(e);
return 0;
if (fd < 0)
return -errno;
if (r < 0) {
return -errno;
}
/* When we sleep for longer, we try to realign the wakeup to
the same time wihtin each second, so that events all across
the system can be coalesced into a single CPU
wakeup. However, let's take some system-specific randomness
for this value, so that in a network of systems with synced
clocks timer events are distributed a bit. Here, we
calculate a perturbation usec offset from the boot ID. */
if (sd_id128_get_boot(&bootid) >= 0)
return 0;
}
static int event_add_time_internal(
sd_event *e,
int *timer_fd,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
if (!*earliest) {
if (!*earliest)
return -ENOMEM;
}
if (!*latest) {
if (!*latest)
return -ENOMEM;
}
if (*timer_fd < 0) {
if (r < 0)
return r;
}
s = source_new(e, type);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ONESHOT;
if (r < 0)
goto fail;
if (r < 0)
goto fail;
*ret = s;
return 0;
fail:
source_free(s);
return r;
}
void *userdata,
sd_event_source **ret) {
return event_add_time_internal(e, SOURCE_MONOTONIC, &e->monotonic_fd, CLOCK_MONOTONIC, &e->monotonic_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
}
void *userdata,
sd_event_source **ret) {
return event_add_time_internal(e, SOURCE_REALTIME, &e->realtime_fd, CLOCK_REALTIME, &e->realtime_earliest, &e->monotonic_latest, usec, accuracy, callback, userdata, ret);
}
static int event_update_signal_fd(sd_event *e) {
struct epoll_event ev = {};
bool add_to_epoll;
int r;
assert(e);
add_to_epoll = e->signal_fd < 0;
if (r < 0)
return -errno;
e->signal_fd = r;
if (!add_to_epoll)
return 0;
if (r < 0) {
e->signal_fd = -1;
return -errno;
}
return 0;
}
sd_event *e,
int sig,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
if (!e->signal_sources) {
if (!e->signal_sources)
return -ENOMEM;
} else if (e->signal_sources[sig])
return -EBUSY;
s = source_new(e, SOURCE_SIGNAL);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ON;
e->signal_sources[sig] = s;
r = event_update_signal_fd(e);
if (r < 0) {
source_free(s);
return r;
}
}
*ret = s;
return 0;
}
sd_event *e,
int options,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
if (r < 0)
return r;
return -EBUSY;
s = source_new(e, SOURCE_CHILD);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ONESHOT;
if (r < 0) {
source_free(s);
return r;
}
e->n_enabled_child_sources ++;
r = event_update_signal_fd(e);
if (r < 0) {
source_free(s);
return -errno;
}
}
e->need_process_child = true;
*ret = s;
return 0;
}
sd_event *e,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
s = source_new(e, SOURCE_DEFER);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ONESHOT;
r = source_set_pending(s, true);
if (r < 0) {
source_free(s);
return r;
}
*ret = s;
return 0;
}
sd_event *e,
void *userdata,
sd_event_source **ret) {
sd_event_source *s;
int r;
assert_return(e, -EINVAL);
if (!e->quit) {
if (!e->quit)
return -ENOMEM;
}
s = source_new(e, SOURCE_QUIT);
if (!s)
return -ENOMEM;
s->enabled = SD_EVENT_ONESHOT;
if (r < 0) {
source_free(s);
return r;
}
*ret = s;
return 0;
}
assert_return(s, NULL);
s->n_ref++;
return s;
}
assert_return(s, NULL);
s->n_ref--;
if (s->n_ref <= 0)
source_free(s);
return NULL;
}
assert_return(s, NULL);
return s->event;
}
assert_return(s, -EINVAL);
return s->pending;
}
assert_return(s, -EINVAL);
}
assert_return(s, -EINVAL);
return 0;
}
int r;
assert_return(s, -EINVAL);
return 0;
if (s->enabled != SD_EVENT_OFF) {
if (r < 0)
return r;
}
return 0;
}
assert_return(s, -EINVAL);
return 0;
}
assert_return(s, -EINVAL);
}
assert_return(s, -EINVAL);
return s->priority;
}
assert_return(s, -EINVAL);
return 0;
if (s->pending)
if (s->prepare)
if (s->type == SOURCE_QUIT)
return 0;
}
assert_return(s, -EINVAL);
assert_return(m, -EINVAL);
*m = s->enabled;
return 0;
}
int r;
assert_return(s, -EINVAL);
if (s->enabled == m)
return 0;
if (m == SD_EVENT_OFF) {
switch (s->type) {
case SOURCE_IO:
r = source_io_unregister(s);
if (r < 0)
return r;
s->enabled = m;
break;
case SOURCE_MONOTONIC:
s->enabled = m;
break;
case SOURCE_REALTIME:
s->enabled = m;
break;
case SOURCE_SIGNAL:
s->enabled = m;
}
break;
case SOURCE_CHILD:
s->enabled = m;
s->event->n_enabled_child_sources--;
}
break;
case SOURCE_QUIT:
s->enabled = m;
break;
case SOURCE_DEFER:
s->enabled = m;
break;
}
} else {
switch (s->type) {
case SOURCE_IO:
if (r < 0)
return r;
s->enabled = m;
break;
case SOURCE_MONOTONIC:
s->enabled = m;
break;
case SOURCE_REALTIME:
s->enabled = m;
break;
case SOURCE_SIGNAL:
s->enabled = m;
}
break;
case SOURCE_CHILD:
s->enabled = m;
if (s->enabled == SD_EVENT_OFF) {
s->event->n_enabled_child_sources++;
}
}
break;
case SOURCE_QUIT:
s->enabled = m;
break;
case SOURCE_DEFER:
s->enabled = m;
break;
}
}
if (s->pending)
if (s->prepare)
return 0;
}
assert_return(s, -EINVAL);
return 0;
}
assert_return(s, -EINVAL);
return 0;
if (s->type == SOURCE_REALTIME) {
} else {
}
return 0;
}
assert_return(s, -EINVAL);
return 0;
}
assert_return(s, -EINVAL);
if (usec == 0)
return 0;
if (s->type == SOURCE_REALTIME)
else
return 0;
}
assert_return(s, -EINVAL);
return 0;
}
int r;
assert_return(s, -EINVAL);
return 0;
return 0;
}
if (r < 0)
return r;
if (callback) {
if (r < 0)
return r;
} else
return 0;
}
assert_return(s, NULL);
return s->userdata;
}
usec_t c;
assert(e);
assert(a <= b);
if (a <= 0)
return 0;
if (b <= a + 1)
return a;
/*
Find a good time to wake up again between times a and b. We
have two goals here:
a) We want to wake up as seldom as possible, hence prefer
later times over earlier times.
b) But if we have to wake up, then let's make sure to
dispatch as much as possible on the entire system.
We implement this by waking up everywhere at the same time
within any given second if we can, synchronised via the
perturbation value determined from the boot ID. If we can't,
then we try to find the same spot in every a 250ms
step. Otherwise, we pick the last possible time to wake up.
*/
if (c >= b) {
if (_unlikely_(c < USEC_PER_SEC))
return b;
c -= USEC_PER_SEC;
}
if (c >= a)
return c;
if (c >= b) {
return b;
c -= USEC_PER_MSEC*250;
}
if (c >= a)
return c;
return b;
}
static int event_arm_timer(
sd_event *e,
int timer_fd,
struct itimerspec its = {};
sd_event_source *a, *b;
usec_t t;
int r;
assert_se(e);
a = prioq_peek(earliest);
if (!a || a->enabled == SD_EVENT_OFF) {
return 0;
/* disarm */
if (r < 0)
return r;
return 0;
}
b = prioq_peek(latest);
if (*next == t)
return 0;
if (t == 0) {
/* We don' want to disarm here, just mean some time looooong ago. */
} else
if (r < 0)
return r;
*next = t;
return 0;
}
assert(e);
assert(s);
return source_set_pending(s, true);
}
uint64_t x;
assert(e);
if (ss < 0) {
return 0;
return -errno;
}
if (ss != sizeof(x))
return -EIO;
return 0;
}
static int process_timer(
sd_event *e,
usec_t n,
sd_event_source *s;
int r;
assert(e);
for (;;) {
s = prioq_peek(earliest);
if (!s ||
s->enabled == SD_EVENT_OFF ||
s->pending)
break;
r = source_set_pending(s, true);
if (r < 0)
return r;
}
return 0;
}
static int process_child(sd_event *e) {
sd_event_source *s;
Iterator i;
int r;
assert(e);
e->need_process_child = false;
/*
So, this is ugly. We iteratively invoke waitid() with P_PID
+ WNOHANG for each PID we wait for, instead of using
P_ALL. This is because we only want to get child
information of very specific child processes, and not all
of them. We might not have processed the SIGCHLD even of a
previous invocation and we don't want to maintain a
unbounded *per-child* event queue, hence we really don't
want anything flushed out of the kernel's queue that we
don't care about. Since this is O(n) this means that if you
have a lot of processes you probably want to handle SIGCHLD
yourself.
*/
HASHMAP_FOREACH(s, e->child_sources, i) {
if (s->pending)
continue;
if (s->enabled == SD_EVENT_OFF)
continue;
if (r < 0)
return -errno;
r = source_set_pending(s, true);
if (r < 0)
return r;
}
}
return 0;
}
bool read_one = false;
int r;
assert(e);
assert(e->signal_sources);
for (;;) {
struct signalfd_siginfo si;
sd_event_source *s;
if (ss < 0) {
return read_one;
return -errno;
}
return -EIO;
read_one = true;
r = process_child(e);
if (r < 0)
return r;
if (r > 0 || !s)
continue;
} else
if (!s)
return -EIO;
r = source_set_pending(s, true);
if (r < 0)
return r;
}
return 0;
}
static int source_dispatch(sd_event_source *s) {
int r = 0;
assert(s);
r = source_set_pending(s, false);
if (r < 0)
return r;
}
if (s->enabled == SD_EVENT_ONESHOT) {
r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
if (r < 0)
return r;
}
switch (s->type) {
case SOURCE_IO:
break;
case SOURCE_MONOTONIC:
break;
case SOURCE_REALTIME:
break;
case SOURCE_SIGNAL:
break;
case SOURCE_CHILD:
break;
case SOURCE_DEFER:
break;
case SOURCE_QUIT:
break;
}
return r;
}
static int event_prepare(sd_event *e) {
int r;
assert(e);
for (;;) {
sd_event_source *s;
s = prioq_peek(e->prepare);
break;
s->prepare_iteration = e->iteration;
if (r < 0)
return r;
if (r < 0)
return r;
}
return 0;
}
static int dispatch_quit(sd_event *e) {
sd_event_source *p;
int r;
assert(e);
p = prioq_peek(e->quit);
if (!p || p->enabled == SD_EVENT_OFF) {
e->state = SD_EVENT_FINISHED;
return 0;
}
sd_event_ref(e);
e->iteration++;
e->state = SD_EVENT_QUITTING;
r = source_dispatch(p);
e->state = SD_EVENT_PASSIVE;
sd_event_unref(e);
return r;
}
sd_event_source *p;
assert(e);
p = prioq_peek(e->pending);
if (!p)
return NULL;
if (p->enabled == SD_EVENT_OFF)
return NULL;
return p;
}
sd_event_source *p;
int r, i, m;
assert_return(e, -EINVAL);
if (e->quit_requested)
return dispatch_quit(e);
sd_event_ref(e);
e->iteration++;
e->state = SD_EVENT_RUNNING;
r = event_prepare(e);
if (r < 0)
goto finish;
if (event_next_pending(e) || e->need_process_child)
timeout = 0;
if (timeout > 0) {
r = event_arm_timer(e, e->monotonic_fd, e->monotonic_earliest, e->monotonic_latest, &e->monotonic_next);
if (r < 0)
goto finish;
r = event_arm_timer(e, e->realtime_fd, e->realtime_earliest, e->realtime_latest, &e->realtime_next);
if (r < 0)
goto finish;
}
if (m < 0) {
goto finish;
}
dual_timestamp_get(&e->timestamp);
for (i = 0; i < m; i++) {
else
if (r < 0)
goto finish;
}
if (r < 0)
goto finish;
if (r < 0)
goto finish;
if (e->need_process_child) {
r = process_child(e);
if (r < 0)
goto finish;
}
p = event_next_pending(e);
if (!p) {
r = 0;
goto finish;
}
r = source_dispatch(p);
e->state = SD_EVENT_PASSIVE;
sd_event_unref(e);
return r;
}
int r;
assert_return(e, -EINVAL);
sd_event_ref(e);
while (e->state != SD_EVENT_FINISHED) {
if (r < 0)
goto finish;
}
r = 0;
sd_event_unref(e);
return r;
}
assert_return(e, -EINVAL);
return e->state;
}
assert_return(e, -EINVAL);
return e->quit_requested;
}
assert_return(e, -EINVAL);
e->quit_requested = true;
return 0;
}
assert_return(e, -EINVAL);
return 0;
}
assert_return(e, -EINVAL);
return 0;
}