From 2e8de0de3b5ae10277cd0fd72e782d2f6aa30c81 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 3 Oct 2017 13:58:22 -0400 Subject: [PATCH 1/4] sched: Implement SCHED_BUG() Signed-off-by: David Goulet --- src/or/scheduler.c | 29 +++++++++++++++++++++++++++++ src/or/scheduler.h | 8 ++++++++ 2 files changed, 37 insertions(+) diff --git a/src/or/scheduler.c b/src/or/scheduler.c index 22a9b574a..c656c80e9 100644 --- a/src/or/scheduler.c +++ b/src/or/scheduler.c @@ -9,6 +9,9 @@ #define SCHEDULER_KIST_PRIVATE #include "scheduler.h" #include "main.h" +#include "buffers.h" +#define TOR_CHANNEL_INTERNAL_ +#include "channeltls.h" #include @@ -667,6 +670,32 @@ scheduler_channel_wants_writes(channel_t *chan) } } +/* Log warn the given channel and extra scheduler context as well. This is + * used by SCHED_BUG() in order to be able to extract as much information as + * we can when we hit a bug. Channel chan can be NULL. */ +void +scheduler_bug_occurred(const channel_t *chan) +{ + char buf[128]; + + if (chan != NULL) { + const size_t outbuf_len = + buf_datalen(TO_CONN(BASE_CHAN_TO_TLS((channel_t *) chan)->conn)->outbuf); + tor_snprintf(buf, sizeof(buf), + "Channel %" PRIu64 " in state %s and scheduler state %d." + " Num cells on cmux: %d. Connection outbuf len: %lu.", + chan->global_identifier, + channel_state_to_string(chan->state), + chan->scheduler_state, circuitmux_num_cells(chan->cmux), + outbuf_len); + } + + log_warn(LD_BUG, "%s Num pending channels: %d. Channel in pending list: %s", + (chan != NULL) ? buf : "No channel in bug context.", + smartlist_len(channels_pending), + (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes"); +} + #ifdef TOR_UNIT_TESTS /* diff --git a/src/or/scheduler.h b/src/or/scheduler.h index 0ec24db93..47c98f096 100644 --- a/src/or/scheduler.h +++ b/src/or/scheduler.h @@ -143,6 +143,14 @@ MOCK_DECL(void, scheduler_channel_has_waiting_cells, (channel_t *chan)); /********************************* * Defined in scheduler.c *********************************/ + +/* Triggers a BUG() and extra information with chan if available. */ +#define SCHED_BUG(cond, chan) \ + (PREDICT_UNLIKELY(cond) ? \ + ((BUG(cond)) ? (scheduler_bug_occurred(chan), 1) : 0) : 0) + +void scheduler_bug_occurred(const channel_t *chan); + smartlist_t *get_channels_pending(void); MOCK_DECL(int, scheduler_compare_channels, (const void *c1_v, const void *c2_v)); From 3931a6f264355d7fe017219f76bd5bec1d8bcbd8 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Tue, 3 Oct 2017 14:38:30 -0400 Subject: [PATCH 2/4] sched: Use SCHED_BUG() macro in scheduler When a BUG() occurs, this macro will print extra information about the state of the scheduler and the given channel if any. This will help us greatly to fix future bugs in the scheduler especially when they occur rarely. Fixes #23753 Signed-off-by: David Goulet --- changes/bug23753 | 4 ++++ src/or/scheduler.c | 2 +- src/or/scheduler_kist.c | 8 ++++---- 3 files changed, 9 insertions(+), 5 deletions(-) create mode 100644 changes/bug23753 diff --git a/changes/bug23753 b/changes/bug23753 new file mode 100644 index 000000000..8782a8e2d --- /dev/null +++ b/changes/bug23753 @@ -0,0 +1,4 @@ + o Minor features (logging, scheduler): + - Introduce a SCHED_BUG() function to log extra information about the + scheduler state if we ever catch a bug in the scheduler. Closes ticket + 23753. diff --git a/src/or/scheduler.c b/src/or/scheduler.c index c656c80e9..67911ff81 100644 --- a/src/or/scheduler.c +++ b/src/or/scheduler.c @@ -607,7 +607,7 @@ scheduler_release_channel,(channel_t *chan)) } if (chan->scheduler_state == SCHED_CHAN_PENDING) { - if (smartlist_pos(channels_pending, chan) == -1) { + if (SCHED_BUG(smartlist_pos(channels_pending, chan) == -1, chan)) { log_warn(LD_SCHED, "Scheduler asked to release channel %" PRIu64 " " "but it wasn't in channels_pending", chan->global_identifier); diff --git a/src/or/scheduler_kist.c b/src/or/scheduler_kist.c index d269fb49b..ea6910c70 100644 --- a/src/or/scheduler_kist.c +++ b/src/or/scheduler_kist.c @@ -116,7 +116,7 @@ channel_outbuf_length(channel_t *chan) /* In theory, this can not happen because we can not scheduler a channel * without a connection that has its outbuf initialized. Just in case, bug * on this so we can understand a bit more why it happened. */ - if (BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL)) { + if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) { return 0; } return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf); @@ -370,7 +370,7 @@ socket_can_write(socket_table_t *table, const channel_t *chan) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return 1; // Just return true, saying that kist wouldn't limit the socket } @@ -390,7 +390,7 @@ update_socket_info(socket_table_t *table, const channel_t *chan) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return; // Whelp. Entry didn't exist for some reason so nothing to do. } update_socket_info_impl(ent); @@ -402,7 +402,7 @@ update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return; // Whelp. Entry didn't exist so nothing to do. } From 52050bb2c6b498c05c7ec54947deff5f662db34e Mon Sep 17 00:00:00 2001 From: Matt Traudt Date: Tue, 3 Oct 2017 16:48:35 -0400 Subject: [PATCH 3/4] sched: Add another SCHED_BUG() callsite --- src/or/scheduler_kist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/or/scheduler_kist.c b/src/or/scheduler_kist.c index ea6910c70..1bc92f249 100644 --- a/src/or/scheduler_kist.c +++ b/src/or/scheduler_kist.c @@ -572,7 +572,7 @@ kist_scheduler_run(void) /* get best channel */ chan = smartlist_pqueue_pop(cp, scheduler_compare_channels, offsetof(channel_t, sched_heap_idx)); - IF_BUG_ONCE(!chan) { + if (SCHED_BUG(!chan, NULL)) { /* Some-freaking-how a NULL got into the channels_pending. That should * never happen, but it should be harmless to ignore it and keep looping. */ From 6dc591a024e4dde8f4beafc7d42bdaeac996cca4 Mon Sep 17 00:00:00 2001 From: David Goulet Date: Thu, 26 Oct 2017 14:44:44 -0400 Subject: [PATCH 4/4] sched: Rate limit scheduler_bug_occurred() Just in case we end up hitting a SCHED_BUG() multiple times, rate limit the log warning. Signed-off-by: David Goulet --- src/or/scheduler.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/or/scheduler.c b/src/or/scheduler.c index 67911ff81..97b6d40b1 100644 --- a/src/or/scheduler.c +++ b/src/or/scheduler.c @@ -690,10 +690,21 @@ scheduler_bug_occurred(const channel_t *chan) outbuf_len); } - log_warn(LD_BUG, "%s Num pending channels: %d. Channel in pending list: %s", - (chan != NULL) ? buf : "No channel in bug context.", - smartlist_len(channels_pending), - (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes"); + { + char *msg; + /* Rate limit every 60 seconds. If we start seeing this every 60 sec, we + * know something is stuck/wrong. It *should* be loud but not too much. */ + static ratelim_t rlimit = RATELIM_INIT(60); + if ((msg = rate_limit_log(&rlimit, approx_time()))) { + log_warn(LD_BUG, "%s Num pending channels: %d. " + "Channel in pending list: %s.%s", + (chan != NULL) ? buf : "No channel in bug context.", + smartlist_len(channels_pending), + (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes", + msg); + tor_free(msg); + } + } } #ifdef TOR_UNIT_TESTS