diff --git a/changes/bug23753 b/changes/bug23753 new file mode 100644 index 000000000..8782a8e2d --- /dev/null +++ b/changes/bug23753 @@ -0,0 +1,4 @@ + o Minor features (logging, scheduler): + - Introduce a SCHED_BUG() function to log extra information about the + scheduler state if we ever catch a bug in the scheduler. Closes ticket + 23753. diff --git a/src/or/scheduler.c b/src/or/scheduler.c index 22a9b574a..97b6d40b1 100644 --- a/src/or/scheduler.c +++ b/src/or/scheduler.c @@ -9,6 +9,9 @@ #define SCHEDULER_KIST_PRIVATE #include "scheduler.h" #include "main.h" +#include "buffers.h" +#define TOR_CHANNEL_INTERNAL_ +#include "channeltls.h" #include @@ -604,7 +607,7 @@ scheduler_release_channel,(channel_t *chan)) } if (chan->scheduler_state == SCHED_CHAN_PENDING) { - if (smartlist_pos(channels_pending, chan) == -1) { + if (SCHED_BUG(smartlist_pos(channels_pending, chan) == -1, chan)) { log_warn(LD_SCHED, "Scheduler asked to release channel %" PRIu64 " " "but it wasn't in channels_pending", chan->global_identifier); @@ -667,6 +670,43 @@ scheduler_channel_wants_writes(channel_t *chan) } } +/* Log warn the given channel and extra scheduler context as well. This is + * used by SCHED_BUG() in order to be able to extract as much information as + * we can when we hit a bug. Channel chan can be NULL. */ +void +scheduler_bug_occurred(const channel_t *chan) +{ + char buf[128]; + + if (chan != NULL) { + const size_t outbuf_len = + buf_datalen(TO_CONN(BASE_CHAN_TO_TLS((channel_t *) chan)->conn)->outbuf); + tor_snprintf(buf, sizeof(buf), + "Channel %" PRIu64 " in state %s and scheduler state %d." + " Num cells on cmux: %d. Connection outbuf len: %lu.", + chan->global_identifier, + channel_state_to_string(chan->state), + chan->scheduler_state, circuitmux_num_cells(chan->cmux), + outbuf_len); + } + + { + char *msg; + /* Rate limit every 60 seconds. If we start seeing this every 60 sec, we + * know something is stuck/wrong. It *should* be loud but not too much. */ + static ratelim_t rlimit = RATELIM_INIT(60); + if ((msg = rate_limit_log(&rlimit, approx_time()))) { + log_warn(LD_BUG, "%s Num pending channels: %d. " + "Channel in pending list: %s.%s", + (chan != NULL) ? buf : "No channel in bug context.", + smartlist_len(channels_pending), + (smartlist_pos(channels_pending, chan) == -1) ? "no" : "yes", + msg); + tor_free(msg); + } + } +} + #ifdef TOR_UNIT_TESTS /* diff --git a/src/or/scheduler.h b/src/or/scheduler.h index 0ec24db93..47c98f096 100644 --- a/src/or/scheduler.h +++ b/src/or/scheduler.h @@ -143,6 +143,14 @@ MOCK_DECL(void, scheduler_channel_has_waiting_cells, (channel_t *chan)); /********************************* * Defined in scheduler.c *********************************/ + +/* Triggers a BUG() and extra information with chan if available. */ +#define SCHED_BUG(cond, chan) \ + (PREDICT_UNLIKELY(cond) ? \ + ((BUG(cond)) ? (scheduler_bug_occurred(chan), 1) : 0) : 0) + +void scheduler_bug_occurred(const channel_t *chan); + smartlist_t *get_channels_pending(void); MOCK_DECL(int, scheduler_compare_channels, (const void *c1_v, const void *c2_v)); diff --git a/src/or/scheduler_kist.c b/src/or/scheduler_kist.c index 03aa9c45b..a3b74e8cc 100644 --- a/src/or/scheduler_kist.c +++ b/src/or/scheduler_kist.c @@ -116,7 +116,7 @@ channel_outbuf_length(channel_t *chan) /* In theory, this can not happen because we can not scheduler a channel * without a connection that has its outbuf initialized. Just in case, bug * on this so we can understand a bit more why it happened. */ - if (BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL)) { + if (SCHED_BUG(BASE_CHAN_TO_TLS(chan)->conn == NULL, chan)) { return 0; } return buf_datalen(TO_CONN(BASE_CHAN_TO_TLS(chan)->conn)->outbuf); @@ -371,7 +371,7 @@ socket_can_write(socket_table_t *table, const channel_t *chan) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return 1; // Just return true, saying that kist wouldn't limit the socket } @@ -391,7 +391,7 @@ update_socket_info(socket_table_t *table, const channel_t *chan) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return; // Whelp. Entry didn't exist for some reason so nothing to do. } update_socket_info_impl(ent); @@ -403,7 +403,7 @@ update_socket_written(socket_table_t *table, channel_t *chan, size_t bytes) { socket_table_ent_t *ent = NULL; ent = socket_table_search(table, chan); - IF_BUG_ONCE(!ent) { + if (SCHED_BUG(!ent, chan)) { return; // Whelp. Entry didn't exist so nothing to do. } @@ -573,7 +573,7 @@ kist_scheduler_run(void) /* get best channel */ chan = smartlist_pqueue_pop(cp, scheduler_compare_channels, offsetof(channel_t, sched_heap_idx)); - IF_BUG_ONCE(!chan) { + if (SCHED_BUG(!chan, NULL)) { /* Some-freaking-how a NULL got into the channels_pending. That should * never happen, but it should be harmless to ignore it and keep looping. */