tor/src/common/workqueue.c


/* copyright (c) 2013-2015, The Tor Project, Inc. */
/* See LICENSE for licensing information */

/**
 * \file workqueue.c
 *
 * \brief Implements worker threads, queues of work for them, and mechanisms
 * for them to send answers back to the main thread.
 *
 * The main structure here is a threadpool_t : it manages a set of worker
 * threads, a queue of pending work, and a reply queue.  Every piece of work
 * is a workqueue_entry_t, containing data to process and a function to
 * process it with.
 *
 * The main thread informs the worker threads of pending work by using a
 * condition variable.  The workers inform the main process of completed work
 * by using an alert_sockets_t object, as implemented in compat_threads.c.
 *
 * The main thread can also queue an "update" that will be handled by all the
 * workers.  This is useful for updating state that all the workers share.
 *
 * In Tor today, there is currently only one thread pool, used in cpuworker.c.
 */

#include "orconfig.h"
#include "compat.h"
#include "compat_libevent.h"
#include "compat_threads.h"
#include "crypto_rand.h"
#include "util.h"
#include "workqueue.h"
#include "tor_queue.h"
#include "torlog.h"

#include <event2/event.h>

#define WORKQUEUE_PRIORITY_FIRST WQ_PRI_HIGH
#define WORKQUEUE_PRIORITY_LAST WQ_PRI_LOW
#define WORKQUEUE_N_PRIORITIES (((int) WORKQUEUE_PRIORITY_LAST)+1)

TOR_TAILQ_HEAD(work_tailq_t, workqueue_entry_s);
typedef struct work_tailq_t work_tailq_t;

struct threadpool_s {
  /** An array of pointers to workerthread_t: one for each running worker
   * thread. */
  struct workerthread_s **threads;

  /** Condition variable that we wait on when we have no work, and which
   * gets signaled when our queue becomes nonempty. */
  tor_cond_t condition;
  /** Queues of pending work that we have to do. The queue with priority
   * <b>p</b> is work[p]. */
  work_tailq_t work[WORKQUEUE_N_PRIORITIES];

  /** Weak RNG, used to decide when to ignore priority. */
  tor_weak_rng_t weak_rng;

  /** The current 'update generation' of the threadpool.  Any thread that is
   * at an earlier generation needs to run the update function. */
  unsigned generation;

  /** Function that should be run for updates on each thread. */
  workqueue_reply_t (*update_fn)(void *, void *);
  /** Function to free update arguments if they can't be run. */
  void (*free_update_arg_fn)(void *);
  /** Array of n_threads update arguments. */
  void **update_args;
  /** Event to notice when another thread has sent a reply. */
  struct event *reply_event;
  void (*reply_cb)(threadpool_t *);

  /** Number of elements in threads. */
  int n_threads;
  /** Mutex to protect all the above fields. */
  tor_mutex_t lock;

  /** A reply queue to use when constructing new threads. */
  replyqueue_t *reply_queue;

  /** Functions used to allocate and free thread state. */
  void *(*new_thread_state_fn)(void*);
  void (*free_thread_state_fn)(void*);
  void *new_thread_state_arg;
};

/** Used to put a workqueue_priority_t value into a bitfield. */
#define workqueue_priority_bitfield_t ENUM_BF(workqueue_priority_t)
/** Number of bits needed to hold all legal values of workqueue_priority_t */
#define WORKQUEUE_PRIORITY_BITS 2

struct workqueue_entry_s {
  /** The next workqueue_entry_t that's pending on the same thread or
   * reply queue. */
  TOR_TAILQ_ENTRY(workqueue_entry_s) next_work;
  /** The threadpool to which this workqueue_entry_t was assigned. This field
   * is set when the workqueue_entry_t is created, and won't be cleared until
   * after it's handled in the main thread. */
  struct threadpool_s *on_pool;
  /** True iff this entry is waiting for a worker to start processing it. */
  uint8_t pending;
  /** Priority of this entry. */
  workqueue_priority_bitfield_t priority : WORKQUEUE_PRIORITY_BITS;
  /** Function to run in the worker thread. */
  workqueue_reply_t (*fn)(void *state, void *arg);
  /** Function to run while processing the reply queue. */
  void (*reply_fn)(void *arg);
  /** Argument for the above functions. */
  void *arg;
};

struct replyqueue_s {
  /** Mutex to protect the answers field */
  tor_mutex_t lock;
  /** Doubly-linked list of answers that the reply queue needs to handle. */
  TOR_TAILQ_HEAD(, workqueue_entry_s) answers;

  /** Mechanism to wake up the main thread when it is receiving answers. */
  alert_sockets_t alert;
};

/** A worker thread represents a single thread in a thread pool. */
typedef struct workerthread_s {
  /** Which thread it this?  In range 0..in_pool->n_threads-1 */
  int index;
  /** The pool this thread is a part of. */
  struct threadpool_s *in_pool;
  /** User-supplied state field that we pass to the worker functions of each
   * work item. */
  void *state;
  /** Reply queue to which we pass our results. */
  replyqueue_t *reply_queue;
  /** The current update generation of this thread */
  unsigned generation;
  /** One over the probability of taking work from a lower-priority queue. */
  int32_t lower_priority_chance;
} workerthread_t;

static void queue_reply(replyqueue_t *queue, workqueue_entry_t *work);

/** Allocate and return a new workqueue_entry_t, set up to run the function
 * <b>fn</b> in the worker thread, and <b>reply_fn</b> in the main
 * thread. See threadpool_queue_work() for full documentation. */
static workqueue_entry_t *
workqueue_entry_new(workqueue_reply_t (*fn)(void*, void*),
                    void (*reply_fn)(void*),
                    void *arg)
{
  workqueue_entry_t *ent = tor_malloc_zero(sizeof(workqueue_entry_t));
  ent->fn = fn;
  ent->reply_fn = reply_fn;
  ent->arg = arg;
  ent->priority = WQ_PRI_HIGH;
  return ent;
}

#define workqueue_entry_free(ent) \
  FREE_AND_NULL(workqueue_entry_t, workqueue_entry_free_, (ent))

/**
 * Release all storage held in <b>ent</b>. Call only when <b>ent</b> is not on
 * any queue.
 */
static void
workqueue_entry_free_(workqueue_entry_t *ent)
{
  if (!ent)
    return;
  memset(ent, 0xf0, sizeof(*ent));
  tor_free(ent);
}

/**
 * Cancel a workqueue_entry_t that has been returned from
 * threadpool_queue_work.
 *
 * You must not call this function on any work whose reply function has been
 * executed in the main thread; that will cause undefined behavior (probably,
 * a crash).
 *
 * If the work is cancelled, this function return the argument passed to the
 * work function. It is the caller's responsibility to free this storage.
 *
 * This function will have no effect if the worker thread has already executed
 * or begun to execute the work item.  In that case, it will return NULL.
 */
void *
workqueue_entry_cancel(workqueue_entry_t *ent)
{
  int cancelled = 0;
  void *result = NULL;
  tor_mutex_acquire(&ent->on_pool->lock);
  workqueue_priority_t prio = ent->priority;
  if (ent->pending) {
    TOR_TAILQ_REMOVE(&ent->on_pool->work[prio], ent, next_work);
    cancelled = 1;
    result = ent->arg;
  }
  tor_mutex_release(&ent->on_pool->lock);

  if (cancelled) {
    workqueue_entry_free(ent);
  }
  return result;
}

/**DOCDOC

   must hold lock */
static int
worker_thread_has_work(workerthread_t *thread)
{
  unsigned i;
  for (i = WORKQUEUE_PRIORITY_FIRST; i <= WORKQUEUE_PRIORITY_LAST; ++i) {
    if (!TOR_TAILQ_EMPTY(&thread->in_pool->work[i]))
        return 1;
  }
  return thread->generation != thread->in_pool->generation;
}

/** Extract the next workqueue_entry_t from the the thread's pool, removing
 * it from the relevant queues and marking it as non-pending.
 *
 * The caller must hold the lock. */
static workqueue_entry_t *
worker_thread_extract_next_work(workerthread_t *thread)
{
  threadpool_t *pool = thread->in_pool;
  work_tailq_t *queue = NULL, *this_queue;
  unsigned i;
  for (i = WORKQUEUE_PRIORITY_FIRST; i <= WORKQUEUE_PRIORITY_LAST; ++i) {
    this_queue = &pool->work[i];
    if (!TOR_TAILQ_EMPTY(this_queue)) {
      queue = this_queue;
      if (! tor_weak_random_one_in_n(&pool->weak_rng,
                                     thread->lower_priority_chance)) {
        /* Usually we'll just break now, so that we can get out of the loop
         * and use the queue where we found work. But with a small
         * probability, we'll keep looking for lower priority work, so that
         * we don't ignore our low-priority queues entirely. */
        break;
      }
    }
  }

  if (queue == NULL)
    return NULL;

  workqueue_entry_t *work = TOR_TAILQ_FIRST(queue);
  TOR_TAILQ_REMOVE(queue, work, next_work);
  work->pending = 0;
  return work;
}

/**
 * Main function for the worker thread.
 */
static void
worker_thread_main(void *thread_)
{
  workerthread_t *thread = thread_;
  threadpool_t *pool = thread->in_pool;
  workqueue_entry_t *work;
  workqueue_reply_t result;

  tor_mutex_acquire(&pool->lock);
  while (1) {
    /* lock must be held at this point. */
    while (worker_thread_has_work(thread)) {
      /* lock must be held at this point. */
      if (thread->in_pool->generation != thread->generation) {
        void *arg = thread->in_pool->update_args[thread->index];
        thread->in_pool->update_args[thread->index] = NULL;
        workqueue_reply_t (*update_fn)(void*,void*) =
            thread->in_pool->update_fn;
        thread->generation = thread->in_pool->generation;
        tor_mutex_release(&pool->lock);

        workqueue_reply_t r = update_fn(thread->state, arg);

        if (r != WQ_RPL_REPLY) {
          return;
        }

        tor_mutex_acquire(&pool->lock);
        continue;
      }
      work = worker_thread_extract_next_work(thread);
      if (BUG(work == NULL))
        break;
      tor_mutex_release(&pool->lock);

      /* We run the work function without holding the thread lock. This
       * is the main thread's first opportunity to give us more work. */
      result = work->fn(thread->state, work->arg);

      /* Queue the reply for the main thread. */
      queue_reply(thread->reply_queue, work);

      /* We may need to exit the thread. */
      if (result != WQ_RPL_REPLY) {
        return;
      }
      tor_mutex_acquire(&pool->lock);
    }
    /* At this point the lock is held, and there is no work in this thread's
     * queue. */

    /* TODO: support an idle-function */

    /* Okay. Now, wait till somebody has work for us. */
    if (tor_cond_wait(&pool->condition, &pool->lock, NULL) < 0) {
      log_warn(LD_GENERAL, "Fail tor_cond_wait.");
    }
  }
}

/** Put a reply on the reply queue.  The reply must not currently be on
 * any thread's work queue. */
static void
queue_reply(replyqueue_t *queue, workqueue_entry_t *work)
{
  int was_empty;
  tor_mutex_acquire(&queue->lock);
  was_empty = TOR_TAILQ_EMPTY(&queue->answers);
  TOR_TAILQ_INSERT_TAIL(&queue->answers, work, next_work);
  tor_mutex_release(&queue->lock);

  if (was_empty) {
    if (queue->alert.alert_fn(queue->alert.write_fd) < 0) {
      /* XXXX complain! */
    }
  }
}

/** Allocate and start a new worker thread to use state object <b>state</b>,
 * and send responses to <b>replyqueue</b>. */
static workerthread_t *
workerthread_new(int32_t lower_priority_chance,
                 void *state, threadpool_t *pool, replyqueue_t *replyqueue)
{
  workerthread_t *thr = tor_malloc_zero(sizeof(workerthread_t));
  thr->state = state;
  thr->reply_queue = replyqueue;
  thr->in_pool = pool;
  thr->lower_priority_chance = lower_priority_chance;

  if (spawn_func(worker_thread_main, thr) < 0) {
    //LCOV_EXCL_START
    tor_assert_nonfatal_unreached();
    log_err(LD_GENERAL, "Can't launch worker thread.");
    tor_free(thr);
    return NULL;
    //LCOV_EXCL_STOP
  }

  return thr;
}

/**
 * Queue an item of work for a thread in a thread pool.  The function
 * <b>fn</b> will be run in a worker thread, and will receive as arguments the
 * thread's state object, and the provided object <b>arg</b>. It must return
 * one of WQ_RPL_REPLY, WQ_RPL_ERROR, or WQ_RPL_SHUTDOWN.
 *
 * Regardless of its return value, the function <b>reply_fn</b> will later be
 * run in the main thread when it invokes replyqueue_process(), and will
 * receive as its argument the same <b>arg</b> object.  It's the reply
 * function's responsibility to free the work object.
 *
 * On success, return a workqueue_entry_t object that can be passed to
 * workqueue_entry_cancel(). On failure, return NULL.  (Failure is not
 * currently possible, but callers should check anyway.)
 *
 * Items are executed in a loose priority order -- each thread will usually
 * take from the queued work with the highest prioirity, but will occasionally
 * visit lower-priority queues to keep them from starving completely.
 *
 * Note that because of priorities and thread behavior, work items may not
 * be executed strictly in order.
 */
workqueue_entry_t *
threadpool_queue_work_priority(threadpool_t *pool,
                               workqueue_priority_t prio,
                               workqueue_reply_t (*fn)(void *, void *),
                               void (*reply_fn)(void *),
                               void *arg)
{
  tor_assert(((int)prio) >= WORKQUEUE_PRIORITY_FIRST &&
             ((int)prio) <= WORKQUEUE_PRIORITY_LAST);

  workqueue_entry_t *ent = workqueue_entry_new(fn, reply_fn, arg);
  ent->on_pool = pool;
  ent->pending = 1;
  ent->priority = prio;

  tor_mutex_acquire(&pool->lock);

  TOR_TAILQ_INSERT_TAIL(&pool->work[prio], ent, next_work);

  tor_cond_signal_one(&pool->condition);

  tor_mutex_release(&pool->lock);

  return ent;
}

/** As threadpool_queue_work_priority(), but assumes WQ_PRI_HIGH */
workqueue_entry_t *
threadpool_queue_work(threadpool_t *pool,
                      workqueue_reply_t (*fn)(void *, void *),
                      void (*reply_fn)(void *),
                      void *arg)
{
  return threadpool_queue_work_priority(pool, WQ_PRI_HIGH, fn, reply_fn, arg);
}

/**
 * Queue a copy of a work item for every thread in a pool.  This can be used,
 * for example, to tell the threads to update some parameter in their states.
 *
 * Arguments are as for <b>threadpool_queue_work</b>, except that the
 * <b>arg</b> value is passed to <b>dup_fn</b> once per each thread to
 * make a copy of it.
 *
 * UPDATE FUNCTIONS MUST BE IDEMPOTENT.  We do not guarantee that every update
 * will be run.  If a new update is scheduled before the old update finishes
 * running, then the new will replace the old in any threads that haven't run
 * it yet.
 *
 * Return 0 on success, -1 on failure.
 */
int
threadpool_queue_update(threadpool_t *pool,
                         void *(*dup_fn)(void *),
                         workqueue_reply_t (*fn)(void *, void *),
                         void (*free_fn)(void *),
                         void *arg)
{
  int i, n_threads;
  void (*old_args_free_fn)(void *arg);
  void **old_args;
  void **new_args;

  tor_mutex_acquire(&pool->lock);
  n_threads = pool->n_threads;
  old_args = pool->update_args;
  old_args_free_fn = pool->free_update_arg_fn;

  new_args = tor_calloc(n_threads, sizeof(void*));
  for (i = 0; i < n_threads; ++i) {
    if (dup_fn)
      new_args[i] = dup_fn(arg);
    else
      new_args[i] = arg;
  }

  pool->update_args = new_args;
  pool->free_update_arg_fn = free_fn;
  pool->update_fn = fn;
  ++pool->generation;

  tor_cond_signal_all(&pool->condition);

  tor_mutex_release(&pool->lock);

  if (old_args) {
    for (i = 0; i < n_threads; ++i) {
      if (old_args[i] && old_args_free_fn)
        old_args_free_fn(old_args[i]);
    }
    tor_free(old_args);
  }

  return 0;
}

/** Don't have more than this many threads per pool. */
#define MAX_THREADS 1024

/** For half of our threads, choose lower priority queues with probability
 * 1/N for each of these values. Both are chosen somewhat arbitrarily.  If
 * CHANCE_PERMISSIVE is too low, then we have a risk of low-priority tasks
 * stalling forever.  If it's too high, we have a risk of low-priority tasks
 * grabbing half of the threads. */
#define CHANCE_PERMISSIVE 37
#define CHANCE_STRICT INT32_MAX

/** Launch threads until we have <b>n</b>. */
static int
threadpool_start_threads(threadpool_t *pool, int n)
{
  if (BUG(n < 0))
    return -1; // LCOV_EXCL_LINE
  if (n > MAX_THREADS)
    n = MAX_THREADS;

  tor_mutex_acquire(&pool->lock);

  if (pool->n_threads < n)
    pool->threads = tor_reallocarray(pool->threads,
                                     sizeof(workerthread_t*), n);

  while (pool->n_threads < n) {
    /* For half of our threads, we'll choose lower priorities permissively;
     * for the other half, we'll stick more strictly to higher priorities.
     * This keeps slow low-priority tasks from taking over completely. */
    int32_t chance = (pool->n_threads & 1) ? CHANCE_STRICT : CHANCE_PERMISSIVE;

    void *state = pool->new_thread_state_fn(pool->new_thread_state_arg);
    workerthread_t *thr = workerthread_new(chance,
                                           state, pool, pool->reply_queue);

    if (!thr) {
      //LCOV_EXCL_START
      tor_assert_nonfatal_unreached();
      pool->free_thread_state_fn(state);
      tor_mutex_release(&pool->lock);
      return -1;
      //LCOV_EXCL_STOP
    }
    thr->index = pool->n_threads;
    pool->threads[pool->n_threads++] = thr;
  }
  tor_mutex_release(&pool->lock);

  return 0;
}

/**
 * Construct a new thread pool with <b>n</b> worker threads, configured to
 * send their output to <b>replyqueue</b>.  The threads' states will be
 * constructed with the <b>new_thread_state_fn</b> call, receiving <b>arg</b>
 * as its argument.  When the threads close, they will call
 * <b>free_thread_state_fn</b> on their states.
 */
threadpool_t *
threadpool_new(int n_threads,
               replyqueue_t *replyqueue,
               void *(*new_thread_state_fn)(void*),
               void (*free_thread_state_fn)(void*),
               void *arg)
{
  threadpool_t *pool;
  pool = tor_malloc_zero(sizeof(threadpool_t));
  tor_mutex_init_nonrecursive(&pool->lock);
  tor_cond_init(&pool->condition);
  unsigned i;
  for (i = WORKQUEUE_PRIORITY_FIRST; i <= WORKQUEUE_PRIORITY_LAST; ++i) {
    TOR_TAILQ_INIT(&pool->work[i]);
  }
  {
    unsigned seed;
    crypto_rand((void*)&seed, sizeof(seed));
    tor_init_weak_random(&pool->weak_rng, seed);
  }

  pool->new_thread_state_fn = new_thread_state_fn;
  pool->new_thread_state_arg = arg;
  pool->free_thread_state_fn = free_thread_state_fn;
  pool->reply_queue = replyqueue;

  if (threadpool_start_threads(pool, n_threads) < 0) {
    //LCOV_EXCL_START
    tor_assert_nonfatal_unreached();
    tor_cond_uninit(&pool->condition);
    tor_mutex_uninit(&pool->lock);
    tor_free(pool);
    return NULL;
    //LCOV_EXCL_STOP
  }

  return pool;
}

/** Return the reply queue associated with a given thread pool. */
replyqueue_t *
threadpool_get_replyqueue(threadpool_t *tp)
{
  return tp->reply_queue;
}

/** Allocate a new reply queue.  Reply queues are used to pass results from
 * worker threads to the main thread.  Since the main thread is running an
 * IO-centric event loop, it needs to get woken up with means other than a
 * condition variable. */
replyqueue_t *
replyqueue_new(uint32_t alertsocks_flags)
{
  replyqueue_t *rq;

  rq = tor_malloc_zero(sizeof(replyqueue_t));
  if (alert_sockets_create(&rq->alert, alertsocks_flags) < 0) {
    //LCOV_EXCL_START
    tor_free(rq);
    return NULL;
    //LCOV_EXCL_STOP
  }

  tor_mutex_init(&rq->lock);
  TOR_TAILQ_INIT(&rq->answers);

  return rq;
}

/** Internal: Run from the libevent mainloop when there is work to handle in
 * the reply queue handler. */
static void
reply_event_cb(evutil_socket_t sock, short events, void *arg)
{
  threadpool_t *tp = arg;
  (void) sock;
  (void) events;
  replyqueue_process(tp->reply_queue);
  if (tp->reply_cb)
    tp->reply_cb(tp);
}

/** Register the threadpool <b>tp</b>'s reply queue with the libevent
 * mainloop of <b>base</b>. If <b>tp</b> is provided, it is run after
 * each time there is work to process from the reply queue. Return 0 on
 * success, -1 on failure.
 */
int
threadpool_register_reply_event(threadpool_t *tp,
                                void (*cb)(threadpool_t *tp))
{
  struct event_base *base = tor_libevent_get_base();

  if (tp->reply_event) {
    tor_event_free(tp->reply_event);
  }
  tp->reply_event = tor_event_new(base,
                                  tp->reply_queue->alert.read_fd,
                                  EV_READ|EV_PERSIST,
                                  reply_event_cb,
                                  tp);
  tor_assert(tp->reply_event);
  tp->reply_cb = cb;
  return event_add(tp->reply_event, NULL);
}

/**
 * Process all pending replies on a reply queue. The main thread should call
 * this function every time the socket returned by replyqueue_get_socket() is
 * readable.
 */
void
replyqueue_process(replyqueue_t *queue)
{
  int r = queue->alert.drain_fn(queue->alert.read_fd);
  if (r < 0) {
    //LCOV_EXCL_START
    static ratelim_t warn_limit = RATELIM_INIT(7200);
    log_fn_ratelim(&warn_limit, LOG_WARN, LD_GENERAL,
                 "Failure from drain_fd: %s",
                   tor_socket_strerror(-r));
    //LCOV_EXCL_STOP
  }

  tor_mutex_acquire(&queue->lock);
  while (!TOR_TAILQ_EMPTY(&queue->answers)) {
    /* lock must be held at this point.*/
    workqueue_entry_t *work = TOR_TAILQ_FIRST(&queue->answers);
    TOR_TAILQ_REMOVE(&queue->answers, work, next_work);
    tor_mutex_release(&queue->lock);
    work->on_pool = NULL;

    work->reply_fn(work->arg);
    workqueue_entry_free(work);

    tor_mutex_acquire(&queue->lock);
  }

  tor_mutex_release(&queue->lock);
}