diff --git a/.gitignore b/.gitignore index dad8be1a4..c71dc026b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,11 @@ cscope.* # OSX junk *.dSYM .DS_Store +# updateFallbackDirs.py temp files +details-*.json +uptime-*.json +*.full_url +*.last_modified # / /Makefile diff --git a/changes/feature15775-fallback b/changes/feature15775-fallback new file mode 100644 index 000000000..567d01cc7 --- /dev/null +++ b/changes/feature15775-fallback @@ -0,0 +1,19 @@ + o Major features (directory mirrors): + - Include an opt-in trial list of Default Fallback Directories in + add_default_fallback_dir_servers(). + "Tor has included a feature to fetch the initial consensus from nodes + other than the authorities for a while now. We just haven't shipped a + list of alternate locations for clients to go to yet. + Reasons why we might want to ship tor with a list of additional places + where clients can find the consensus is that it makes authority + reachability and BW less important. + We want them to have been around and using their current key, address, + and port for a while now (120 days), and have been running, a guard, + and a v2 directory mirror for most of that time." + We exclude BadExits and tor versions that aren't recommended. + We include an IPv6 address for each FallbackDir (#8374). + (Tor might not use IPv6 fallbacks until #6027 is merged.) + The unit test ensures that we successfully load all included + default fallback directories. + Closes ticket #15775. Patch by "teor". + OnionOO script by "weasel", "teor", "gsathya", and "karsten". diff --git a/scripts/maint/fallback.blacklist b/scripts/maint/fallback.blacklist new file mode 100644 index 000000000..919ae3360 --- /dev/null +++ b/scripts/maint/fallback.blacklist @@ -0,0 +1,19 @@ +# updateFallbackDirs.py directory mirror blacklist +# +# Format: +# [ IPv4[:DirPort] ] [ orport=<ORPort> ] [ id=<ID> ] ... +# [ ipv6=<IPv6>[:<IPv6 ORPort>] ] +# +# If a sufficiently specific group of attributes matches, the directory mirror +# will be excluded: (each group is listed on its own line) +# <IPv4>, <DirPort> +# <IPv4>, <ORPort> +# <ID> +# <IPv6>, <DirPort> +# <IPv6>, <IPv6 ORPort> +# If DirPort and ORPort are not present, the entire IP address is blacklisted. +# (The blacklist overrides the whitelist.) + +# If a relay operator doesn't want their relay to be a FallbackDir, +# enter the following information here: +# <IPv4>:<DirPort> orport=<ORPort> id=<ID> ipv6=<IPv6>:<IPv6 ORPort> diff --git a/scripts/maint/fallback.whitelist b/scripts/maint/fallback.whitelist new file mode 100644 index 000000000..a88dfaaef --- /dev/null +++ b/scripts/maint/fallback.whitelist @@ -0,0 +1,13 @@ +# updateFallbackDirs.py directory mirror whitelist +# +# Format: +# IPv4:DirPort orport=<ORPort> id=<ID> [ ipv6=<IPv6>:<IPv6 ORPort> ] +# +# All attributes must match for the directory mirror to be included. +# If the fallback has an ipv6 key, the whitelist line must also have +# it, and vice versa, otherwise they don't match. +# (The blacklist overrides the whitelist.)
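+# +# Example entry (hypothetical, using RFC 5737 / RFC 3849 documentation +# addresses and a made-up fingerprint): +# 192.0.2.1:80 orport=443 id=0123456789ABCDEF0123456789ABCDEF01234567 ipv6=[2001:db8::1]:443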
+ +# If a relay operator wants their relay to be a FallbackDir, +# enter the following information here: +# <IPv4>:<DirPort> orport=<ORPort> id=<ID> [ ipv6=<IPv6>:<IPv6 ORPort> ] diff --git a/scripts/maint/updateFallbackDirs.py b/scripts/maint/updateFallbackDirs.py new file mode 100755 index 000000000..8fe234f20 --- /dev/null +++ b/scripts/maint/updateFallbackDirs.py @@ -0,0 +1,1225 @@ +#!/usr/bin/python + +# Usage: scripts/maint/updateFallbackDirs.py > src/or/fallback_dirs.inc +# +# Then read the generated list to ensure no-one slipped anything funny into +# their name or contactinfo + +# Script by weasel, April 2015 +# Portions by gsathya & karsten, 2013 +# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py +# Modifications by teor, 2015 + +import StringIO +import string +import re +import datetime +import gzip +import os.path +import json +import math +import sys +import urllib +import urllib2 +import hashlib +import dateutil.parser +# bson_lazy provides bson +#from bson import json_util + +import logging +logging.basicConfig(level=logging.INFO) + +## Top-Level Configuration + +# Output all candidate fallbacks, or only output selected fallbacks? +OUTPUT_CANDIDATES = False + +## OnionOO Settings + +ONIONOO = 'https://onionoo.torproject.org/' +#ONIONOO = 'https://onionoo.thecthulhu.com/' + +# Don't bother going out to the Internet, just use the files available locally, +# even if they're very old +LOCAL_FILES_ONLY = False + +## Whitelist / Blacklist Filter Settings + +# The whitelist contains entries that are included if all attributes match +# (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport) +# The blacklist contains (partial) entries that are excluded if any +# sufficiently specific group of attributes matches: +# IPv4 & DirPort +# IPv4 & ORPort +# ID +# IPv6 & DirPort +# IPv6 & IPv6 ORPort +# If neither port is included in the blacklist, the entire IP address is +# blacklisted. + +# What happens to entries in neither list? +# When True, they are included, when False, they are excluded +INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False + +# If an entry is in both lists, what happens? +# When True, it is excluded, when False, it is included +BLACKLIST_EXCLUDES_WHITELIST_ENTRIES = True + +WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist' +BLACKLIST_FILE_NAME = 'scripts/maint/fallback.blacklist' + +# The number of bytes we'll read from a filter file before giving up +MAX_LIST_FILE_SIZE = 1024 * 1024 + +## Eligibility Settings + +ADDRESS_AND_PORT_STABLE_DAYS = 120 +# What time-weighted-fraction of these flags must FallbackDirs +# Equal or Exceed? +CUTOFF_RUNNING = .95 +CUTOFF_V2DIR = .95 +CUTOFF_GUARD = .95 +# What time-weighted-fraction of these flags must FallbackDirs +# Equal or Fall Under?
+# .00 means no bad exits +PERMITTED_BADEXIT = .00 + +## List Length Limits + +# The target for these parameters is 20% of the guards in the network +# This is around 200 as of October 2015 +FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else 0.2 + +# Limit the number of fallbacks (eliminating lowest by weight) +MAX_FALLBACK_COUNT = 500 +# Emit a C #error if the number of fallbacks is below this minimum +MIN_FALLBACK_COUNT = 100 + +## Fallback Weight Settings + +# Any fallback with the Exit flag has its weight multiplied by this fraction +EXIT_WEIGHT_FRACTION = 0.2 + +# If True, emit a C #error if we can't satisfy various constraints +# If False, emit a C comment instead +STRICT_FALLBACK_WEIGHTS = False + +# Limit the proportional weight +# If a single fallback's weight is too high, it will see too many clients +# We reweight using a lower threshold to provide some leeway for: +# * elimination of low weight relays +# * consensus weight changes +# * fallback directory losses over time +# A relay weighted at 1 in 10 fallbacks will see about 10% of clients that +# use the fallback directories. (The 9 directory authorities see a similar +# proportion of clients.) +TARGET_MAX_WEIGHT_FRACTION = 1/10.0 +REWEIGHTING_FUDGE_FACTOR = 0.8 +MAX_WEIGHT_FRACTION = TARGET_MAX_WEIGHT_FRACTION * REWEIGHTING_FUDGE_FACTOR +# If a single fallback's weight is too low, it's pointless adding it. +# (Final weights may be slightly higher than this, due to low weight relays +# being excluded.) +# A relay weighted at 1 in 1000 fallbacks will see about 0.1% of clients. +MIN_WEIGHT_FRACTION = 0.0 if OUTPUT_CANDIDATES else 1/1000.0 + +## Other Configuration Parameters + +# older entries' weights are adjusted with ALPHA^(age in days) +AGE_ALPHA = 0.99 + +# this factor is used to scale OnionOO entries to [0,1] +ONIONOO_SCALE_ONE = 999.
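+ +# As a worked example of the two constants above (illustrative only): with +# AGE_ALPHA = 0.99, a measurement 30 days old is weighted at 0.99**30, about +# 74%, of an equally long measurement taken today; and a raw OnionOO value of +# 999 scales to 999 / ONIONOO_SCALE_ONE = 1.0.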
+ +## Parsing Functions + +def parse_ts(t): + return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") + +def remove_bad_chars(raw_string, bad_char_list): + # Remove each character in the bad_char_list + escaped_string = raw_string + for c in bad_char_list: + escaped_string = escaped_string.replace(c, '') + return escaped_string + +def cleanse_whitespace(raw_string): + # Replace all whitespace characters with a space + escaped_string = raw_string + for c in string.whitespace: + escaped_string = escaped_string.replace(c, ' ') + return escaped_string + +def cleanse_c_multiline_comment(raw_string): + # Prevent a malicious / unanticipated string from breaking out + # of a C-style multiline comment + # This removes '/*' and '*/' + # To deal with '//', the end comment must be on its own line + bad_char_list = '*' + # Prevent a malicious string from using C nulls + bad_char_list += '\0' + # Be safer by removing bad characters entirely + escaped_string = remove_bad_chars(raw_string, bad_char_list) + # Embedded newlines should be removed by tor/onionoo, but let's be paranoid + escaped_string = cleanse_whitespace(escaped_string) + # Some compilers may further process the content of comments + # There isn't much we can do to cover every possible case + # But comment-based directives are typically only advisory + return escaped_string + +def cleanse_c_string(raw_string): + # Prevent a malicious address/fingerprint string from breaking out + # of a C-style string + bad_char_list = '"' + # Prevent a malicious string from using escapes + bad_char_list += '\\' + # Prevent a malicious string from using C nulls + bad_char_list += '\0' + # Be safer by removing bad characters entirely + escaped_string = remove_bad_chars(raw_string, bad_char_list) + # Embedded newlines should be removed by tor/onionoo, but let's be paranoid + escaped_string = cleanse_whitespace(escaped_string) + # Some compilers may further process the content of strings + # There isn't much we can do to cover every possible case + # But this typically only results in changes to the string data + return escaped_string + +## OnionOO Source Functions + +# a dictionary of source metadata for each onionoo query we've made +fetch_source = {} + +# register source metadata for 'what' +# assumes we only retrieve one document for each 'what' +def register_fetch_source(what, url, relays_published, version): + fetch_source[what] = {} + fetch_source[what]['url'] = url + fetch_source[what]['relays_published'] = relays_published + fetch_source[what]['version'] = version + +# list each registered source's 'what' +def fetch_source_list(): + return sorted(fetch_source.keys()) + +# given 'what', provide a multiline C comment describing the source +def describe_fetch_source(what): + desc = '/*' + desc += '\n' + desc += 'Onionoo Source: ' + desc += cleanse_c_multiline_comment(what) + desc += ' Date: ' + desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published']) + desc += ' Version: ' + desc += cleanse_c_multiline_comment(fetch_source[what]['version']) + desc += '\n' + desc += 'URL: ' + desc += cleanse_c_multiline_comment(fetch_source[what]['url']) + desc += '\n' + desc += '*/' + return desc + +## File Processing Functions + +def write_to_file(str, file_name, max_len): + try: + with open(file_name, 'w') as f: + f.write(str[0:max_len]) + except EnvironmentError, error: + logging.debug('Writing file %s failed: %d: %s'% + (file_name, + error.errno, + error.strerror) + ) + +def read_from_file(file_name, max_len): + try: + if 
os.path.isfile(file_name): + with open(file_name, 'r') as f: + return f.read(max_len) + except EnvironmentError, error: + logging.debug('Loading file %s failed: %d: %s'% + (file_name, + error.errno, + error.strerror) + ) + return None + +def load_possibly_compressed_response_json(response): + if response.info().get('Content-Encoding') == 'gzip': + buf = StringIO.StringIO( response.read() ) + f = gzip.GzipFile(fileobj=buf) + return json.load(f) + else: + return json.load(response) + +def load_json_from_file(json_file_name): + # An exception here may be resolved by deleting the .last_modified + # and .json files, and re-running the script + try: + with open(json_file_name, 'r') as f: + return json.load(f) + except EnvironmentError, error: + raise Exception('Reading not-modified json file %s failed: %d: %s'% + (json_file_name, + error.errno, + error.strerror) + ) + +## OnionOO Functions + +def onionoo_fetch(what, **kwargs): + params = kwargs + params['type'] = 'relay' + #params['limit'] = 10 + params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS,) + params['last_seen_days'] = '-7' + params['flag'] = 'V2Dir' + url = ONIONOO + what + '?' + urllib.urlencode(params) + + # Unfortunately, the URL is too long for some OS filenames, + # but we still don't want to get files from different URLs mixed up + base_file_name = what + '-' + hashlib.sha1(url).hexdigest() + + full_url_file_name = base_file_name + '.full_url' + MAX_FULL_URL_LENGTH = 1024 + + last_modified_file_name = base_file_name + '.last_modified' + MAX_LAST_MODIFIED_LENGTH = 64 + + json_file_name = base_file_name + '.json' + + if LOCAL_FILES_ONLY: + # Read from the local file, don't write to anything + response_json = load_json_from_file(json_file_name) + else: + # store the full URL to a file for debugging + # no need to compare as long as you trust SHA-1 + write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH) + + request = urllib2.Request(url) + request.add_header('Accept-encoding', 'gzip') + + # load the last modified date from the file, if it exists + last_mod_date = read_from_file(last_modified_file_name, + MAX_LAST_MODIFIED_LENGTH) + if last_mod_date is not None: + request.add_header('If-modified-since', last_mod_date) + + # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT + if last_mod_date is not None: + last_mod = dateutil.parser.parse(last_mod_date) + else: + # Never modified - use start of epoch + last_mod = datetime.datetime.utcfromtimestamp(0) + # strip any timezone out (in case they're supported in future) + last_mod = last_mod.replace(tzinfo=None) + + response_code = 0 + try: + response = urllib2.urlopen(request) + response_code = response.getcode() + except urllib2.HTTPError, error: + response_code = error.code + # strip any timezone out (to match dateutil.parser) + six_hours_ago = datetime.datetime.utcnow() + six_hours_ago = six_hours_ago.replace(tzinfo=None) + six_hours_ago -= datetime.timedelta(hours=6) + # Not Modified and still recent enough to be useful (Globe uses 6 hours) + if response_code == 304: + if last_mod < six_hours_ago: + raise Exception("Outdated data from " + url + ": " + + str(error.code) + ": " + error.reason) + else: + pass + else: + raise Exception("Could not get " + url + ": " + + str(error.code) + ": " + error.reason) + + if response_code == 200: # OK + + response_json = load_possibly_compressed_response_json(response) + + with open(json_file_name, 'w') as f: + # use the most compact json representation to save space + json.dump(response_json, f, separators=(',',':')) + + # store 
the last modified date in its own file + if response.info().get('Last-modified') is not None: + write_to_file(response.info().get('Last-Modified'), + last_modified_file_name, + MAX_LAST_MODIFIED_LENGTH) + + elif response_code == 304: # Not Modified + + response_json = load_json_from_file(json_file_name) + + else: # Unexpected HTTP response code not covered in the HTTPError above + raise Exception("Unexpected HTTP response code to " + url + ": " + + str(response_code)) + + register_fetch_source(what, + url, + response_json['relays_published'], + response_json['version']) + + return response_json + +def fetch(what, **kwargs): + #x = onionoo_fetch(what, **kwargs) + # don't use sort_keys, as the order of or_addresses is significant + #print json.dumps(x, indent=4, separators=(',', ': ')) + #sys.exit(0) + + return onionoo_fetch(what, **kwargs) + +## Fallback Candidate Class + +class Candidate(object): + CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.now() + - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS)) + + def __init__(self, details): + for f in ['fingerprint', 'nickname', 'last_changed_address_or_port', + 'consensus_weight', 'or_addresses', 'dir_address']: + if not f in details: raise Exception("Document has no %s field."%(f,)) + + if not 'contact' in details: + details['contact'] = None + if not 'flags' in details or details['flags'] is None: + details['flags'] = [] + details['last_changed_address_or_port'] = parse_ts( + details['last_changed_address_or_port']) + self._data = details + self._stable_sort_or_addresses() + + self._fpr = self._data['fingerprint'] + self._running = self._guard = self._v2dir = 0. + self._split_dirport() + self._compute_orport() + if self.orport is None: + raise Exception("Failed to get an orport for %s."%(self._fpr,)) + self._compute_ipv6addr() + if self.ipv6addr is None: + logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,)) + # Reduce the weight of exits to EXIT_WEIGHT_FRACTION * consensus_weight + if self.is_exit(): + current_weight = self._data['consensus_weight'] + exit_weight = current_weight * EXIT_WEIGHT_FRACTION + self._data['original_consensus_weight'] = current_weight + self._data['consensus_weight'] = exit_weight + + def _stable_sort_or_addresses(self): + # replace self._data['or_addresses'] with a stable ordering, + # sorting the secondary addresses in string order + # leave the received order in self._data['or_addresses_raw'] + self._data['or_addresses_raw'] = self._data['or_addresses'] + or_address_primary = self._data['or_addresses'][:1] + # subsequent entries in the or_addresses array are in an arbitrary order + # so we stabilise the addresses by sorting them in string order + or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:]) + or_addresses_stable = or_address_primary + or_addresses_secondaries_stable + self._data['or_addresses'] = or_addresses_stable + + def get_fingerprint(self): + return self._fpr + + # is_valid_ipv[46]_address by gsathya, karsten, 2013 + @staticmethod + def is_valid_ipv4_address(address): + if not isinstance(address, (str, unicode)): + return False + + # check that there are four period-separated values + if address.count(".") != 3: + return False + + # check that each octet is a decimal value between 0 and 255 + for entry in address.split("."): + if not entry.isdigit() or int(entry) < 0 or int(entry) > 255: + return False + elif entry[0] == "0" and len(entry) > 1: + return False # leading zeros, for instance in "1.2.3.001" + + return True + + @staticmethod + def
is_valid_ipv6_address(address): + if not isinstance(address, (str, unicode)): + return False + + # remove brackets + address = address[1:-1] + + # addresses are made up of eight colon separated groups of four hex digits + # with leading zeros being optional + # https://en.wikipedia.org/wiki/IPv6#Address_format + + colon_count = address.count(":") + + if colon_count > 7: + return False # too many groups + elif colon_count != 7 and not "::" in address: + return False # not enough groups and none are collapsed + elif address.count("::") > 1 or ":::" in address: + return False # multiple groupings of zeros can't be collapsed + + found_ipv4_on_previous_entry = False + for entry in address.split(":"): + # If an IPv6 address has an embedded IPv4 address, + # it must be the last entry + if found_ipv4_on_previous_entry: + return False + if not re.match("^[0-9a-fA-F]{0,4}$", entry): + if not Candidate.is_valid_ipv4_address(entry): + return False + else: + found_ipv4_on_previous_entry = True + + return True + + def _split_dirport(self): + # Split the dir_address into dirip and dirport + (self.dirip, _dirport) = self._data['dir_address'].split(':', 2) + self.dirport = int(_dirport) + + def _compute_orport(self): + # Choose the first ORPort that's on the same IPv4 address as the DirPort. + # In rare circumstances, this might not be the primary ORPort address. + # However, _stable_sort_or_addresses() ensures we choose the same one + # every time, even if onionoo changes the order of the secondaries. + self._split_dirport() + self.orport = None + for i in self._data['or_addresses']: + if i != self._data['or_addresses'][0]: + logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i)) + (ipaddr, port) = i.rsplit(':', 1) + if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr): + self.orport = int(port) + return + + def _compute_ipv6addr(self): + # Choose the first IPv6 address that uses the same port as the ORPort + # Or, choose the first IPv6 address in the list + # _stable_sort_or_addresses() ensures we choose the same IPv6 address + # every time, even if onionoo changes the order of the secondaries. + self.ipv6addr = None + self.ipv6orport = None + # Choose the first IPv6 address that uses the same port as the ORPort + # (or_addresses ports are strings, so compare them numerically) + for i in self._data['or_addresses']: + (ipaddr, port) = i.rsplit(':', 1) + if (int(port) == self.orport) and Candidate.is_valid_ipv6_address(ipaddr): + self.ipv6addr = ipaddr + self.ipv6orport = port + return + # Choose the first IPv6 address in the list + for i in self._data['or_addresses']: + (ipaddr, port) = i.rsplit(':', 1) + if Candidate.is_valid_ipv6_address(ipaddr): + self.ipv6addr = ipaddr + self.ipv6orport = port + return + + @staticmethod + def _extract_generic_history(history, which='unknown'): + # given a tree like this: + # { + # "1_month": { + # "count": 187, + # "factor": 0.001001001001001001, + # "first": "2015-02-27 06:00:00", + # "interval": 14400, + # "last": "2015-03-30 06:00:00", + # "values": [ + # 999, + # 999 + # ] + # }, + # "1_week": { + # "count": 169, + # "factor": 0.001001001001001001, + # "first": "2015-03-23 07:30:00", + # "interval": 3600, + # "last": "2015-03-30 07:30:00", + # "values": [ ...] + # }, + # "1_year": { + # "count": 177, + # "factor": 0.001001001001001001, + # "first": "2014-04-11 00:00:00", + # "interval": 172800, + # "last": "2015-03-29 00:00:00", + # "values": [ ...]
+ # }, + # "3_months": { + # "count": 185, + # "factor": 0.001001001001001001, + # "first": "2014-12-28 06:00:00", + # "interval": 43200, + # "last": "2015-03-30 06:00:00", + # "values": [ ...] + # } + # }, + # extract exactly one piece of data per time interval, + # using smaller intervals where available. + # + # returns list of (age, length, value) dictionaries. + + generic_history = [] + + periods = history.keys() + periods.sort(key = lambda x: history[x]['interval']) + now = datetime.datetime.now() + newest = now + for p in periods: + h = history[p] + interval = datetime.timedelta(seconds = h['interval']) + this_ts = parse_ts(h['last']) + + if (len(h['values']) != h['count']): + logging.warn('Inconsistent value count in %s document for %s' + %(p, which)) + for v in reversed(h['values']): + if (this_ts <= newest): + generic_history.append( + { 'age': (now - this_ts).total_seconds(), + 'length': interval.total_seconds(), + 'value': v + }) + newest = this_ts + this_ts -= interval + + if (this_ts + interval != parse_ts(h['first'])): + logging.warn('Inconsistent time information in %s document for %s' + %(p, which)) + + #print json.dumps(generic_history, sort_keys=True, + # indent=4, separators=(',', ': ')) + return generic_history + + @staticmethod + def _avg_generic_history(generic_history): + a = [] + for i in generic_history: + if (i['length'] is not None + and i['age'] is not None + and i['value'] is not None): + w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24)) + a.append( (i['value'] * w, w) ) + + sv = math.fsum(map(lambda x: x[0], a)) + sw = math.fsum(map(lambda x: x[1], a)) + + return sv/sw + + def _add_generic_history(self, history): + periods = history.keys() + periods.sort(key = lambda x: history[x]['interval'] ) + + print periods + + def add_running_history(self, history): + pass + + def add_uptime(self, uptime): + logging.debug('Adding uptime %s.'%(self._fpr,)) + + # flags we care about: Running, V2Dir, Guard + if not 'flags' in uptime: + logging.debug('No flags in document for %s.'%(self._fpr,)) + return + + for f in ['Running', 'Guard', 'V2Dir']: + if not f in uptime['flags']: + logging.debug('No %s in flags for %s.'%(f, self._fpr,)) + return + + running = self._extract_generic_history(uptime['flags']['Running'], + '%s-Running'%(self._fpr)) + guard = self._extract_generic_history(uptime['flags']['Guard'], + '%s-Guard'%(self._fpr)) + v2dir = self._extract_generic_history(uptime['flags']['V2Dir'], + '%s-V2Dir'%(self._fpr)) + if 'BadExit' in uptime['flags']: + badexit = self._extract_generic_history(uptime['flags']['BadExit'], + '%s-BadExit'%(self._fpr)) + + self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE + self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE + self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE + self._badexit = None + if 'BadExit' in uptime['flags']: + self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE + + def is_candidate(self): + if (self._data['last_changed_address_or_port'] > + self.CUTOFF_ADDRESS_AND_PORT_STABLE): + logging.debug('%s not a candidate: changed address/port recently (%s)', + self._fpr, self._data['last_changed_address_or_port']) + return False + if self._running < CUTOFF_RUNNING: + logging.debug('%s not a candidate: running avg too low (%lf)', + self._fpr, self._running) + return False + if self._guard < CUTOFF_GUARD: + logging.debug('%s not a candidate: guard avg too low (%lf)', + self._fpr, self._guard) + return False + if self._v2dir <
CUTOFF_V2DIR: + logging.debug('%s not a candidate: v2dir avg too low (%lf)', + self._fpr, self._v2dir) + return False + if self._badexit is not None and self._badexit > PERMITTED_BADEXIT: + logging.debug('%s not a candidate: badexit avg too high (%lf)', + self._fpr, self._badexit) + return False + # if the relay doesn't report a version, also exclude the relay + if (not self._data.has_key('recommended_version') + or not self._data['recommended_version']): + return False + return True + + def is_in_whitelist(self, relaylist): + """ A fallback matches if each key in the whitelist line matches: + ipv4 + dirport + orport + id + ipv6 address and port (if present) + If the fallback has an ipv6 key, the whitelist line must also have + it, and vice versa, otherwise they don't match. """ + for entry in relaylist: + if entry['ipv4'] != self.dirip: + continue + if int(entry['dirport']) != self.dirport: + continue + if int(entry['orport']) != self.orport: + continue + if entry['id'] != self._fpr: + continue + if (entry.has_key('ipv6') + and self.ipv6addr is not None and self.ipv6orport is not None): + # if both entry and fallback have an ipv6 address, compare them + if entry['ipv6'] != self.ipv6addr + ':' + self.ipv6orport: + continue + # if the fallback has an IPv6 address but the whitelist entry + # doesn't, or vice versa, the whitelist entry doesn't match + elif entry.has_key('ipv6') and self.ipv6addr is None: + continue + elif not entry.has_key('ipv6') and self.ipv6addr is not None: + continue + return True + return False + + def is_in_blacklist(self, relaylist): + """ A fallback matches a blacklist line if a sufficiently specific group + of attributes matches: + ipv4 & dirport + ipv4 & orport + id + ipv6 & dirport + ipv6 & ipv6 orport + If the fallback and the blacklist line both have an ipv6 key, + their values will be compared, otherwise, they will be ignored. + If there is no dirport and no orport, the entry matches all relays on + that ip. 
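For example (hypothetical, using a documentation address): a line containing only 192.0.2.1 matches every relay at that IPv4 address, whatever its DirPort or ORPort.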
""" + for entry in relaylist: + for key in entry: + value = entry[key] + if key == 'ipv4' and value == self.dirip: + # if the dirport is present, check it too + if entry.has_key('dirport'): + if int(entry['dirport']) == self.dirport: + return True + # if the orport is present, check it too + elif entry.has_key('orport'): + if int(entry['orport']) == self.orport: + return True + else: + return True + if key == 'id' and value == self._fpr: + return True + if (key == 'ipv6' + and self.ipv6addr is not None and self.ipv6orport is not None): + # if both entry and fallback have an ipv6 address, compare them, + # otherwise, disregard ipv6 addresses + if value == self.ipv6addr + ':' + self.ipv6orport: + # if the dirport is present, check it too + if entry.has_key('dirport'): + if int(entry['dirport']) == self.dirport: + return True + # if the orport is present, check it too + elif entry.has_key('orport'): + if int(entry['orport']) == self.orport: + return True + else: + return True + return False + + def is_exit(self): + return 'Exit' in self._data['flags'] + + def is_guard(self): + return 'Guard' in self._data['flags'] + + def fallback_weight_fraction(self, total_weight): + return float(self._data['consensus_weight']) / total_weight + + # return the original consensus weight, if it exists, + # or, if not, return the consensus weight + def original_consensus_weight(self): + if self._data.has_key('original_consensus_weight'): + return self._data['original_consensus_weight'] + else: + return self._data['consensus_weight'] + + def original_fallback_weight_fraction(self, total_weight): + return float(self.original_consensus_weight()) / total_weight + + def fallbackdir_line(self, total_weight, original_total_weight): + # /* + # nickname + # flags + # weight / total (percentage) + # [original weight / original total (original percentage)] + # [contact] + # */ + # "address:dirport orport=port id=fingerprint" + # "[ipv6=addr:orport]" + # "weight=num", + # Multiline C comment + s = '/*' + s += '\n' + s += cleanse_c_multiline_comment(self._data['nickname']) + s += '\n' + s += 'Flags: ' + s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags']))) + s += '\n' + weight = self._data['consensus_weight'] + percent_weight = self.fallback_weight_fraction(total_weight)*100 + s += 'Fallback Weight: %d / %d (%.3f%%)'%(weight, total_weight, + percent_weight) + s += '\n' + o_weight = self.original_consensus_weight() + if o_weight != weight: + o_percent_weight = self.original_fallback_weight_fraction( + original_total_weight)*100 + s += 'Consensus Weight: %d / %d (%.3f%%)'%(o_weight, + original_total_weight, + o_percent_weight) + s += '\n' + if self._data['contact'] is not None: + s += cleanse_c_multiline_comment(self._data['contact']) + s += '\n' + s += '*/' + s += '\n' + # Multi-Line C string with trailing comma (part of a string list) + # This makes it easier to diff the file, and remove IPv6 lines using grep + # Integers don't need escaping + s += '"%s orport=%d id=%s"'%( + cleanse_c_string(self._data['dir_address']), + self.orport, + cleanse_c_string(self._fpr)) + s += '\n' + if self.ipv6addr is not None: + s += '" ipv6=%s:%s"'%( + cleanse_c_string(self.ipv6addr), cleanse_c_string(self.ipv6orport)) + s += '\n' + s += '" weight=%d",'%(weight) + return s + +## Fallback Candidate List Class + +class CandidateList(dict): + def __init__(self): + pass + + def _add_relay(self, details): + if not 'dir_address' in details: return + c = Candidate(details) + self[ c.get_fingerprint() ] = c + + def _add_uptime(self, 
uptime): + try: + fpr = uptime['fingerprint'] + except KeyError: + raise Exception("Document has no fingerprint field.") + + try: + c = self[fpr] + except KeyError: + logging.debug('Got unknown relay %s in uptime document.'%(fpr,)) + return + + c.add_uptime(uptime) + + def _add_details(self): + logging.debug('Loading details document.') + d = fetch('details', + fields=('fingerprint,nickname,contact,last_changed_address_or_port,' + + 'consensus_weight,or_addresses,dir_address,' + + 'recommended_version,flags')) + logging.debug('Loading details document done.') + + if not 'relays' in d: raise Exception("No relays found in document.") + + for r in d['relays']: self._add_relay(r) + + def _add_uptimes(self): + logging.debug('Loading uptime document.') + d = fetch('uptime') + logging.debug('Loading uptime document done.') + + if not 'relays' in d: raise Exception("No relays found in document.") + for r in d['relays']: self._add_uptime(r) + + def add_relays(self): + self._add_details() + self._add_uptimes() + + def count_guards(self): + guard_count = 0 + for fpr in self.keys(): + if self[fpr].is_guard(): + guard_count += 1 + return guard_count + + # Find fallbacks that fit the uptime, stability, and flags criteria + def compute_fallbacks(self): + self.fallbacks = map(lambda x: self[x], + sorted( + filter(lambda x: self[x].is_candidate(), + self.keys()), + key=lambda x: self[x]._data['consensus_weight'], + reverse=True) + ) + + @staticmethod + def load_relaylist(file_name): + """ Read each line in the file, and parse it like a FallbackDir line: + an IPv4 address and optional port: + <IPv4>:<port> + which are parsed into dictionary entries: + ipv4=<IPv4> + dirport=<port> + followed by a series of key=value entries: + orport=<port> + id=<fingerprint> + ipv6=<IPv6>:<IPv6 port> + each line's key/value pairs are placed in a dictionary, + (of string -> string key/value pairs), + and these dictionaries are placed in an array.
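For example (hypothetical values): the line '192.0.2.1:80 orport=443' is parsed into {'ipv4': '192.0.2.1', 'dirport': '80', 'orport': '443'}.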
+ comments start with # and are ignored """ + relaylist = [] + file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE) + if file_data is None: + return relaylist + for line in file_data.split('\n'): + relay_entry = {} + # ignore comments + line_comment_split = line.split('#') + line = line_comment_split[0] + # cleanup whitespace + line = cleanse_whitespace(line) + line = line.strip() + if len(line) == 0: + continue + for item in line.split(' '): + item = item.strip() + if len(item) == 0: + continue + key_value_split = item.split('=') + kvl = len(key_value_split) + if kvl < 1 or kvl > 2: + print '#error Bad %s item: %s, format is key=value.'%( + file_name, item) + if kvl == 1: + # assume that entries without a key are the ipv4 address, + # perhaps with a dirport + ipv4_maybe_dirport = key_value_split[0] + ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':') + dirl = len(ipv4_maybe_dirport_split) + if dirl < 1 or dirl > 2: + print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%( + file_name, item) + if dirl >= 1: + relay_entry['ipv4'] = ipv4_maybe_dirport_split[0] + if dirl == 2: + relay_entry['dirport'] = ipv4_maybe_dirport_split[1] + elif kvl == 2: + relay_entry[key_value_split[0]] = key_value_split[1] + relaylist.append(relay_entry) + return relaylist + + # apply the fallback whitelist and blacklist + def apply_filter_lists(self): + excluded_count = 0 + logging.debug('Applying whitelist and blacklist.') + # parse the whitelist and blacklist + whitelist = self.load_relaylist(WHITELIST_FILE_NAME) + blacklist = self.load_relaylist(BLACKLIST_FILE_NAME) + filtered_fallbacks = [] + for f in self.fallbacks: + in_whitelist = f.is_in_whitelist(whitelist) + in_blacklist = f.is_in_blacklist(blacklist) + if in_whitelist and in_blacklist: + if BLACKLIST_EXCLUDES_WHITELIST_ENTRIES: + # exclude + excluded_count += 1 + logging.debug('Excluding %s: in both blacklist and whitelist.' % + f._fpr) + else: + # include + filtered_fallbacks.append(f) + elif in_whitelist: + # include + filtered_fallbacks.append(f) + elif in_blacklist: + # exclude + excluded_count += 1 + logging.debug('Excluding %s: in blacklist.' % + f._fpr) + else: + if INCLUDE_UNLISTED_ENTRIES: + # include + filtered_fallbacks.append(f) + else: + # exclude + excluded_count += 1 + logging.debug('Excluding %s: in neither blacklist nor whitelist.' % + f._fpr) + self.fallbacks = filtered_fallbacks + return excluded_count + + @staticmethod + def summarise_filters(initial_count, excluded_count): + return '/* Whitelist & blacklist excluded %d of %d candidates. 
*/'%( + excluded_count, initial_count) + + # Remove any fallbacks in excess of MAX_FALLBACK_COUNT, + # starting with the lowest-weighted fallbacks + # total_weight should be recalculated after calling this + def exclude_excess_fallbacks(self): + self.fallbacks = self.fallbacks[:MAX_FALLBACK_COUNT] + + # Clamp the weight of all fallbacks to MAX_WEIGHT_FRACTION * total_weight + # fallbacks are kept sorted, but since excessive weights are reduced to + # the maximum acceptable weight, these relays end up with equal weights + def clamp_high_weight_fallbacks(self, total_weight): + if MAX_WEIGHT_FRACTION * len(self.fallbacks) < 1.0: + error_str = 'Max Fallback Weight %.3f%% is unachievable'%( + MAX_WEIGHT_FRACTION*100) + error_str += ' with Current Fallback Count %d.'%(len(self.fallbacks)) + if STRICT_FALLBACK_WEIGHTS: + print '#error ' + error_str + else: + print '/* ' + error_str + ' */' + relays_clamped = 0 + max_acceptable_weight = total_weight * MAX_WEIGHT_FRACTION + for f in self.fallbacks: + frac_weight = f.fallback_weight_fraction(total_weight) + if frac_weight > MAX_WEIGHT_FRACTION: + relays_clamped += 1 + current_weight = f._data['consensus_weight'] + # if we already have an original weight, keep it + if (not f._data.has_key('original_consensus_weight') + or f._data['original_consensus_weight'] == current_weight): + f._data['original_consensus_weight'] = current_weight + f._data['consensus_weight'] = max_acceptable_weight + return relays_clamped + + # Remove any fallbacks with weights lower than MIN_WEIGHT_FRACTION + # total_weight should be recalculated after calling this + def exclude_low_weight_fallbacks(self, total_weight): + self.fallbacks = filter( + lambda x: + x.fallback_weight_fraction(total_weight) >= MIN_WEIGHT_FRACTION, + self.fallbacks) + + def fallback_weight_total(self): + return sum(f._data['consensus_weight'] for f in self.fallbacks) + + def fallback_min_weight(self): + if len(self.fallbacks) > 0: + return self.fallbacks[-1] + else: + return None + + def fallback_max_weight(self): + if len(self.fallbacks) > 0: + return self.fallbacks[0] + else: + return None + + def summarise_fallbacks(self, eligible_count, eligible_weight, + relays_clamped, clamped_weight, + guard_count, target_count, max_count): + # Report: + # the number of fallback directories (with min & max limits); + # #error if below minimum count + # the total weight, min & max fallback proportions + # #error if outside max weight proportion + # Multiline C comment with #error if things go bad + s = '/*' + s += '\n' + s += 'Fallback Directory Summary' + s += '\n' + # Integers don't need escaping in C comments + fallback_count = len(self.fallbacks) + if FALLBACK_PROPORTION_OF_GUARDS is None: + fallback_proportion = '' + else: + fallback_proportion = ' (%d * %f)'%(guard_count, + FALLBACK_PROPORTION_OF_GUARDS) + s += 'Final Count: %d (Eligible %d, Usable %d, Target %d%s, '%( + min(max_count, fallback_count), + eligible_count, + fallback_count, + target_count, + fallback_proportion) + s += 'Clamped to %d)'%( + MAX_FALLBACK_COUNT) + s += '\n' + if fallback_count < MIN_FALLBACK_COUNT: + s += '*/' + s += '\n' + # We must have a minimum number of fallbacks so they are always + # reachable, and are in diverse locations + s += '#error Fallback Count %d is too low. '%(fallback_count) + s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT) + s += 'Try adding entries to the whitelist, ' + s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
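+ # the '*/' emitted above closes the multiline comment so the C compiler + # sees the #error; reopen a comment for the rest of the summary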
+ s += '\n' + s += '/*' + s += '\n' + total_weight = self.fallback_weight_total() + min_fb = self.fallback_min_weight() + min_weight = min_fb._data['consensus_weight'] + min_percent = min_fb.fallback_weight_fraction(total_weight)*100.0 + max_fb = self.fallback_max_weight() + max_weight = max_fb._data['consensus_weight'] + max_frac = max_fb.fallback_weight_fraction(total_weight) + max_percent = max_frac*100.0 + s += 'Final Weight: %d (Eligible %d)'%(total_weight, eligible_weight) + s += '\n' + s += 'Max Weight: %d (%.3f%%) (Clamped to %.3f%%)'%( + max_weight, + max_percent, + TARGET_MAX_WEIGHT_FRACTION*100) + s += '\n' + s += 'Min Weight: %d (%.3f%%) (Clamped to %.3f%%)'%( + min_weight, + min_percent, + MIN_WEIGHT_FRACTION*100) + s += '\n' + if eligible_count != fallback_count: + s += 'Excluded: %d (Clamped, Below Target, or Low Weight)'%( + eligible_count - fallback_count) + s += '\n' + if relays_clamped > 0: + s += 'Clamped: %d (%.3f%%) Excess Weight, '%( + clamped_weight, + (100.0 * clamped_weight) / total_weight) + s += '%d High Weight Fallbacks (%.1f%%)'%( + relays_clamped, + (100.0 * relays_clamped) / fallback_count) + s += '\n' + s += '*/' + if max_frac > TARGET_MAX_WEIGHT_FRACTION: + s += '\n' + # We must restrict the maximum fallback weight, so an adversary + # at or near the fallback doesn't see too many clients + error_str = 'Max Fallback Weight %.3f%% is too high. '%(max_frac*100) + error_str += 'Must be at most %.3f%% for client anonymity.'%( + TARGET_MAX_WEIGHT_FRACTION*100) + if STRICT_FALLBACK_WEIGHTS: + s += '#error ' + error_str + else: + s += '/* ' + error_str + ' */' + return s + +## Main Function + +def list_fallbacks(): + """ Fetches required onionoo documents and evaluates the + fallback directory criteria for each of the relays """ + + candidates = CandidateList() + candidates.add_relays() + + guard_count = candidates.count_guards() + if FALLBACK_PROPORTION_OF_GUARDS is None: + target_count = MAX_FALLBACK_COUNT + else: + target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS) + # the maximum number of fallbacks is the least of: + # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count) + # - the maximum fallback count (MAX_FALLBACK_COUNT) + max_count = min(target_count, MAX_FALLBACK_COUNT) + + candidates.compute_fallbacks() + + initial_count = len(candidates.fallbacks) + excluded_count = candidates.apply_filter_lists() + print candidates.summarise_filters(initial_count, excluded_count) + + eligible_count = len(candidates.fallbacks) + eligible_weight = candidates.fallback_weight_total() + + # print the raw fallback list + #total_weight = candidates.fallback_weight_total() + #for x in candidates.fallbacks: + # print x.fallbackdir_line(total_weight, total_weight) + + # When candidates are excluded, total_weight decreases, and + # the proportional weight of other candidates increases. + candidates.exclude_excess_fallbacks() + total_weight = candidates.fallback_weight_total() + + # When candidates are reweighted, total_weight decreases, and + # the proportional weight of other candidates increases. + # Previously low-weight candidates might obtain sufficient proportional + # weights to be included. + # Save the weight at which we reweighted fallbacks for the summary. + pre_clamp_total_weight = total_weight + relays_clamped = candidates.clamp_high_weight_fallbacks(total_weight) + + # When candidates are excluded, total_weight decreases, and + # the proportional weight of other candidates increases. 
+ # No new low weight candidates will be created during exclusions. + # However, high weight candidates may increase over the maximum proportion. + # This should not be an issue, except in pathological cases. + candidates.exclude_low_weight_fallbacks(total_weight) + total_weight = candidates.fallback_weight_total() + + # check we haven't exceeded TARGET_MAX_WEIGHT_FRACTION + # since reweighting preserves the original sort order, + # the maximum weights will be at the head of the list + if len(candidates.fallbacks) > 0: + max_weight_fb = candidates.fallback_max_weight() + max_weight = max_weight_fb.fallback_weight_fraction(total_weight) + if max_weight > TARGET_MAX_WEIGHT_FRACTION: + error_str = 'Maximum fallback weight: %.3f%% exceeds target %.3f%%. '%( + max_weight*100, + TARGET_MAX_WEIGHT_FRACTION*100) + error_str += 'Try decreasing REWEIGHTING_FUDGE_FACTOR.' + if STRICT_FALLBACK_WEIGHTS: + print '#error ' + error_str + else: + print '/* ' + error_str + ' */' + + print candidates.summarise_fallbacks(eligible_count, eligible_weight, + relays_clamped, + pre_clamp_total_weight - total_weight, + guard_count, target_count, max_count) + else: + print '/* No Fallbacks met criteria */' + + for s in fetch_source_list(): + print describe_fetch_source(s) + + for x in candidates.fallbacks[:max_count]: + print x.fallbackdir_line(total_weight, pre_clamp_total_weight) + #print json.dumps(candidates[x]._data, sort_keys=True, indent=4, + # separators=(',', ': '), default=json_util.default) + +if __name__ == "__main__": + list_fallbacks() diff --git a/src/or/config.c b/src/or/config.c index 9b570323d..9ec47d245 100644 --- a/src/or/config.c +++ b/src/or/config.c @@ -960,6 +960,7 @@ add_default_fallback_dir_servers,(void)) { int i; const char *fallback[] = { +#include "fallback_dirs.inc" NULL }; for (i=0; fallback[i]; i++) { diff --git a/src/or/fallback_dirs.inc b/src/or/fallback_dirs.inc new file mode 100644 index 000000000..d9214cb88 --- /dev/null +++ b/src/or/fallback_dirs.inc @@ -0,0 +1 @@ +/* This list will be empty until opt-ins are finalised. */ diff --git a/src/or/include.am b/src/or/include.am index 1180239c8..5ec96e5a9 100644 --- a/src/or/include.am +++ b/src/or/include.am @@ -155,6 +155,7 @@ ORHEADERS = \ src/or/dnsserv.h \ src/or/eventdns_tor.h \ src/or/ext_orport.h \ + src/or/fallback_dirs.inc \ src/or/fp_pair.h \ src/or/geoip.h \ src/or/entrynodes.h \ diff --git a/src/or/routerlist.c b/src/or/routerlist.c index 28b5eb118..051bac5de 100644 --- a/src/or/routerlist.c +++ b/src/or/routerlist.c @@ -1299,8 +1299,8 @@ router_get_fallback_dir_servers(void) /** Try to find a running dirserver that supports operations of <b>type</b>. * * If there are no running dirservers in our routerlist and the - * PDS_RETRY_IF_NO_SERVERS flag is set, set all the authoritative ones - * as running again, and pick one. + * PDS_RETRY_IF_NO_SERVERS flag is set, set all the fallback ones + * (including authorities) as running again, and pick one. * * If the PDS_IGNORE_FASCISTFIREWALL flag is set, then include * dirservers that we can't reach. @@ -1308,8 +1308,9 @@ router_get_fallback_dir_servers(void) * If the PDS_ALLOW_SELF flag is not set, then don't include ourself * (if we're a dirserver). * - * Don't pick an authority if any non-authority is viable; try to avoid using - * servers that have returned 503 recently. + * Don't pick a fallback directory mirror (the fallback directory mirrors + * include the authorities) if any non-fallback is viable; try to avoid + * using servers that have returned 503 recently.
*/ const routerstatus_t * router_pick_directory_server(dirinfo_type_t type, int flags) @@ -1336,7 +1337,7 @@ router_pick_directory_server(dirinfo_type_t type, int flags) log_info(LD_DIR, "No reachable router entries for dirservers. " "Trying them all again."); - /* mark all authdirservers as up again */ + /* mark all fallback directory mirrors as up again */ mark_all_dirservers_up(fallback_dir_servers); /* try again */ choice = router_pick_directory_server_impl(type, flags, NULL); diff --git a/src/test/test_config.c b/src/test/test_config.c index 0137d1c49..53fc693a3 100644 --- a/src/test/test_config.c +++ b/src/test/test_config.c @@ -1692,9 +1692,9 @@ test_config_adding_dir_servers(void *arg) ); /* We need to know if add_default_fallback_dir_servers is called, + * whatever the size of the list in fallback_dirs.inc, * so we use a version of add_default_fallback_dir_servers that adds - * one known default fallback directory. - * There doesn't appear to be any need to test it unmocked. */ + * one known default fallback directory. */ MOCK(add_default_fallback_dir_servers, add_default_fallback_dir_servers_known_default); @@ -3492,6 +3492,33 @@ test_config_use_multiple_directories(void *arg) tor_free(options); } +static void +test_config_default_fallback_dirs(void *arg) +{ + const char *fallback[] = { +#include "../or/fallback_dirs.inc" + NULL + }; + + int n_included_fallback_dirs = 0; + int n_added_fallback_dirs = 0; + + (void)arg; + clear_dir_servers(); + + while (fallback[n_included_fallback_dirs]) + n_included_fallback_dirs++; + + add_default_fallback_dir_servers(); + + n_added_fallback_dirs = smartlist_len(router_get_fallback_dir_servers()); + + tt_assert(n_included_fallback_dirs == n_added_fallback_dirs); + + done: + clear_dir_servers(); +} + #define CONFIG_TEST(name, flags) \ { #name, test_config_ ## name, flags, NULL, NULL } @@ -3503,6 +3530,7 @@ struct testcase_t config_tests[] = { CONFIG_TEST(adding_default_trusted_dir_servers, TT_FORK), CONFIG_TEST(adding_dir_servers, TT_FORK), CONFIG_TEST(default_dir_servers, TT_FORK), + CONFIG_TEST(default_fallback_dirs, 0), CONFIG_TEST(resolve_my_address, TT_FORK), CONFIG_TEST(addressmap, 0), CONFIG_TEST(parse_bridge_line, 0),
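
As a quick sanity check on a generated fallback_dirs.inc (a hypothetical helper sketch, not part of this patch), the entry strings emitted by fallbackdir_line() can be counted and shape-checked; the count should match what the unit test above sees:

#!/usr/bin/python
# Hypothetical review helper: count the fallback entries in a generated
# fallback_dirs.inc and check their basic shape.
import re
import sys

# First line of each entry: "IPv4:DirPort orport=<ORPort> id=<fingerprint>"
ENTRY_RE = re.compile(
  r'^"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5} orport=\d{1,5} id=[0-9A-F]{40}"$')

def count_fallback_entries(file_name):
  entries = 0
  with open(file_name) as f:
    for line in f:
      if ENTRY_RE.match(line.strip()):
        entries += 1
  return entries

if __name__ == '__main__':
  # e.g. ./countFallbackDirs.py src/or/fallback_dirs.inc
  print '%d fallback entries'%(count_fallback_entries(sys.argv[1]),)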