#
# Add caching so we don't visit the web site so often
#
# The cache is a shelve database that is opened lazily on the first call
# to find_zip() and then kept in this module-level variable.
_cache = None

# Text that precedes the town name on the uszip.com result page.
_TOWN_MARKER = "is the zip code of "

# Town value reported when the page does not name a town.
_UNKNOWN_TOWN = "unknown"


def find_zip(zipcode):
    """Return town and information for "zipcode" by querying uszip.com

    The result is a 2-tuple ``(town, info)``.  ``town`` is the string
    "unknown" when no town name was found on the page; ``info`` is a
    dictionary keyed by the names listed in ``Info_list``.

    Successful lookups are stored in the "uszip_cache" shelf and served
    from there on later calls; lookups with an unknown town are not
    cached.
    """
    global _cache
    if _cache is None:
        import shelve
        _cache = shelve.open("uszip_cache")

    # shelve keys must be strings; normalizing lets integer zip codes
    # share cache entries with their string form.
    key = str(zipcode)
    try:
        return _cache[key]
    except KeyError:
        print("Not in cache")

    conn = _open_url("http://www.uszip.com/zip/%s" % key)
    try:
        town, info = _parse_page(conn)
    finally:
        # Close the connection even if parsing raises.
        conn.close()

    if town != _UNKNOWN_TOWN:
        _cache[key] = (town, info)
    return town, info


def _open_url(url):
    """Open "url" with the urlopen of whichever Python version is running"""
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    return urlopen(url)


def _parse_page(lines):
    """Scan an iterable of page lines and return (town, info)"""
    town = _UNKNOWN_TOWN
    info = dict()
    for line in lines:
        # Python 3 connections yield bytes; Python 2 yields str, which
        # is left untouched.
        if isinstance(line, bytes) and not isinstance(line, str):
            line = line.decode("utf-8", "replace")
        if _TOWN_MARKER in line:
            found = _get_town(line)
            # The marker may sit inside a tag and vanish once tags are
            # stripped; do not let that overwrite a town already found.
            if found != _UNKNOWN_TOWN:
                town = found
        else:
            _check_info(line, info)
    return town, info


def _get_town(line):
    """Extract town name if present

    Returns the text following the marker phrase in the tag-stripped
    line, or "unknown" when the phrase is absent.
    """
    _, marker, town = _strip_tags(line).partition(_TOWN_MARKER)
    if not marker:
        return _UNKNOWN_TOWN
    return town


# Info_list is a list of 2-tuples of (dictionary_key, label)
# for information available from uszip.com
Info_list = [
    ("population", "Population:"),
    ("housing", "Housing Units:"),
    ("land_area", "Land Area:"),
    ("water_area", "Water Area:"),
    ("latitude", "Latitude:"),
    ("longitude", "Longitude:"),
]


def _check_info(line, attrs):
    """Extract attribute if present

    Looks for the first label from ``Info_list`` that occurs in the
    tag-stripped line and stores the whitespace-delimited token that
    follows it in ``attrs`` under the matching key.  An empty string is
    stored when nothing follows the label.  ``attrs`` is modified in
    place; at most one attribute is taken from a line.
    """
    line = _strip_tags(line)
    for key, label in Info_list:
        position = line.find(label)
        if position < 0:
            continue
        # split() skips any amount of whitespace between the label and
        # its value, so the value is not lost when more than one
        # whitespace character separates them.
        tokens = line[position + len(label):].split(None, 1)
        attrs[key] = tokens[0] if tokens else ""
        break


def _strip_tags(line):
    """Remove HTML tags from string, leaving only "real" text

    Everything from a '<' up to and including the next '>' is dropped
    (an unclosed '<' drops the rest of the string), and leading and
    trailing whitespace is removed from the result.
    """
    keep = []
    in_tag = False
    for c in line:
        if in_tag:
            if c == '>':
                in_tag = False
        elif c == '<':
            in_tag = True
        else:
            keep.append(c)
    return ''.join(keep).strip()