from __future__ import print_function # # Function for reporting how long a function call takes # def runtime(func, *args, **kw): """Report how long the given functions takes to run. Arguments: func: function to time args: arguments to pass to function kw: keyword arguments to pass to function Returns: Return value from function """ import time # time.clock returns time elapsed since the first call to time.clock # to microsecond precision (most of the time) start = time.clock() result = func(*args, **kw) end = time.clock() try: name = func.func_name # Python 2 except AttributeError: name = func.__name__ # Python 3 print("-- %s took %0.3f us" % (name, (end - start) * 1e6)) return result # # Construct a histogram of words found in a file # import string def process_file(filename): """Read text file and return histogram of words. Arguments: filename: path to text file (string) Returns: Dictionary whose keys are words and values are counts (dict) """ h = dict() fp = open(filename) for line in fp: _process_line(line, h) fp.close() return h def _process_line(line, h): """Process one line of text and update histogram. Arguments: line: input text (string) h: histogram dictionary (dict) Returns: None """ line = line.replace('-', ' ') for word in line.split(): word = word.strip(string.punctuation + string.whitespace) word = word.lower() h[word] = h.get(word, 0) + 1 hist = process_file('bee.txt') # # Count total number of words and number of unique words # def total_words(h): """Return total number of words in histogram. Arguments: h: histogram dictionary (dict) Returns: Number of words (int) """ return sum(h.values()) def different_words(h): """Return number of different words in histogram. Arguments: h: histogram dictionary (dict) Returns: Number of words (int) """ return len(h) #count = total_words(hist) count = runtime(total_words, hist) print('Total number of words:', count) #count = different_words(hist) count = runtime(different_words, hist) print('Number of different words:', count) # # Find most commonly used words # def most_common(h): """Return histogram words ordered by number of appearances. Arguments: h: histogram dictionary (dict) Returns: List of 2-tuples of (count, word) where count is sorted from largest to smallest. (list) """ t = [] for key, value in h.items(): t.append((value, key)) t.sort(reverse=True) return t def print_most_common(hist, num=10): """Print the 'num' most common words in histogram. Arguments: h: histogram dictionary (dict) num: number of words to print (int, default 10) Returns: None """ t = most_common(hist) print('The most common words are:') for freq, word in t[0:num]: print(word, '\t', freq) print() #print_most_common(hist) print("(Timing) ", end='') runtime(print_most_common, hist) #print_most_common(hist, 5) print("(Timing) ", end='') runtime(print_most_common, hist, 5) # # Find words that appear in one histogram but not another # def subtract(d1, d2): """Compute the words that appear in one histogram but not another. Arguments: d1: histogram dictionary (dict) d2: histogram dictionary (dict) Returns: Set of words that appear in d1 but not d2 (set) """ res = set() for key in d1: if key not in d2: res.add(key) return res def set_subtract(d1, d2): """Compute the words that appear in one histogram but not another. Arguments: d1: histogram dictionary (dict) d2: histogram dictionary (dict) Returns: Set of words that appear in d1 but not d2 (set) """ s1 = set(d1.keys()) s2 = set(d2.keys()) return s1 - s2 words = process_file('words') #diff = subtract(hist, words) print() diff = runtime(subtract, hist, words) print("The words in the book that aren't in the word list are:") for word in diff: print(word, end=' ') print() #sdiff = set_subtract(hist, words) sdiff = runtime(set_subtract, hist, words) print("The words in the book that aren't in the word list are:") for word in sdiff: print(word, end=' ') print() # # Choose random word from histogram with probability reflecting # histogram frequencies # def random_word(h): """Select a random word from histogram. Arguments: h: histogram dictionary (dict) Returns: A random word from histogram with probability reflecting histogram frequencies (string) """ import random # Construct a list of words where each word is replicated # the number of times it appears (count in histogram). # Potential problem: list might get very long. t = [] for word, freq in h.items(): t.extend([word] * freq) # Select a random word from list. return random.choice(t) def random_word2(h): """Select a random word from histogram. Arguments: h: histogram dictionary (dict) Returns: A random word from histogram with probability reflecting histogram frequencies (string) """ import random # 1. Use keys to get a list of the words in the book words = list(h.keys()) # 2. Build a list that contains the cumulative sum of the word # frequencies (see Exercise 10.1). The last item in this # list is the total numer of words in the book, n. acc_freq = [] n = 0 for word in words: n += h[word] acc_freq.append(n) # 3. Choose a randum numver from 1 to n. Use a bisection serach # (See Exercise 10.8) to find the index where the random # number would be inserted in the cumulative sum. # I'm going to use a linear search because it's simpler. nth = random.randint(0, n) index = len(acc_freq) - 1 for i, af in enumerate(acc_freq): if nth < af: index = i break # 4. Use the index to find the corresponding word in the # word list. return words[i] print() #word = random_word(hist) word = runtime(random_word, hist) print("Random word:", word) #word = random_word2(hist) word = runtime(random_word2, hist) print("Another random word:", word)