from __future__ import print_function
#
# Function for reporting how long a function call takes
#
def runtime(func, *args, **kw):
    """Report how long the given function takes to run.

    The elapsed time is printed in microseconds, labelled with the
    name of the function that was called.

    Arguments:
        func: function to time
        args: positional arguments to pass to function
        kw: keyword arguments to pass to function
    Returns:
        Return value from function
    """
    import time
    # time.clock was removed in Python 3.8.  Prefer time.perf_counter
    # and only fall back to time.clock on interpreters that lack it
    # (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock
    start = timer()
    result = func(*args, **kw)
    end = timer()
    # Not every callable has a __name__ (e.g. functools.partial objects
    # or instances defining __call__); use repr() for those.
    name = getattr(func, '__name__', None) or repr(func)
    print("-- %s took %0.3f us" % (name, (end - start) * 1e6))
    return result
#
# Construct a histogram of words found in a file
#
import string
def process_file(filename):
    """Read text file and return histogram of words.

    Arguments:
        filename: path to text file (string)
    Returns:
        Dictionary whose keys are words and values are counts (dict)
    """
    h = dict()
    # The with-block closes the file even if reading or processing a
    # line raises, so the handle is never leaked.
    with open(filename) as fp:
        for line in fp:
            _process_line(line, h)
    return h
def _process_line(line, h):
"""Process one line of text and update histogram.
Arguments:
line: input text (string)
h: histogram dictionary (dict)
Returns:
None
"""
line = line.replace('-', ' ')
for word in line.split():
word = word.strip(string.punctuation + string.whitespace)
word = word.lower()
h[word] = h.get(word, 0) + 1
# Build the word histogram for the text in 'bee.txt'.  This runs when
# the module is executed and the relative path is opened as given.
hist = process_file('bee.txt')
#
# Count total number of words and number of unique words
#
def total_words(h):
    """Count every word occurrence recorded in a histogram.

    Arguments:
        h: histogram dictionary mapping word to count (dict)
    Returns:
        Sum of all the counts (int)
    """
    running = 0
    for occurrences in h.values():
        running += occurrences
    return running
def different_words(h):
    """Count the distinct words recorded in a histogram.

    Arguments:
        h: histogram dictionary mapping word to count (dict)
    Returns:
        Number of keys in the histogram (int)
    """
    distinct = h.keys()
    return len(distinct)
# runtime() prints a timing line and hands back the wrapped function's
# return value, so each call below is equivalent to the untimed call
# shown in the comment above it.
# Untimed: count = total_words(hist)
count = runtime(total_words, hist)
print('Total number of words:', count)
# Untimed: count = different_words(hist)
count = runtime(different_words, hist)
print('Number of different words:', count)
#
# Find most commonly used words
#
def most_common(h):
    """Rank the words of a histogram by how often they appear.

    Arguments:
        h: histogram dictionary mapping word to count (dict)
    Returns:
        List of (count, word) 2-tuples in descending order, so the
        most frequent word comes first; equal counts are ordered by
        word, also descending. (list)
    """
    pairs = [(count, word) for word, count in h.items()]
    return sorted(pairs, reverse=True)
def print_most_common(hist, num=10):
    """Print the 'num' most frequent words of a histogram.

    A header line is printed first, then one "word <tab> count" line
    per word, then a blank line.

    Arguments:
        hist: histogram dictionary (dict)
        num: number of words to print (int, default 10)
    Returns:
        None
    """
    ranked = most_common(hist)
    print('The most common words are:')
    top = ranked[:num]
    for count, token in top:
        print(token, '\t', count)
    print()
# Untimed: print_most_common(hist)
# The "(Timing) " prefix is written without a newline, so it appears at
# the start of the first line printed by the timed call; runtime()
# prints its own "-- ... took ..." line after that call returns.
print("(Timing) ", end='')
runtime(print_most_common, hist)
# Untimed: print_most_common(hist, 5)
# Same as above, limited to the five most common words.
print("(Timing) ", end='')
runtime(print_most_common, hist, 5)
#
# Find words that appear in one histogram but not another
#
def subtract(d1, d2):
    """Find the words of one histogram that are missing from another.

    Arguments:
        d1: histogram dictionary whose words are examined (dict)
        d2: histogram dictionary whose words are excluded (dict)
    Returns:
        Set of words that are keys of d1 but not of d2 (set)
    """
    return {word for word in d1 if word not in d2}
def set_subtract(d1, d2):
    """Find the words of one histogram that are missing from another.

    Uses a set difference over the key sets of the two histograms.

    Arguments:
        d1: histogram dictionary whose words are examined (dict)
        d2: histogram dictionary whose words are excluded (dict)
    Returns:
        Set of words that are keys of d1 but not of d2 (set)
    """
    excluded = set(d2.keys())
    return set(d1.keys()).difference(excluded)
# Histogram of the word list stored in the file 'words'; only its keys
# matter for the subtractions below.
words = process_file('words')
# Untimed: diff = subtract(hist, words)
print()
diff = runtime(subtract, hist, words)
print("The words in the book that aren't in the word list are:")
# Words are printed on one line separated by spaces; a set has no
# defined iteration order, so the order may differ between runs.
for word in diff:
    print(word, end=' ')
print()
# Untimed: sdiff = set_subtract(hist, words)
# Same computation via set difference, timed for comparison.
sdiff = runtime(set_subtract, hist, words)
print("The words in the book that aren't in the word list are:")
for word in sdiff:
    print(word, end=' ')
print()
#
# Choose random word from histogram with probability reflecting
# histogram frequencies
#
def random_word(h):
    """Pick one word at random, weighted by its histogram count.

    Arguments:
        h: histogram dictionary mapping word to count (dict)
    Returns:
        A random word; each word's chance of being chosen is
        proportional to its count in the histogram (string)
    """
    import random
    # Expand the histogram into a flat list in which every word occurs
    # as many times as its count.  Caveat: for a large text this list
    # can become very long.
    pool = [word for word, freq in h.items() for _ in range(freq)]
    # A uniform pick from the expanded list is a weighted pick of words.
    return random.choice(pool)
def random_word2(h):
    """Select a random word from histogram.

    Arguments:
        h: histogram dictionary (dict)
    Returns:
        A random word from histogram with probability reflecting
        histogram frequencies (string)
    Raises:
        IndexError: if the histogram has no words to choose from
            (it is empty or all counts are zero), like random.choice
            on an empty sequence
    """
    import bisect
    import random
    # 1. Use keys to get a list of the words in the book.
    words = list(h.keys())
    # 2. Build a list that contains the cumulative sum of the word
    #    frequencies.  The last item in this list is the total number
    #    of words in the book, n.
    acc_freq = []
    n = 0
    for word in words:
        n += h[word]
        acc_freq.append(n)
    if n <= 0:
        raise IndexError("cannot choose a word from an empty histogram")
    # 3. Choose a random number from 0 to n - 1.  randint includes both
    #    end points, so the upper bound must be n - 1: with n there
    #    would be n + 1 outcomes and the last word would be favoured.
    nth = random.randint(0, n - 1)
    # 4. Bisect the cumulative sums for the first entry greater than
    #    nth.  Word k owns the numbers acc_freq[k-1] .. acc_freq[k] - 1,
    #    which is exactly h[word] values, so zero-count words are
    #    never selected.
    index = bisect.bisect_right(acc_freq, nth)
    # 5. Use the index to find the corresponding word in the word list.
    return words[index]
print()
# Untimed: word = random_word(hist)
# Timed pick using the expanded-list approach.
word = runtime(random_word, hist)
print("Random word:", word)
# Untimed: word = random_word2(hist)
# Timed pick using the cumulative-frequency approach, for comparison.
word = runtime(random_word2, hist)
print("Another random word:", word)