histogram.py

from __future__ import print_function

#
# Function for reporting how long a function call takes
#
def runtime(func, *args, **kw):
    """Report how long the given functions takes to run.
    
    Arguments:
      func: function to time
      args: arguments to pass to function
      kw: keyword arguments to pass to function
    Returns:
      Return value from function
    """
    import time
    # time.clock returns time elapsed since the first call to time.clock
    # to microsecond precision (most of the time)
    start = time.clock()
    result = func(*args, **kw)
    end = time.clock()
    try:
        name = func.func_name       # Python 2
    except AttributeError:
        name = func.__name__        # Python 3
    print("-- %s took %0.3f us" % (name, (end - start) * 1e6))
    return result

#
# Construct a histogram of words found in a file
#
import string

def process_file(filename):
    """Read text file and return histogram of words.

    Arguments:
      filename: path to text file (string)
    Returns:
      Dictionary whose keys are words and values are counts (dict)
    """
    h = dict()
    fp = open(filename)
    for line in fp:
        _process_line(line, h)
    fp.close()
    return h

def _process_line(line, h):
    """Process one line of text and update histogram.

    Arguments:
      line: input text (string)
      h: histogram dictionary (dict)
    Returns:
      None
    """
    line = line.replace('-', ' ')
    for word in line.split():
        word = word.strip(string.punctuation + string.whitespace)
        word = word.lower()

        h[word] = h.get(word, 0) + 1

hist = process_file('bee.txt')

#
# Count total number of words and number of unique words
#
def total_words(h):
    """Return total number of words in histogram.
    
    Arguments:
      h: histogram dictionary (dict)
    Returns:
      Number of words (int)
    """
    return sum(h.values())

def different_words(h):
    """Return number of different words in histogram.
    
    Arguments:
      h: histogram dictionary (dict)
    Returns:
      Number of words (int)
    """
    return len(h)

#count = total_words(hist)
count = runtime(total_words, hist)
print('Total number of words:', count)
#count = different_words(hist)
count = runtime(different_words, hist)
print('Number of different words:', count)

#
# Find most commonly used words
#
def most_common(h):
    """Return histogram words ordered by number of appearances.
    
    Arguments:
      h: histogram dictionary (dict)
    Returns:
      List of 2-tuples of (count, word) where count is sorted
      from largest to smallest.  (list)
    """
    t = []
    for key, value in h.items():
        t.append((value, key))

    t.sort(reverse=True)
    return t

def print_most_common(hist, num=10):
    """Print the 'num' most common words in histogram.
    
    Arguments:
      h: histogram dictionary (dict)
      num: number of words to print (int, default 10)
    Returns:
      None
    """
    t = most_common(hist)
    print('The most common words are:')
    for freq, word in t[0:num]:
        print(word, '\t', freq)

print()
#print_most_common(hist)
print("(Timing) ", end='')
runtime(print_most_common, hist)
#print_most_common(hist, 5)
print("(Timing) ", end='')
runtime(print_most_common, hist, 5)

#
# Find words that appear in one histogram but not another
#
def subtract(d1, d2):
    """Compute the words that appear in one histogram but not another.
    
    Arguments:
      d1: histogram dictionary (dict)
      d2: histogram dictionary (dict)
    Returns:
      Set of words that appear in d1 but not d2 (set)
    """
    res = set()
    for key in d1:
        if key not in d2:
            res.add(key)
    return res

def set_subtract(d1, d2):
    """Compute the words that appear in one histogram but not another.
    
    Arguments:
      d1: histogram dictionary (dict)
      d2: histogram dictionary (dict)
    Returns:
      Set of words that appear in d1 but not d2 (set)
    """
    s1 = set(d1.keys())
    s2 = set(d2.keys())
    return s1 - s2

words = process_file('words')

#diff = subtract(hist, words)
print()
diff = runtime(subtract, hist, words)
print("The words in the book that aren't in the word list are:")
for word in diff:
    print(word, end=' ')
print()
#sdiff = set_subtract(hist, words)
sdiff = runtime(set_subtract, hist, words)
print("The words in the book that aren't in the word list are:")
for word in sdiff:
    print(word, end=' ')
print()

#
# Choose random word from histogram with probability reflecting
# histogram frequencies
#

def random_word(h):
    """Select a random word from histogram.
    
    Arguments:
      h: histogram dictionary (dict)
    Returns:
      A random word from histogram with probability reflecting
      histogram frequencies (string)
    """
    import random
    # Construct a list of words where each word is replicated
    # the number of times it appears (count in histogram).
    # Potential problem: list might get very long.
    t = []
    for word, freq in h.items():
        t.extend([word] * freq)
    # Select a random word from list.
    return random.choice(t)

def random_word2(h):
    """Select a random word from histogram.
    
    Arguments:
      h: histogram dictionary (dict)
    Returns:
      A random word from histogram with probability reflecting
      histogram frequencies (string)
    """
    import random
    # 1. Use keys to get a list of the words in the book
    words = list(h.keys())
    # 2. Build a list that contains the cumulative sum of the word
    #    frequencies (see Exercise 10.1).  The last item in this
    #    list is the total numer of words in the book, n.
    acc_freq = []
    n = 0
    for word in words:
        n += h[word]
        acc_freq.append(n)
    # 3. Choose a randum numver from 1 to n.  Use a bisection serach
    #    (See Exercise 10.8) to find the index where the random
    #    number would be inserted in the cumulative sum.
    # I'm going to use a linear search because it's simpler.
    nth = random.randint(0, n)
    index = len(acc_freq) - 1
    for i, af in enumerate(acc_freq):
        if nth < af:
            index = i
            break
    # 4. Use the index to find the corresponding word in the
    #    word list.
    return words[i]

print()
#word = random_word(hist)
word = runtime(random_word, hist)
print("Random word:", word)
#word = random_word2(hist)
word = runtime(random_word2, hist)
print("Another random word:", word)