# markov1_prep.py

#
# Compute prefix-suffix maps for a file
#
import string
# Characters trimmed from both ends of each token in process_line:
# every character in string.punctuation plus every whitespace character.
non_word = string.punctuation + string.whitespace

def process_file(filename):
    """Read *filename* line by line and build its prefix-to-suffix maps.

    Returns a ``(one_gram, two_gram)`` tuple of dicts.  Both dicts are filled
    in by ``process_line``; the running two-word prefix starts out as
    ``(None, None)`` and is carried from one line to the next, so word
    sequences that span a line break are still recorded.
    """
    unigram_map, bigram_map = {}, {}
    window = (None, None)
    with open(filename) as handle:
        for text in handle:
            window = process_line(text, window, unigram_map, bigram_map)
    return unigram_map, bigram_map

def process_line(line, prefix, h1, h2):
    """Fold the words of one line into the one- and two-word prefix maps.

    Hyphens are replaced by spaces, the line is split on whitespace, and each
    token is trimmed of the characters in ``non_word`` and lower-cased.
    Tokens that are empty after trimming (pure punctuation such as ``...``)
    are skipped, so the empty string is never stored as a word or a prefix.

    Parameters:
        line:   the text to process.
        prefix: tuple of the most recent words seen so far; ``None`` entries
                mark positions that have not been filled yet.
        h1:     dict mapping a single word to the list of words that
                followed it (mutated in place).
        h2:     dict mapping a full prefix tuple to the list of words that
                followed it (mutated in place).

    Returns the prefix as it stands after the last word of the line, to be
    passed into the call for the next line.
    """
    line = line.replace('-', ' ')
    for word in line.split():
        word = word.strip(non_word).lower()
        if not word:
            # The token consisted only of punctuation; recording "" would
            # insert empty words into both maps.
            continue
        if prefix[1] is not None:
            h1.setdefault(prefix[1], []).append(word)
        if prefix[0] is not None and prefix[1] is not None:
            # Only record once both prefix slots hold real words.
            h2.setdefault(prefix, []).append(word)
        prefix = shift(prefix, word)
    return prefix

def shift(prefix, word):
    """Return a new prefix: *prefix* without its first item, with *word* appended."""
    remaining = prefix[1:]
    return remaining + (word,)

#
# Compute uni- and bigrams for all .txt files
#
def walk(dir):
    """Recursively visit *dir* and call ``prep_file`` on every ``.txt`` file.

    Sub-directories are descended into; any other entry is prepped only when
    ``os.path.splitext`` reports its extension as exactly ``".txt"``.
    """
    import os
    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)
        if os.path.isdir(full_path):
            walk(full_path)
            continue
        extension = os.path.splitext(entry)[1]
        if extension != ".txt":
            continue
        prep_file(full_path)

def prep_file(path):
    """Build the prefix maps for *path* and store them in a shelf beside it.

    The shelf is named after *path* with its extension replaced by
    ``.shelf`` and receives two entries: ``"h1"`` and ``"h2"``, the two dicts
    returned by ``process_file``.  A confirmation line is printed once the
    shelf has been written and closed.
    """
    import os.path
    import shelve

    h1, h2 = process_file(path)
    shelf_name = os.path.splitext(path)[0] + ".shelf"
    # The context manager closes the shelf (flushing it to disk) even when a
    # store raises, so the underlying database file is never left open.
    with shelve.open(shelf_name) as shelf:
        shelf["h1"] = h1
        shelf["h2"] = h2
    print("%s prepped" % path)

# Script entry point: prep every .txt file under the current directory.
# NOTE: there is no __main__ guard, so this also runs whenever the module
# is imported.
walk(".")