#
# Compute prefix-suffix maps for a file
#
import string
# Characters stripped from both ends of every token before it is recorded
# as a word: all ASCII punctuation plus all whitespace characters.
non_word = string.punctuation + string.whitespace
def process_file(filename):
    """Build the prefix-suffix maps for one text file.

    The file is read line by line and each line is handed to
    ``process_line`` together with the rolling two-word prefix.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(one_gram, two_gram)`` tuple of dicts, both filled in by
        ``process_line``: the first is keyed by a single preceding word,
        the second by a tuple of the two preceding words. Values are
        lists of the words that followed.
    """
    by_word, by_pair = {}, {}
    # The prefix starts out empty and is carried across line boundaries,
    # so word sequences that wrap onto the next line are still recorded.
    window = (None, None)
    with open(filename) as text:
        for row in text:
            window = process_line(row, window, by_word, by_pair)
    return by_word, by_pair
def process_line(line, prefix, h1, h2):
    """Record every word of ``line`` in the prefix-suffix maps.

    Hyphens are treated as word separators. Each whitespace-separated
    token has the characters in ``non_word`` stripped from both ends and
    is lower-cased. Tokens that are empty after stripping (pure
    punctuation such as "..." or "&") are skipped, so the empty string
    never ends up as a suffix or as part of a prefix.

    Args:
        line: One line of input text.
        prefix: Tuple of the two words seen most recently; ``None``
            marks a slot that has not been filled yet.
        h1: Dict mapping a single preceding word to the list of words
            that followed it. Updated in place.
        h2: Dict mapping a ``(word, word)`` prefix tuple to the list of
            words that followed it. Updated in place.

    Returns:
        The prefix after the last word of the line, to be passed in with
        the next line.
    """
    for token in line.replace('-', ' ').split():
        word = token.strip(non_word).lower()
        if not word:
            # Nothing but punctuation: recording "" would pollute both
            # maps and the rolling prefix with empty "words".
            continue
        if prefix[1] is not None:
            h1.setdefault(prefix[1], []).append(word)
            # The two-word map needs both prefix slots to be filled.
            if prefix[0] is not None:
                h2.setdefault(prefix, []).append(word)
        prefix = shift(prefix, word)
    return prefix
def shift(prefix, word):
    """Return a new prefix tuple: drop the oldest entry, append ``word``.

    Args:
        prefix: Tuple of the most recent words.
        word: The word to add at the end.

    Returns:
        A tuple of the same length as ``prefix`` (one element when
        ``prefix`` is empty).
    """
    remaining = prefix[1:]
    newest = (word,)
    return remaining + newest
#
# Compute uni- and bigrams for all .txt files
#
def walk(dir):
    """Recursively visit ``dir`` and prep every ``.txt`` file found.

    Entries are handled in the order ``os.listdir`` returns them;
    sub-directories are descended into as they are encountered.

    Args:
        dir: Path of the directory to scan.
    """
    import os
    import os.path
    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)
        if os.path.isdir(full_path):
            walk(full_path)
            continue
        # Only the extension matters; the comparison is case-sensitive.
        if os.path.splitext(entry)[1] == ".txt":
            prep_file(full_path)
def prep_file(path):
    """Compute the word maps for ``path`` and store them in a shelf.

    The shelf is opened under the name of ``path`` with its extension
    replaced by ".shelf". The one-word map is stored under the key
    "h1" and the two-word map under "h2".

    Args:
        path: Path of the text file to process.
    """
    import os.path
    import shelve

    h1, h2 = process_file(path)
    root, _ = os.path.splitext(path)
    # The context manager closes the shelf -- flushing it to disk -- even
    # if one of the stores below raises.
    with shelve.open(root + ".shelf") as shelf:
        shelf["h1"] = h1
        shelf["h2"] = h2
    print("%s prepped" % path)
# Module-level entry point: runs whenever this file is executed or imported
# and preps every .txt file at or below the current working directory.
walk(".")