from __future__ import print_function
#
# Compute prefix-suffix maps for a file
#
import string
# Characters stripped from both ends of every token before it is stored as a word.
non_word = string.punctuation + string.whitespace
def process_file(filename):
    """Read *filename* and build its 1-gram and 2-gram successor maps.

    The rolling two-word prefix is carried from one line to the next, so
    word pairs that straddle a line break are recorded as well.

    Returns a tuple ``(one_gram, two_gram)`` of plain dicts; both are
    filled in by ``_process_line``.
    """
    unigrams, bigrams = {}, {}
    # (None, None) marks "no words seen yet".
    window = (None, None)
    with open(filename) as handle:
        for text_line in handle:
            window = _process_line(text_line, window, unigrams, bigrams)
    return unigrams, bigrams
def _process_line(line, prefix, h1, h2):
line = line.replace('-', ' ')
for word in line.split():
word = word.strip(non_word).lower()
if prefix[1] is not None:
h1.setdefault(prefix[1], []).append(word)
if prefix[0] is not None and prefix[1] is not None:
h2.setdefault(prefix, []).append(word)
prefix = shift(prefix, word)
return prefix
def shift(prefix, word):
    """Return a new prefix tuple: drop the first element, append *word*."""
    remaining = prefix[1:]
    return remaining + (word,)
# Build both successor maps from the sample text; these top-level
# statements run whenever the module is executed or imported.
h1, h2 = process_file('grimm.txt')
import pprint
# Dump the 1-gram map, then the 2-gram map (heading prints are disabled).
#print("1-gram:")
pprint.pprint(h1)
#print("2-gram:")
pprint.pprint(h2)
#
# Generate a nonsensical sentence using the bigram map
#
def compose(h1, h2, count=10):
    """Print a randomly generated sentence of at most *count* words.

    Parameters:
        h1:    dict mapping a word to the list of words that may follow it.
        h2:    dict mapping a (word, word) tuple to the list of words that
               may follow that pair.
        count: number of words requested. The two opening words are
               always printed, even when count is below 2.

    The words are written to standard output separated by spaces and
    terminated by a newline; nothing is returned. Generation stops early
    when the current pair has no entry in *h2*.

    Raises IndexError (from random.choice) if *h1* is empty.
    """
    import random
    first_word = random.choice(list(h1))
    second_word = random.choice(h1[first_word])
    prefix = (first_word, second_word)
    print(first_word, second_word, end=' ')
    for _ in range(2, count):
        followers = h2.get(prefix)
        if not followers:
            # Dead end: no word was ever recorded after this pair (for
            # example, the pair that closes the source text). Indexing
            # h2 directly here would raise KeyError.
            break
        next_word = random.choice(followers)
        print(next_word, end=' ')
        prefix = shift(prefix, next_word)
    print()
# Print one random sentence from the module-level maps, requesting 20 words.
compose(h1, h2, count=20)