# markov.py

from __future__ import print_function

#
# Compute prefix-suffix maps for a file
#
import string
non_word = string.punctuation + string.whitespace

def process_file(filename):
    """Build the 1-gram and 2-gram successor maps for a text file.

    The file is read line by line; the two-word prefix window is carried
    across line boundaries so word pairs spanning a line break are counted.

    Returns a tuple (one_gram, two_gram) of dicts filled in by
    _process_line.
    """
    unigrams, bigrams = {}, {}
    # Two-slot window of the most recent words; None means "not seen yet".
    window = (None, None)
    with open(filename) as source:
        for text in source:
            window = _process_line(text, window, unigrams, bigrams)
    return unigrams, bigrams

def _process_line(line, prefix, h1, h2):
    """Record the successors of every word on one line of text.

    line   -- the raw text line.
    prefix -- tuple of the two most recently seen words (None for slots
              not filled yet), carried over from the previous line.
    h1     -- dict updated in place: word -> list of words that followed it.
    h2     -- dict updated in place: (word, word) -> list of words that
              followed that pair.

    Returns the prefix after the last word of the line, to be passed to
    the next call.
    """
    # Treat hyphens as separators so hyphenated compounds split into words.
    line = line.replace('-', ' ')
    for word in line.split():
        word = word.strip(non_word).lower()
        if not word:
            # The token was nothing but punctuation (e.g. "..."); recording
            # it would put empty strings into the maps and the prefix.
            continue
        if prefix[1] is not None:
            h1.setdefault(prefix[1], []).append(word)
            if prefix[0] is not None:
                h2.setdefault(prefix, []).append(word)
        prefix = shift(prefix, word)
    return prefix

def shift(prefix, word):
    """Return a new prefix tuple: drop the oldest entry, append word."""
    remainder = prefix[1:]
    newest = (word,)
    return remainder + newest

# Build both successor maps from 'grimm.txt' and dump them for inspection.
import pprint
h1, h2 = process_file('grimm.txt')
pprint.pprint(h1)  # 1-gram map: word -> words that followed it
pprint.pprint(h2)  # 2-gram map: (word, word) -> words that followed the pair

#
# Generate a nonsensical sentence using the bigram (2-gram) map
#
def compose(h1, h2, count=10):
    """Print a random nonsensical sentence built from the successor maps.

    h1    -- dict: word -> list of words that followed it.
    h2    -- dict: (word, word) -> list of words that followed that pair.
    count -- target number of words. The two opening words are always
             printed, so values below 2 still produce two words.

    The sentence is written to stdout as words each followed by a single
    space, then a newline. If the chain reaches a word pair with no
    recorded successor (for example the final two words of the corpus),
    the sentence ends early instead of raising KeyError. An empty h1
    prints just an empty line.
    """
    import random
    if not h1:
        # Nothing to choose from; random.choice would raise on an empty list.
        print()
        return
    first_word = random.choice(list(h1))
    second_word = random.choice(h1[first_word])
    prefix = (first_word, second_word)
    print(first_word, second_word, end=' ')
    for _ in range(2, count):
        followers = h2.get(prefix)
        if not followers:
            # Dead end: this pair was never followed by another word.
            break
        next_word = random.choice(followers)
        print(next_word, end=' ')
        prefix = shift(prefix, next_word)
    print()

# Demo run: ask compose for a 20-word sentence from the maps built above.
compose(h1, h2, count=20)