# markov6_gram.py

from __future__ import print_function

"""Module for constructing and using unigram and bigram data."""

# This is a progression from markov5_*.py.  Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.  This time, we use a class instead
# of a plain dictionary to store the mapping; in addition, we also
# keep track of the length of the prefixes that are the keys to
# the dictionary so that we can detect usage errors.

def shift(prefix, word):
    """Return a new prefix: drop the first word and append `word`.

    `prefix` is a tuple of words; the result is a tuple of the same
    length (for a non-empty prefix) ending in `word`.
    """
    remaining = prefix[1:]
    return remaining + (word,)

class Mapping:
    """Prefix-suffix map.

    The class itself defines no attributes; newMapping() attaches
    them to each fresh instance.
    """
    # Attributes set by newMapping():
    #   prefix_length -- number of words every prefix key must contain
    #   map           -- dict keyed by prefix; addSuffix() stores a
    #                    dict of suffix -> occurrence count under each key

def newMapping(prefix_length):
    """Create and return an empty Mapping for prefixes of `prefix_length` words."""
    mapping = Mapping()
    # Start with no prefixes recorded; remember the key length so the
    # accessor functions can check the prefixes they are handed.
    mapping.map = {}
    mapping.prefix_length = prefix_length
    return mapping

def addSuffix(m, prefix, suffix):
    """Record one more occurrence of `suffix` following `prefix` in `m`.

    `prefix` must have exactly m.prefix_length elements; a mismatch
    is a usage error and trips the assertion.
    """
    assert m.prefix_length == len(prefix)
    if prefix not in m.map:
        m.map[prefix] = {}
    counts = m.map[prefix]
    if suffix in counts:
        counts[suffix] += 1
    else:
        counts[suffix] = 1

def random_prefix(m):
    """Return one of the prefixes stored in `m`, chosen uniformly at random."""
    import random
    candidates = list(m.map)
    return random.choice(candidates)

def random_suffix(m, prefix):
    """Return a random suffix for `prefix`, weighted by occurrence count.

    `prefix` must have exactly m.prefix_length elements (checked by
    assertion) and must already be a key of the mapping.
    """
    import random
    assert m.prefix_length == len(prefix)
    # Repeat every suffix as many times as it was seen, so that a
    # uniform pick from the pool is a count-weighted pick of suffixes.
    pool = [
        suffix
        for suffix, count in m.map[prefix].items()
        for _ in range(count)
    ]
    return random.choice(pool)

#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#
def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the gram data file associated with `path`.

    The unigram section comes first, then a blank separator line,
    then the bigram section.
    """
    with _open_gram_file(path, "w") as out:
        _write_mapping(out, unigram_mapping)
        # A blank line marks the end of the unigram section.
        out.write("\n")
        _write_mapping(out, bigram_mapping)

def read_grams(path):
    """Read the gram data file associated with `path`.

    Returns a (unigram_mapping, bigram_mapping) tuple.  Raises
    IOError if the file cannot be opened or contains bad data.
    """
    unigram_mapping = newMapping(1)
    bigram_mapping = newMapping(2)
    # Use a with-block so the file is closed even when a section
    # contains bad data and _read_mapping raises.
    with _open_gram_file(path) as f:
        # Thread the running line number through both sections so
        # error messages refer to positions in the whole file.
        line_number = _read_mapping(f, unigram_mapping, 0)
        _read_mapping(f, bigram_mapping, line_number)
    return unigram_mapping, bigram_mapping
    
# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!

def _open_gram_file(path, mode="r"):
    """Open the data file associated with the text at `path`.

    The data file has the same name as `path` with its extension
    replaced by ".tsv".  Returns the file object opened in `mode`.
    """
    import os.path
    stem = os.path.splitext(path)[0]
    return open(stem + ".tsv", mode)

def _write_mapping(f, m):
    """Write every prefix of mapping `m` to file `f`, one line each.

    A line holds the words of the prefix followed by its suffixes,
    all tab-separated.  Each suffix is written once per recorded
    occurrence, because _read_mapping counts one occurrence for
    every suffix field it reads; writing each suffix only once would
    reset all counts to 1 on a write/read round trip.
    """
    for prefix, counts in m.map.items():
        fields = list(prefix)
        for suffix, count in counts.items():
            # Repeat the suffix so its frequency survives in the file.
            fields.extend([suffix] * count)
        print('\t'.join(fields), file=f)

def _read_mapping(f, m, line_number):
    """Read one section of a gram file from `f` into mapping `m`.

    Lines are consumed until a blank line (the section separator) or
    the end of the file.  Each data line must hold m.prefix_length
    prefix words followed by at least one suffix, tab-separated;
    every suffix field is recorded as one occurrence.

    `line_number` is the number of lines already consumed from `f`;
    the updated count is returned.  Raises IOError on a malformed
    line.
    """
    prefix_length = m.prefix_length
    for line in f:
        line_number += 1
        stripped = line.strip()
        if not stripped:
            # Empty line marks end of section.  Only a truly blank
            # line counts: a lone word is malformed data and must
            # not silently truncate the section.
            break
        values = stripped.split('\t')
        if len(values) <= prefix_length:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
        prefix = tuple(values[:prefix_length])
        for suffix in values[prefix_length:]:
            addSuffix(m, prefix, suffix)
    return line_number