# markov5_gram.py

from __future__ import print_function

"""Module for constructing and using unigram and bigram data."""

# This is a progression from markov4_*.py.  Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.
#
# Whereas markov4_gram.py uses a redundant list as the value of our
# mapping dictionary, we use a histogram here; the tradeoff is that
# we are using less memory (by not storing redundant words) at the
# expense of computation time (when generating random suffixes).  This
# is an example of how we changed the _implementation_ without
# disturbing the _interface_, so that markov5_prep.py and markov5_use.py
# are unchanged from their markov4 counterparts except for importing
# functions from markov5_gram instead of markov4_gram.  (In normal
# development, version numbers generally do not appear in file names
# and no changes in the calling modules would be necessary.)

def shift(prefix, word):
    """Return a new prefix: drop the first element and append word.

    prefix is expected to be a tuple; the result has the same length
    as prefix (or length 1 when prefix is empty).
    """
    remaining = prefix[1:]
    appended = (word,)
    return remaining + appended

def newMapping():
    """Create and return a fresh, empty prefix-to-suffix mapping."""
    mapping = dict()
    return mapping

def addSuffix(m, prefix, suffix):
    """Record one more occurrence of suffix following prefix in m.

    Each prefix maps to a histogram (dict of suffix -> count); the
    histogram is created on first use of the prefix.
    """
    if prefix not in m:
        m[prefix] = {}
    histogram = m[prefix]
    if suffix in histogram:
        histogram[suffix] += 1
    else:
        histogram[suffix] = 1

def random_prefix(m):
    """Return one prefix (key) of m chosen uniformly at random."""
    from random import choice
    candidates = list(m)
    return choice(candidates)

def random_suffix(m, prefix):
    """Return a random suffix for prefix, weighted by its count.

    The histogram stored at m[prefix] is expanded into a list in which
    each suffix appears once per recorded occurrence, and one entry of
    that list is picked at random.
    """
    from random import choice
    histogram = m[prefix]
    expanded = [
        word
        for word, occurrences in histogram.items()
        for _ in range(occurrences)
    ]
    return choice(expanded)

#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#
def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the gram data file associated with path.

    The unigram section is written first, then a blank line, then the
    bigram section.  The file is opened through _open_gram_file and is
    closed when writing finishes or fails.
    """
    with _open_gram_file(path, "w") as out:
        _write_mapping(out, unigram_mapping)
        # A blank line separates the two sections.
        out.write("\n")
        _write_mapping(out, bigram_mapping)

def read_grams(path):
    """Read the gram data file associated with path.

    Returns a (unigram_mapping, bigram_mapping) tuple.  The unigram
    section is read first; the bigram section continues from the line
    where the unigram section stopped, so line numbers in error
    messages refer to positions in the whole file.

    Raises IOError if the file cannot be opened or contains bad data.
    """
    unigram_mapping = newMapping()
    bigram_mapping = newMapping()
    # Use a context manager so the file is closed even when
    # _read_mapping raises on bad data.
    with _open_gram_file(path) as f:
        line_number = _read_mapping(f, unigram_mapping, 1, 0)
        _read_mapping(f, bigram_mapping, 2, line_number)
    return unigram_mapping, bigram_mapping
    
# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!

def _open_gram_file(path, mode="r"):
    # Return the file object to the data file associated with
    # the text at the given path
    import os.path
    root, ext = os.path.splitext(path)
    tsv_name = root + ".tsv"
    return open(tsv_name, mode)

def _write_mapping(f, m):
    for prefix, suffix in m.items():
        print("%s\t%s" % ('\t'.join(prefix), '\t'.join(suffix)), file=f)

def _read_mapping(f, m, prefix_length, line_number):
    for line in f:
        line_number += 1
        values = line.strip().split('\t')
        if len(values) == 1:
            # Empty line marks end of section
            break
        elif len(values) > prefix_length:
            prefix = tuple(values[:prefix_length])
            for suffix in values[prefix_length:]:
                addSuffix(m, prefix, suffix)
        else:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
    return line_number