# markov4_gram.py

from __future__ import print_function

"""Module for constructing and using unigram and bigram data."""

# This is a progression from markov3_*.py.  Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.

def shift(prefix, word):
    """Return a new prefix with the first element dropped and word appended.

    prefix is expected to be a tuple; the result is a new tuple of the
    same length (or of length one when prefix is empty).
    """
    tail = prefix[1:]
    appended = (word,)
    return tail + appended

def newMapping():
    """Return a fresh, empty prefix-to-suffixes mapping."""
    return {}

def addSuffix(m, prefix, suffix):
    """Record suffix as one more follower of prefix in mapping m.

    Suffixes for a prefix are kept in a list, in insertion order, and
    duplicates are retained.  The mapping is modified in place.
    """
    # EAFP form; equivalent to m.setdefault(prefix, []).append(suffix)
    try:
        m[prefix].append(suffix)
    except KeyError:
        m[prefix] = [suffix]

def random_prefix(m):
    """Return one of the prefixes (keys) of mapping m, picked at random."""
    from random import choice
    prefixes = list(m.keys())
    return choice(prefixes)

def random_suffix(m, prefix):
    """Return one of the suffixes recorded for prefix in m, picked at random.

    Raises KeyError when prefix is not present in the mapping.
    """
    from random import choice
    candidates = m[prefix]
    return choice(candidates)

#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#
def write_grams(path, unigram_mapping, bigram_mapping):
    """Save both mappings to the data file associated with path.

    The unigram section is written first, then a blank separator line,
    then the bigram section.
    """
    with _open_gram_file(path, "w") as out:
        _write_mapping(out, unigram_mapping)
        # A blank line separates the two sections.
        out.write("\n")
        _write_mapping(out, bigram_mapping)

def read_grams(path):
    """Load the unigram and bigram mappings from the data file for path.

    Returns a (unigram_mapping, bigram_mapping) tuple.  The unigram
    section is read first (prefixes of length 1), then the bigram
    section (prefixes of length 2).

    Raises IOError if the file cannot be opened or contains a line that
    _read_mapping cannot handle.
    """
    unigram_mapping = newMapping()
    bigram_mapping = newMapping()
    # Use a with-block so the file is closed even when _read_mapping
    # raises IOError on bad data.
    with _open_gram_file(path) as f:
        # The running line number is threaded through so that error
        # messages from the second section refer to the real file line.
        line_number = _read_mapping(f, unigram_mapping, 1, 0)
        _read_mapping(f, bigram_mapping, 2, line_number)
    return unigram_mapping, bigram_mapping
    
# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!

def _open_gram_file(path, mode="r"):
    """Open the data file associated with the text at the given path.

    The data file has the same name as path with its extension replaced
    by ".tsv".  Returns the file object opened with the given mode.
    """
    from os.path import splitext
    stem = splitext(path)[0]
    return open(stem + ".tsv", mode)

def _write_mapping(f, m):
    """Write mapping m to file f, one prefix per line.

    Each line holds the words of the prefix followed by all of its
    suffixes, every field separated by a tab.
    """
    for prefix, suffixes in m.items():
        prefix_text = '\t'.join(prefix)
        suffix_text = '\t'.join(suffixes)
        print("%s\t%s" % (prefix_text, suffix_text), file=f)

def _read_mapping(f, m, prefix_length, line_number):
    """Read one section of tab-separated lines from f into mapping m.

    Each data line holds prefix_length prefix words followed by one or
    more suffixes.  Reading stops at the first blank line (the section
    separator) or at the end of the input.

    line_number is the count of lines already consumed from f; the
    updated count is returned so the caller can pass it on when reading
    the next section.

    Raises IOError when a non-blank line has too few fields to hold a
    prefix and at least one suffix.
    """
    for line in f:
        line_number += 1
        stripped = line.strip()
        if not stripped:
            # Empty line marks end of section.  Test the text itself
            # rather than the field count: a line holding one non-empty
            # field also splits into a single value, and that is
            # malformed data, not a separator.
            break
        values = stripped.split('\t')
        if len(values) <= prefix_length:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
        prefix = tuple(values[:prefix_length])
        for suffix in values[prefix_length:]:
            addSuffix(m, prefix, suffix)
    return line_number