from __future__ import print_function
"""Module for constructing and using unigram and bigram data."""
# This is a progression from markov3_*.py. Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.
def shift(prefix, word):
    """Return a new prefix: ``prefix`` minus its first item, plus ``word``.

    The first element of ``prefix`` is dropped and ``word`` is appended
    as a one-element tuple, so the result has the same length as
    ``prefix`` whenever ``prefix`` is non-empty.
    """
    remainder = prefix[1:]
    newest = (word,)
    return remainder + newest
def newMapping():
    """Return a fresh, empty prefix-to-suffixes mapping (a plain dict)."""
    return {}
def addSuffix(m, prefix, suffix):
    """Record ``suffix`` as one more follower of ``prefix`` in mapping ``m``.

    Each prefix maps to a list of suffixes; the list is created the
    first time a prefix is seen. Duplicates are kept, so a suffix that
    is added several times appears several times in the list.
    """
    # EAFP: assume the prefix is already present and fall back to
    # creating its list only when the lookup fails.
    try:
        m[prefix].append(suffix)
    except KeyError:
        m[prefix] = [suffix]
    # # Here is the equivalent one-line code using setdefault
    # m.setdefault(prefix, []).append(suffix)
def random_prefix(m):
    """Return one key of mapping ``m``, picked by ``random.choice``."""
    import random
    candidates = list(m)
    return random.choice(candidates)
def random_suffix(m, prefix):
    """Return one of the suffixes stored in ``m`` for ``prefix``.

    The suffix is picked by ``random.choice``. Raises KeyError when
    ``prefix`` is not present in ``m``.
    """
    import random
    followers = m[prefix]
    return random.choice(followers)
#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#
def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the data file associated with ``path``.

    The unigram section is written first, then a single empty line as
    the section separator, then the bigram section.
    """
    with _open_gram_file(path, "w") as out:
        _write_mapping(out, unigram_mapping)
        # An empty line marks the end of the unigram section.
        out.write("\n")
        _write_mapping(out, bigram_mapping)
def read_grams(path):
    """Read the unigram and bigram mappings stored for ``path``.

    The data file holds the unigram section, an empty separator line,
    and then the bigram section.

    Returns a ``(unigram_mapping, bigram_mapping)`` tuple.

    Raises IOError if the data file cannot be opened or if it contains
    a line that cannot be parsed.
    """
    unigram_mapping = newMapping()
    bigram_mapping = newMapping()
    # Use a context manager so the file is closed even when
    # _read_mapping raises IOError on bad data.
    with _open_gram_file(path) as f:
        # The line count is carried from the first section into the
        # second so error messages report positions within the file.
        line_number = _read_mapping(f, unigram_mapping, 1, 0)
        _read_mapping(f, bigram_mapping, 2, line_number)
    return unigram_mapping, bigram_mapping
# Note the use of leading _ for names of functions that should
# only be used by functions within this module. This is a Python
# convention. So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!
def _open_gram_file(path, mode="r"):
    """Open the data file associated with the text at ``path``.

    The data file has the same name as ``path`` with its extension
    replaced by ".tsv". Returns the file object from ``open``.
    """
    import os.path
    stem = os.path.splitext(path)[0]
    return open(stem + ".tsv", mode)
def _write_mapping(f, m):
    """Write mapping ``m`` to file ``f``, one line per prefix.

    Each line holds the tab-separated items of the prefix followed by
    the tab-separated suffixes recorded for that prefix.
    """
    for prefix, suffixes in m.items():
        prefix_field = '\t'.join(prefix)
        suffix_field = '\t'.join(suffixes)
        line = "%s\t%s" % (prefix_field, suffix_field)
        print(line, file=f)
def _read_mapping(f, m, prefix_length, line_number):
    """Read one section of lines from ``f`` into mapping ``m``.

    Each data line is split on tabs after stripping surrounding
    whitespace. The first ``prefix_length`` fields form the prefix
    tuple and every remaining field is added as a suffix of it.
    Reading stops at end of file or at the first line that yields
    exactly one field, which is what an empty line produces.

    ``line_number`` is the count of lines consumed before this call.
    The updated count, including any terminating line, is returned.

    Raises IOError for a line with more than one field but no suffix
    after the prefix.
    """
    for raw_line in f:
        line_number += 1
        fields = raw_line.strip().split('\t')
        field_count = len(fields)
        if field_count == 1:
            # Empty line marks end of section
            break
        if field_count <= prefix_length:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing. Callers
            # can catch both errors with a single except clause,
            # which is fine since the end effect is the same for
            # a missing file and a bad file: no data available.
            raise IOError("bad data at line %d" % line_number)
        prefix = tuple(fields[:prefix_length])
        for suffix in fields[prefix_length:]:
            addSuffix(m, prefix, suffix)
    return line_number