"""Module for constructing and using unigram and bigram data."""

from __future__ import print_function

import random

# This is a progression from markov3_*.py.  Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.


def shift(prefix, word):
    """Return a new prefix: *prefix* without its first item, plus *word*.

    *prefix* must be a tuple; the result is a new tuple of the same
    length and *prefix* itself is not modified.
    """
    return prefix[1:] + (word,)


def newMapping():
    """Return a new, empty prefix-to-suffixes mapping."""
    return dict()


def addSuffix(m, prefix, suffix):
    """Append *suffix* to the list of suffixes stored in *m* for *prefix*.

    The list is created on first use of a prefix.  *m* is modified in
    place and nothing is returned.
    """
    m.setdefault(prefix, []).append(suffix)
    #
    # Here is the equivalent EAFP code
    #    try:
    #        m[prefix].append(suffix)
    #    except KeyError:
    #        m[prefix] = [ suffix ]


def random_prefix(m):
    """Return a randomly chosen prefix (key) of mapping *m*.

    Raises IndexError (from random.choice) if *m* is empty.
    """
    return random.choice(list(m.keys()))


def random_suffix(m, prefix):
    """Return a randomly chosen suffix recorded in *m* for *prefix*.

    Raises KeyError if *prefix* is not in *m*.
    """
    return random.choice(m[prefix])


#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#

def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the data file associated with *path*.

    The unigram section is written first, then an empty line as the
    section separator, then the bigram section.
    """
    with _open_gram_file(path, "w") as f:
        _write_mapping(f, unigram_mapping)
        print(file=f)
        _write_mapping(f, bigram_mapping)


def read_grams(path):
    """Read the data file associated with *path*.

    Returns a ``(unigram_mapping, bigram_mapping)`` tuple.  Raises
    IOError if the file cannot be opened or contains bad data.
    """
    unigram_mapping = newMapping()
    bigram_mapping = newMapping()
    # Use a context manager so the file is closed even when
    # _read_mapping raises on bad data.
    with _open_gram_file(path) as f:
        line_number = 0
        # Unigram prefixes have one word, bigram prefixes have two.
        # The running line number is threaded through both sections
        # so error messages refer to the position in the whole file.
        line_number = _read_mapping(f, unigram_mapping, 1, line_number)
        line_number = _read_mapping(f, bigram_mapping, 2, line_number)
    return unigram_mapping, bigram_mapping


# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!
def _open_gram_file(path, mode="r"):
    """Return the file object for the data file associated with *path*.

    The data file name is *path* with its extension replaced by
    ``.tsv`` (or with ``.tsv`` appended when *path* has no extension).
    *mode* is passed through to open().
    """
    import os.path

    root, _ext = os.path.splitext(path)
    tsv_name = root + ".tsv"
    return open(tsv_name, mode)


def _write_mapping(f, m):
    """Write mapping *m* to file *f*, one line per prefix.

    Each line holds the tab-separated items of the prefix followed by
    the tab-separated suffixes recorded for it.
    """
    for prefix, suffixes in m.items():
        print("%s\t%s" % ('\t'.join(prefix), '\t'.join(suffixes)), file=f)


def _read_mapping(f, m, prefix_length, line_number):
    """Read one section of lines from *f* into mapping *m*.

    Reading stops at the first blank line (the section separator) or
    at end of file.  On every other line the first *prefix_length*
    tab-separated fields form the prefix and each remaining field is
    added as a suffix of that prefix.

    *line_number* is the number of lines already consumed from *f*;
    the updated count (including a consumed separator line) is
    returned.  Raises IOError if a non-blank line has no suffix field.
    """
    for line in f:
        line_number += 1
        # NOTE: strip() also removes leading tabs, so prefix items
        # that are empty strings do not survive a write/read cycle.
        stripped = line.strip()
        if not stripped:
            # Empty line marks end of section
            break
        values = stripped.split('\t')
        if len(values) <= prefix_length:
            # A non-blank line without at least one suffix field is
            # bad data; it must not be mistaken for the section
            # separator, which would silently drop the rest of the
            # section.
            #
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
        prefix = tuple(values[:prefix_length])
        for suffix in values[prefix_length:]:
            addSuffix(m, prefix, suffix)
    return line_number