"""Module for constructing and using unigram and bigram data."""

from __future__ import print_function

import random

# This is a progression from markov4_*.py.  Instead of providing
# only I/O functions, which intimately depend on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.
#
# Whereas markov4_gram.py uses a redundant list as the value of our
# mapping dictionary, we use a histogram here; the tradeoff is that
# we are using less memory (by not storing redundant words) at the
# expense of computation time (when generating random suffixes).  This
# is an example of how we changed the _implementation_ without
# disturbing the _interface_, so that markov5_prep.py and markov5_use.py
# are unchanged from their markov4 counterparts except for importing
# functions from markov5_gram instead of markov4_gram.  (In normal
# development, version numbers generally do not appear in file names
# and no changes in the calling modules would be necessary.)


def shift(prefix, word):
    """Return a new prefix: drop the first element and append word.

    prefix must be a tuple; the result is a tuple of the same length.
    """
    return prefix[1:] + (word,)


def newMapping():
    """Return a new, empty prefix-to-suffix mapping."""
    return {}


def addSuffix(m, prefix, suffix):
    """Record one more occurrence of suffix following prefix in m.

    Each prefix maps to a histogram (dict of suffix -> count); the
    histogram is created the first time a prefix is seen.
    """
    histogram = m.setdefault(prefix, {})
    histogram[suffix] = histogram.get(suffix, 0) + 1


def random_prefix(m):
    """Return one of the prefixes stored in m, chosen uniformly.

    random.choice raises IndexError if m is empty.
    """
    return random.choice(list(m.keys()))


def random_suffix(m, prefix):
    """Return a random suffix for prefix, weighted by its count.

    Raises KeyError if prefix is not in m.
    """
    # Expand the histogram back into a redundant list so that a
    # uniform choice over the list is a weighted choice over suffixes.
    # This is the computation-time cost mentioned at the top of the file.
    expanded = []
    for suffix, count in m[prefix].items():
        expanded.extend([suffix] * count)
    return random.choice(expanded)


#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#


def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the data file associated with path.

    The unigram section comes first, then one empty line, then the
    bigram section.  Each data line is a prefix followed by its
    suffixes, all tab-separated.
    """
    with _open_gram_file(path, "w") as f:
        _write_mapping(f, unigram_mapping)
        print(file=f)    # Empty line marks end of the unigram section
        _write_mapping(f, bigram_mapping)


def read_grams(path):
    """Read the data file associated with path.

    Returns a (unigram_mapping, bigram_mapping) tuple.  Raises IOError
    if the file cannot be opened or contains a line that cannot be
    parsed.
    """
    unigram_mapping = newMapping()
    bigram_mapping = newMapping()
    # "with" guarantees the file is closed even when _read_mapping
    # raises IOError on bad data.
    with _open_gram_file(path) as f:
        line_number = _read_mapping(f, unigram_mapping, 1, 0)
        _read_mapping(f, bigram_mapping, 2, line_number)
    return unigram_mapping, bigram_mapping


# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!

def _open_gram_file(path, mode="r"):
    """Return the file object for the data file associated with path.

    The data file has the same name as path with its extension
    replaced by ".tsv".
    """
    import os.path
    root, _ext = os.path.splitext(path)
    return open(root + ".tsv", mode)


def _expand_suffixes(suffixes):
    """Return the suffixes of one prefix as a list with repetitions.

    A histogram (dict of suffix -> count) is expanded so that every
    suffix appears count times; any other iterable of suffixes is
    returned as a plain list.
    """
    if not isinstance(suffixes, dict):
        return list(suffixes)
    expanded = []
    for suffix, count in suffixes.items():
        expanded.extend([suffix] * count)
    return expanded


def _write_mapping(f, m):
    """Write one line per prefix of m to the open file f.

    Each suffix is written as many times as it was counted, so that
    _read_mapping (which calls addSuffix once per field) rebuilds the
    same counts.  Joining the histogram directly would write every
    suffix only once and lose the frequencies.
    """
    for prefix, suffixes in m.items():
        print("%s\t%s" % ('\t'.join(prefix),
                          '\t'.join(_expand_suffixes(suffixes))), file=f)


def _read_mapping(f, m, prefix_length, line_number):
    """Read one section from f into m; return the updated line number.

    The first prefix_length fields of each line form the prefix and
    every remaining field is added as a suffix.  Reading stops at the
    end of the section or at the end of the file.  line_number is the
    number of lines already consumed from f and is only used for
    error messages.
    """
    for line in f:
        line_number += 1
        values = line.strip().split('\t')
        if len(values) == 1:
            # Empty line marks end of section.  (Any line with a
            # single field is treated the same way.)
            break
        elif len(values) > prefix_length:
            prefix = tuple(values[:prefix_length])
            for suffix in values[prefix_length:]:
                addSuffix(m, prefix, suffix)
        else:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
    return line_number