"""Module for constructing and using unigram and bigram data."""

from __future__ import print_function

import os.path
import random

# This is a progression from markov5_*.py.  Instead of providing
# only I/O functions, which intimately depends on the actual data
# structures passed, we "abstract" the concepts of how to construct
# and use prefix-suffix mappings.  This time, we use a class instead
# of a plain dictionary to store the mapping; in addition, we also
# keep track of the length of the prefixes that are the keys to
# the dictionary so that we can detect usage errors.


def shift(prefix, word):
    """Return a new prefix: drop the first item of *prefix*, append *word*.

    *prefix* must be a tuple; the result is a tuple of the same length.
    """
    return prefix[1:] + (word,)


class Mapping:
    "prefix-suffix map"
    # Instances are populated by newMapping() with two attributes:
    #   prefix_length -- number of words in every prefix key
    #   map           -- dict: prefix tuple -> {suffix: occurrence count}


def newMapping(prefix_length):
    """Return an empty Mapping whose prefixes have *prefix_length* words."""
    m = Mapping()
    m.prefix_length = prefix_length
    m.map = {}
    return m


def addSuffix(m, prefix, suffix):
    """Record one more occurrence of *suffix* following *prefix* in *m*.

    Raises AssertionError if *prefix* does not have the length that
    *m* was created with.
    """
    assert len(prefix) == m.prefix_length
    counts = m.map.setdefault(prefix, {})
    counts[suffix] = counts.get(suffix, 0) + 1


def random_prefix(m):
    """Return one of the prefixes stored in *m*, chosen uniformly."""
    return random.choice(list(m.map))


def random_suffix(m, prefix):
    """Return a suffix recorded for *prefix*, weighted by its count.

    Raises AssertionError if *prefix* has the wrong length for *m*
    and KeyError if *prefix* has never been added to *m*.
    """
    assert len(prefix) == m.prefix_length
    # Repeat each suffix "count" times so that a uniform choice over
    # the expanded list is a count-weighted choice over the suffixes.
    weighted = [
        suffix
        for suffix, count in m.map[prefix].items()
        for _ in range(count)
    ]
    return random.choice(weighted)


#
# Now the I/O functions can use the mapping functions without
# worrying about the actual representations being used since
# they only call the mapping functions and do not try to
# access any content directly.
#

def write_grams(path, unigram_mapping, bigram_mapping):
    """Write both mappings to the .tsv file associated with *path*.

    The unigram section comes first, then an empty line, then the
    bigram section.
    """
    with _open_gram_file(path, "w") as f:
        _write_mapping(f, unigram_mapping)
        print(file=f)
        _write_mapping(f, bigram_mapping)


def read_grams(path):
    """Read the .tsv file associated with *path*.

    Returns a (unigram_mapping, bigram_mapping) tuple.  Raises IOError
    if the file cannot be opened or if it contains bad data.
    """
    unigram_mapping = newMapping(1)
    bigram_mapping = newMapping(2)
    # "with" guarantees the file is closed even when _read_mapping
    # raises IOError because of bad data.
    with _open_gram_file(path) as f:
        line_number = _read_mapping(f, unigram_mapping, 0)
        _read_mapping(f, bigram_mapping, line_number)
    return unigram_mapping, bigram_mapping


# Note the use of leading _ for names of functions that should
# only be used by functions within this module.  This is a Python
# convention.  So if you import a module and use a function whose
# name begins with _, you are not using the module API and, if
# your code breaks because the module writer changes or even
# removes that function, IT'S YOUR FAULT!

def _open_gram_file(path, mode="r"):
    """Return the file object for the data file associated with
    the text at the given path.

    The data file has the same name as *path* with its extension
    replaced by ".tsv".
    """
    root, _ = os.path.splitext(path)
    return open(root + ".tsv", mode)


def _write_mapping(f, m):
    """Write mapping *m* to file *f*, one tab-separated line per prefix.

    Each line holds the words of the prefix followed by its suffixes.
    A suffix is written once per recorded occurrence so that
    _read_mapping, which calls addSuffix for every suffix field,
    restores the original counts.
    """
    for prefix, counts in m.map.items():
        fields = list(prefix)
        for suffix, count in counts.items():
            fields.extend([suffix] * count)
        print('\t'.join(fields), file=f)


def _read_mapping(f, m, line_number):
    """Read one section from file *f* into mapping *m*.

    Reading stops at an empty line or at end of file.  *line_number*
    is the number of lines already consumed from *f*; the updated
    count is returned so that it can be passed to the next call.
    Raises IOError if a line has too few fields.
    """
    for line in f:
        line_number += 1
        stripped = line.strip()
        if not stripped:
            # Empty line marks end of section
            break
        values = stripped.split('\t')
        if len(values) <= m.prefix_length:
            # We throw an IOError when we encounter data that
            # we cannot handle because it's the same type of
            # exception thrown when a file is missing.  Any
            # callers can catch both errors with a single
            # except clause.  This is nice since there is no
            # need to distinguish the error types (no file,
            # bad file) since the end effect is the same:
            # there's no data available.
            raise IOError("bad data at line %d" % line_number)
        prefix = tuple(values[:m.prefix_length])
        for suffix in values[m.prefix_length:]:
            addSuffix(m, prefix, suffix)
    return line_number