"""Module for reading and writing unigram and bigram data
using tab-separated-value (TSV) format."""

from __future__ import print_function

import os.path

# We choose to put these functions together in a single module
# because changing one immediately requires changing the other
# to match.  If the functions are dispersed into individual
# programs, we would need to make the same modification in all
# programs instead of just in this one module.


def write_grams(path, m1, m2):
    """Write the unigram and bigram tables to the data file for `path`.

    The output has one line per `m1` entry (prefix followed by its
    suffixes, tab-separated), then a blank separator line, then one line
    per `m2` entry (both prefix words followed by the suffixes).

    Args:
        path: Path of the text the data belongs to; the data itself is
            written next to it with a ".tsv" extension.
        m1: Mapping from a prefix string to an iterable of suffix strings.
        m2: Mapping from a 2-element prefix sequence to an iterable of
            suffix strings.

    Note:
        Values are written verbatim; tabs or newlines inside a word are
        not escaped and would corrupt the file layout.
    """
    with _open_gram_file(path, "w") as f:
        for prefix, suffix in m1.items():
            print("%s\t%s" % (prefix, '\t'.join(suffix)), file=f)
        # A blank line separates the m1 section from the m2 section.
        print(file=f)
        for prefix, suffix in m2.items():
            print("%s\t%s\t%s" % (prefix[0], prefix[1], '\t'.join(suffix)),
                  file=f)


def read_grams(path):
    """Read the unigram and bigram tables from the data file for `path`.

    Args:
        path: Path of the text the data belongs to; the data is read from
            the file with the same root and a ".tsv" extension.

    Returns:
        A tuple `(m1, m2)`.  `m1` maps a prefix string to a list of suffix
        strings; `m2` maps a `(word, word)` tuple to a list of suffix
        strings.

    Raises:
        IOError: If the file cannot be opened, or if a non-blank line has
            too few fields (fewer than 2 in the m1 section, fewer than 3
            in the m2 section).
    """
    # We throw an IOError when we encounter data that we cannot
    # handle because it's the same type of exception thrown when
    # a file is missing.  Any callers can catch both errors with
    # a single except clause.  This is nice since there is no
    # need to distinguish the error types (no file, bad file)
    # since the end effect is the same: there's no data available.
    m1 = {}
    m2 = {}
    line_number = 0
    # The with-block guarantees the file is closed even when a malformed
    # line makes us raise part-way through.
    with _open_gram_file(path) as f:
        for line in f:
            line_number += 1
            values = _split_fields(line)
            if not values:
                # Blank line: end of the m1 section.
                break
            if len(values) < 2:
                raise IOError("unexpected m1 data at line %d" % line_number)
            m1[values[0]] = values[1:]

        # Continue from where the first loop stopped.
        for line in f:
            line_number += 1
            values = _split_fields(line)
            if not values:
                # Blank line: end of the m2 section.
                break
            if len(values) < 3:
                raise IOError("unexpected m2 data at line %d" % line_number)
            m2[(values[0], values[1])] = values[2:]
    return m1, m2


def _split_fields(line):
    # Return the tab-separated fields of a line, or [] for a blank line.
    # Surrounding whitespace (including the newline) is stripped first.
    stripped = line.strip()
    if not stripped:
        return []
    return stripped.split('\t')


def _open_gram_file(path, mode="r"):
    # Return the file object to the data file associated with
    # the text at the given path: same root, extension replaced
    # by ".tsv".
    root = os.path.splitext(path)[0]
    return open(root + ".tsv", mode)


#
# Shift is placed in this file because it is often used by
# calling scripts that use either read_grams or write_grams
#
def shift(prefix, word):
    """Return `prefix` without its first item and with `word` appended.

    `prefix` must be a tuple, since the result is built by concatenating
    a slice of it with a 1-tuple.
    """
    return prefix[1:] + (word,)