pattern.py

class Pattern:
	"""Pattern is a sequence of values that define which indices
	must have the same values.  For example, "AAB" means positions
	0 and 1 must be the same because both are "A"s."""

	def __init__(self, pattern):
		"""Initialize a Pattern instance.

		'watermark' - value representing a watermark,
		typically a string."""
		raise NotImplementedError("Pattern.__init__")

	def length(self):
		"""Return length of pattern."""
		raise NotImplementedError("Pattern.length")

	def value(self, i):
		"""Return pattern value at index 'i'.

		'i' - value index, integer."""
		raise NotImplementedError("Pattern.value")

	def find_all_mappings(self, watermark):
		"""Return a list of mapping.Mapping instances that
		represent all possible mappings from watermark values
		to pattern values.  Watermarks must be longer than
		patterns and there may possibly be multiple mappings
		in different parts of the watermark for the same
		pattern.
		
		'watermark' - watermark to search against, an instance
		supporting watermark.Watermark API."""
		raise NotImplementedError("Pattern.find_all_mappings")

class StringPattern(Pattern):

	def __init__(self, pattern):
		self.pattern = pattern		# public
		v2i = {}			# value-to-indices map
		for i in range(self.length()):
			v = self.value(i)
			try:
				v2i[v].append(i)
			except KeyError:
				v2i[v] = [ i ]
		# _same_values is a list of 2-tuples where the first
		# element is the pattern value and the second is the
		# list of indices where the value occurs.  We keep
		# lists even with only one index because it is also
		# used to build the watermark-pattern value mapping.
		self._same_values = [ (v, indices)
					for v, indices in v2i.items() ]

	def length(self):
		return len(self.pattern)

	def value(self, i):
		return self.pattern[i]

	def find_all_mappings(self, watermark):
		from mapping import MappingSet
		mappings = MappingSet()
		# Check each possible starting index for mapping
		for start in watermark.pattern_start_indices(self.length()):
			m = self._find_mapping(watermark, start)
			if m:
				mappings.add(m)
		return mappings

	def _find_mapping(self, watermark, start):
		# Return a mapping.Mapping instance if watermark
		# (starting at 'start') matches pattern, or None
		# if it does not match.
		from mapping import Mapping
		m = Mapping()
		w_values = watermark.values(start, self.length())
		for pattern_value, indices in self._same_values:
			# Add mapping from watermark to pattern value.
			watermark_value = w_values[indices[0]]
			m[watermark_value] = pattern_value
			for i in indices[1:]:
				# Watermark values must match at the same
				# indices as pattern values.  If any does
				# not, there cannot be any mapping.
				if w_values[i] != watermark_value:
					return None
		return m

if __name__ == "__main__":
	import unittest
	class TestPattern(unittest.TestCase):

		def setUp(self):
			self.p = StringPattern("XXYY")
			from watermark import DNAWatermark
			self.wm1 = DNAWatermark("TTTATGATGCGGCGGAAA")
			self.wm2 = DNAWatermark("ATGATGCGGCGGAAA")
			self.wm3 = DNAWatermark("AAAATGATGCGGCGG")
			self.wnm = DNAWatermark("TTTATGGTGCGGCGGAAA")

		def test_match(self):
			from mapping import Mapping, MappingSet
			answer = MappingSet([ Mapping({'ATG': 'X',
							'CGG': 'Y'}) ])
			ms = self.p.find_all_mappings(self.wm1)
			self.assertEqual(ms, answer)
			ms = self.p.find_all_mappings(self.wm2)
			self.assertEqual(ms, answer)
			ms = self.p.find_all_mappings(self.wm3)
			self.assertEqual(ms, answer)

		def test_no_match(self):
			ms = self.p.find_all_mappings(self.wnm)
			self.assertEqual(len(ms), 0)
	unittest.main()