challenge1.py

# Watermarks from JVCI's synthetic bacterium
# See Figure S1 in http://www.sciencemag.org/content/suppl/2010/05/18/science.1190719.DC1/Gibson.SOM.pdf

wm1 = "GTTCGAATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAATACTGTGCGCTTGACTGTGATCCTGATAAATAACTTCTTCTGTAGGGTAGAGTTTTATTTAAGGCTACTCACTGGTTGCAAACCAATGCCGTACATTACTAGCTTGATCCTTGGTCGGTCATTGGGGGATATCTCTTACTAATAGAGCGGCCTATCGCGTATTCTCGCCGGACCCCCCTCTCCCACACCAGCGGTGTAGCATCACCAAGAAAATGAGGGGAACGGATGAGGAACGAGTGGGGGCTCATTGCTGATCATAATGACTGTTTATATACTAATGCCGTCAACTGTTTGCTGTGATACTGTGCTTTCGAGGGCGGGAGATTCGTTTTTGACATACATAAATATCATGACAAAACAGCCGGTCATGACAAAACAGCCGGTCATAATAGATTAGCCGGTGACTGTGAAACTAAAGCTACTAATGCCGTCAATAAATATGATAATAGCAACGGCACTGACTGTGAAACTAAAGCCGGCACTCATAATAGATTAGCCGGAGTCGTATTCATAGCCGGTAGATATCACTATAAGGCCCAGGATCATGATGAACACAGCACCACGTCGTCGTCCGAGTTTTTTTGCTGCGACGTCTATACCACGGAAGCTGATCATAAATAGTTTTTTTGCTGCGGCACTAGAGCCGGACAAGCACACTACGTTTGTAAATACATCGTTCCGAATTGTAAATAATTTAATTTCGTATTTAAATTATATGATCACTGGCTATAGTCTAGTGATAACTACAATAGCTAGCAATAAGTCATATATAACAATAGCTGAACCTGTGCTACATATCCGCTATACGGTAGATATCACTATAAGGCCCAGGACAATAGCTGAACTGACGTCAGCAACTACGTTTAGCTTGACTGTGGTCGGTTTTTTTGCTGCGACGTCTATACGGAAGCTCATAACTATAAGAGCGGCACTAGAGCCGGCACACAAGCCGGCACAGTCGTATTCATAGCCGGCACTCATGACAAAACAGC"

wm2 = "CAACTGGCAGCATAAAACATATAGAACTACCTGCTATAAGTGATACAACTGTTTTCATAGTAAAACATACAACGTTGCTGATAGTACTCCTAAGTGATAGCTTAGTGCGTTTAGCATATATTGTAGGCTTCATAATAAGTGATATTTTAGCTACGTAACTAAATAAACTAGCTATGACTGTACTCCTAAGTGATATTTTCATCCTTTGCAATACAATAACTACTACATCAATAGTGCGTGATATGCCTGTGCTAGATATAGAACACATAACTACGTTTGCTGTTTTCAGTGATATGCTAGTTTCATCTATAGATATAGGCTGCTTAGATTCCCTACTAGCTATTTCTGTAGGTGATATACGTCCATTGCATAAGTTAATGCATTTAACTAGCTGTGATACTATAGCATCCCCATTCCTAGTGCATATTTTCATCCTAGTGCTACGTGATATAATTGTACTAATGCCTGTAGATAATTTAATGCCTGGCTCGTTTGTAGGTGATAATTTAGTGCCTGTAAAACATATACCTGAGTGCTCGTTGCGTGATAGTTCGTTCATGCATATACAACTAGGCTGCTGTGATATGGTCACTGCCCTTACTGTGCTACATATTACTGCGAGGGGGATGACGTATAAACCTGTTGTAAGTGATATGACGTATATAACTACTAGTGATATGACGTATAGGCTAGAACAACGTGATATGACGTATATGACTACTGTCCCAAACATCAGTGATATGACGTATACTATAATTTCTATAATAGTGATAAATAAACCTGGGCTAAATACGTTCCTGAATACGTGGCATAAACCTGGGCTAACGAGGAATACCCATAGTTTAGCAATAAGCTATAGTTCGTCATTTTTAA"

wm3 = "TTTAACCATATTTAAATATCATCCTGATTTTCACTGGCTCGTTGCGTGATATAGATTCTACTGTAGTGCTAGATAGTTCTGTACTAGGTGATACTATAGATTTCATAGATAGCACTACTGGCTTCATGCTAGGCATCCCAATAGCTAGTGATAGTTTAGTGCATACAACGTCATGTGATACAACGTTGCTGGCTGTAGATACAACGTCGTATTCTGTAAGTGATACAATAGCTATTGCTGTGCATAGGCCTATAGTGGCTGTAACTAGTGATATCACGTAACAACCATATAAGTTAGATTTAATGCCCCTGACTGAACGCTCGTTGCGTGATAGTTTAGGCTCGTTGCATACAACTGTGATTTTCATAAAACAACGTGATAATTTAGTGCTAGATAAGTTCCGCTTAGCAAGTGATAGTTTCCGCTTGACTGTGCATAGTTCGTTCATGCGCTCGTTGCGTGATAAACTAGGCAGCTTCACAACTGATAATTTAATTGCTGATATTGCTGGCTGTCTAGTGCTAGTGATCATAGTGCGTGATAGTTTAAGCTGCTCTGTTTTAGATATCACGTGCTTGATAATGAAACTAACTAGTGATACTACGTAGTTAACTATGAATAGGCCTACTGTAAATTCAATAGTGCGTGATATTGAACTAGATTCTGCAACTGCTAATATGCCGTGCTGCACGTTTGGTGATAGTTTAGCATGCTTCACTATAATAAATATGGTAGTTGTAACTACTGCGAATAGGGGGAGCTTAATAAATATGATCACTGTGCTACGCTATATGCCGTTGAATATAGGCTATATGATCATAACATATATAGCTATAAGTGATAAGTTCCTGAATATAGGCTATATGATCATAACATATACAACTGTACTCATGAATAAGTTAACGAGGA"

wm4 = "TTTCATTGCTGATCACTGTAGATATAGTGCATTCTATAAGTCGCTCCCACAGGCTAGTGCTGCGCACGTTTTTCAGTGATATTATCCTAGTGCTACATAACATCATAGTGCGTGATAAACCTGATACAATAGGTGATATCATAGCAACTGAACTGACGTTGCATAGCTCAACTGTGATCAGTGATATAGATTCTGATACTATAGCAACGTTGCGTGATATTTTCACTACTGGCTTGACTGTAGTGCATATGATAGTACGTCTAACTAGCATAACTAGTGATAGTTATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAGTGATATAATCCAACTAGATAGTCCTGAACTGATCCCTATGCTAACTAGTGATAAACTAACTGATACATCGTTCCTGCTACGTGATAGCTTCACTGAGTTCCATACATCGTCGTGCTTAAACATCAGTGATAACACTATAGAGTTCATAGATACTGCATTAACTAGTGATATGACTGCAAATAGCTTGACGTTTTGCAGTCTAAAACAACGTGATAATTCTGTAGTGCTAGATACTATAGATTTCCTGCTAAGTGATAAGTCTACTGATTTACTAATGAATAGCTTGGTTTTGGCATACACTGTGCGCTGCACTGGTGATAGCTTTTCGTTGATGAATAATTTCCCTAGCACTGTGCGTGATATGCTAGATTCTGTAGATAGGCTAAATTCGTCTACGTTTGTAGGTGATAGTTTAGTTGCTGTAACTAATATTATCCCTGTGCCGTTGCTAAGCTGTGATATCATAGTGCTGCTAGATATGATAAGCAAACTAATAGAGTCGAGGGGGAGTCTCATAGTGAATACTGATATTTTAGTGCTGCCGTTGAATAAGTTCCCTGAACATTGTGATACTGATATTTTAGTGCTGCCGTTGAATATCCTGCATTTAACTAGCTTGATAGTGCATTCGAGGAATACCCATACTACTGTTTTCATAGCTAATTATAGGCTAACATTGCCAATAGTGC"

wm_list = [ wm1, wm2, wm3, wm4 ]

hint_list = [
	"TO LIVE, TO ERR, TO FALL, TO TRIUMPH, TO RECREATE LIFE OUT OF LIFE.",
	"SEE THINGS NOT AS THEY ARE, BUT AS THEY MIGHT BE.",
	"WHAT I CANNOT BUILD, I CANNOT UNDERSTAND." ,
# If we add the hints below, the results look better.
# These hints are guesses made after looking at the initial
# results generated using only the hints above.
#	"ABCDEFGHIJKLMNOPQRSTUVWXYZ\n 0123456789",
#	"<A HREF=\"MAILTO:MROQSTIZ@JCVI.ORG\">HERE!</A>",
]

# These are guesses for individual triplet-to-letter maps
# that are difficult to incorporate into hints (without
# making the hints so large as to make the exercise pointless).
known = {
#	"CCC":"-",
#	"GAA":"'"
}

# This a much simpler test case of "watermarks" and "hints"
# where we know the right answer
if 0:
	wm_list = [ "AAABBBAAAAAABBB" ]
	hint_list = [ "XYXX" ]

# Assumptions:
#   Letters are encoded by DNA triplets
#   Only uppercase letters are used (from the hints, may not be correct)
#   Simple substitution cipher
# Approach:
#   Each hint defines a certain pattern of appearance for letters.
#   If we find the same pattern for triplets in the watermarks, we
#   can create a triplet-to-letter mapping.  There may be multiple
#   mappings that satisfy the hints, so we want to construct all
#   mappings from all hints and then combine them if they are
#   consistent with each other.  Finally, we decode the watermarks
#   using the combined mapping.
#   Problems:
#   (a) There may be more than one mapping that satisfy the hint
#   patterns.  In this case, we will have multiple "plaintext" and
#   we have to figure out which one is correct by comparing them.
#   (b) There will be missing mappings, ie triplets for which we
#   do not have a corresponding letter.  This is because we only have
#   a subset of letters in the hints.  For example, J and K do not
#   appear in any hints, so there is no direct way to identify the
#   triplet patterns that correspond to J and K.  However, we can
#   examine the first-round results and guess where certain letters
#   should be; for example, if we see ABCDEFGHI__LMNOPQ, we can be
#   fairly certain that the missing letters are J and K.  We can then
#   expand the set of hints by including these guessed patterns; in
#   the above example, we would add a hint of ABCDEFGHIJKLMNOPQRSTUVWXYZ.
#   Note that any unmapped triplet that only appear once in the
#   watermarks cannot be mapped algorithmically; we can only guess
#   at what it is likely to be based on patterns that we recognize
#   using contexts other than the hints.
#   

def main():

	# Get possible mappings
	letter2triplet_mappings = get_mapping(hint_list, wm_list)

	# Decode watermarks using each possible mapping
	for letter2triplet in letter2triplet_mappings:
		triplet2letter = invert_mapping(letter2triplet)
		triplet2letter.update(known.items())
		for wm in wm_list:
			for r in apply_mapping(triplet2letter, wm):
				if r:
					print r
			print

def get_mapping(hint_list, wm_list):

	# Loop through hints to get the possible mappings,
	# combining possibilities from all hints.
	mappings = None
	for hint in hint_list:
		m = process_hint(hint, wm_list)
		if mappings is None:
			mappings = m
		else:
			mappings = reconcile(mappings, m)

	# Check whether results are reasonable
	if len(mappings) == 0:
		print "ERROR: cannot find any suitable mapping"
	elif len(mappings) > 1:
		print "WARNING: %d mappings found" % len(m)
	return mappings

def process_hint(hint, wm_list):

	# Use the pattern defined by "hint" and apply it to each
	# watermark in "wm_list" to see if we can generate
	# triplet-to-letter mappings.
	mappings = list()
	for wm in wm_list:
		mappings.extend(find_mappings(build_pattern(hint), hint, wm))

	# If the hint appears twice in a watermark, or it appears
	# in multiple watermarks, we will generate a mapping for
	# each time it appears.  If multiple mappings are identical,
	# we only need one copy.
	mappings = unique_mappings(mappings)
	return mappings

def unique_mappings(orig_mappings):

	# Remove any duplicate mappings in the list
	mappings = list()
	for om in orig_mappings:
		for m in mappings:
			if om == m:
				break
		else:
			mappings.append(om)
	return mappings

def build_pattern(hint):

	# A pattern is a list of indices of the same length as the hint.
	# The value of the pattern at index i is the index 
	# in "hint" where hint[i] last occurs.  For example, "AAB"
	# has a pattern of [ 1, 1, 2 ].  When we want to see if a
	# string matches our pattern, we check if string[i] ==
	# string[pattern[i]].  (There is a hole in this logic where
	# do not enforce that two different letters must _not_ have
	# the same mapping, but the mapping reconciliation code should
	# catch that and eliminate the bad mapping.)
	letters = dict( [ (letter, i)
				for i, letter in enumerate(hint) ] )
	pattern = [ letters[letter] for letter in hint ]
	return pattern

def find_mappings(pattern, hint, wm):

	# Dividing the watermark into triplets, find triplets that
	# exhibit the sequence defined by "pattern"
	mappings = list()
	unique_indices = set(pattern)
	triplet_length = len(pattern) * 3
	for i in range(0, len(wm) - triplet_length + 1):
		if pattern_match(pattern, wm, i):
			mappings.append(mapping(unique_indices, hint, wm, i))
	return mappings

def triplet(wm, start, i):

	# Return the triplet in "watermark" at position "i" when
	# the pattern begins at "start".
	wm_i = start + i * 3
	return wm[wm_i:wm_i+3]

def pattern_match(pattern, wm, start):

	# Get len(pattern) triplets from wm starting at offset "start"
	# and see if they match pattern.  (See comments in "build_pattern"
	# above for how pattern matching occurs.)
	for i, first_i in enumerate(pattern):
		if first_i == i:
			# Letter always equal to itself
			continue
		if triplet(wm, start, i) != triplet(wm, start, first_i):
			return False
	return True

def mapping(unique_indices, hint, wm, start):

	# Assuming that the pattern for "hint" matches watermark
	# "wm" at offset "start", create the letter-to-triplet
	# mapping.  Only indices in "unique_indices" need to be
	# processed because all other indices are redundant.
	return dict( [ (hint[i], triplet(wm, start, i))
				for i in unique_indices ] )

def reconcile(mappings1, mappings2):

	# Each group of mappings consist of letter-to-triplet mappings
	# Take all pairs of mapping, one from each group, and check
	# whether they are compatible.  If so, create and save a
	# combined mapping.
	mappings = list()
	for m1 in mappings1:
		for m2 in mappings2:
			m = combine(m1, m2)
			if m:
				mappings.append(m)
	return unique_mappings(mappings)

def combine(m1, m2):

	# Combine two mappings.  If any letter appears in both mappings
	# and map to different triplets, then we return None indicating
	# that the mappings are incompatible.  Otherwise, we create
	# a single mapping that includes the all the correspondences
	# from both mappings.
	m = dict()
	for k in m1.keys() + m2.keys():
		try:
			v1 = m1[k]
		except KeyError:
			m[k] = m2[k]
			continue
		try:
			v2 = m2[k]
		except KeyError:
			m[k] = m1[k]
			continue
		if v1 != v2:
			return None
		m[k] = v1
	return m

def invert_mapping(m):

	# Invert the mapping, making the values into keys and vice
	# versa.  This assumes that the mapping is one-to-one.
	return dict( [ (v, k) for k, v in m.iteritems() ] )

def apply_mapping(m, wm):

	# Assume there is no frame shift and simply apply the
	# triplet-to-letter mapping "m" to watermark "wm"
	return [ substitute(m, wm) ]
	# UNUSED: We can handle frame shifts by returning multiple
	# "plaintext" for each watermark.
	# Return the three possible substitution results
	#return [ substitute(m, wm, offset) for offset in range(3) ]

def substitute(m, wm, offset=0):

	# Apply the correspondences in mapping "m" to watermark
	# "wm" starting at offset "offset".  
	letters = list()
	bad = 0
	for i in range(offset, len(wm) - 2, 3):
		triplet = wm[i:i+3]
		try:
			letters.append(m[triplet])
		except KeyError:
			letters.append('[%s]' % triplet)
			bad = bad + 1

	# If the number of untranslated triplets exceeds a certain
	# threshold, we assume the mapping is incorrect and do not
	# bother to return any plaintext
	if bad > len(wm) / 3 * 0.3:
		return None
	else:
		return ''.join(letters)

if __name__ == "__main__":
	# Run the program if we were invoked as a script
	main()