challenge1.py

# Watermarks from JVCI's synthetic bacterium
# See Figure S1 in http://www.sciencemag.org/content/suppl/2010/05/18/science.1190719.DC1/Gibson.SOM.pdf

wm1 = "GTTCGAATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAATACTGTGCGCTTGACTGTGATCCTGATAAATAACTTCTTCTGTAGGGTAGAGTTTTATTTAAGGCTACTCACTGGTTGCAAACCAATGCCGTACATTACTAGCTTGATCCTTGGTCGGTCATTGGGGGATATCTCTTACTAATAGAGCGGCCTATCGCGTATTCTCGCCGGACCCCCCTCTCCCACACCAGCGGTGTAGCATCACCAAGAAAATGAGGGGAACGGATGAGGAACGAGTGGGGGCTCATTGCTGATCATAATGACTGTTTATATACTAATGCCGTCAACTGTTTGCTGTGATACTGTGCTTTCGAGGGCGGGAGATTCGTTTTTGACATACATAAATATCATGACAAAACAGCCGGTCATGACAAAACAGCCGGTCATAATAGATTAGCCGGTGACTGTGAAACTAAAGCTACTAATGCCGTCAATAAATATGATAATAGCAACGGCACTGACTGTGAAACTAAAGCCGGCACTCATAATAGATTAGCCGGAGTCGTATTCATAGCCGGTAGATATCACTATAAGGCCCAGGATCATGATGAACACAGCACCACGTCGTCGTCCGAGTTTTTTTGCTGCGACGTCTATACCACGGAAGCTGATCATAAATAGTTTTTTTGCTGCGGCACTAGAGCCGGACAAGCACACTACGTTTGTAAATACATCGTTCCGAATTGTAAATAATTTAATTTCGTATTTAAATTATATGATCACTGGCTATAGTCTAGTGATAACTACAATAGCTAGCAATAAGTCATATATAACAATAGCTGAACCTGTGCTACATATCCGCTATACGGTAGATATCACTATAAGGCCCAGGACAATAGCTGAACTGACGTCAGCAACTACGTTTAGCTTGACTGTGGTCGGTTTTTTTGCTGCGACGTCTATACGGAAGCTCATAACTATAAGAGCGGCACTAGAGCCGGCACACAAGCCGGCACAGTCGTATTCATAGCCGGCACTCATGACAAAACAGC"

wm2 = "CAACTGGCAGCATAAAACATATAGAACTACCTGCTATAAGTGATACAACTGTTTTCATAGTAAAACATACAACGTTGCTGATAGTACTCCTAAGTGATAGCTTAGTGCGTTTAGCATATATTGTAGGCTTCATAATAAGTGATATTTTAGCTACGTAACTAAATAAACTAGCTATGACTGTACTCCTAAGTGATATTTTCATCCTTTGCAATACAATAACTACTACATCAATAGTGCGTGATATGCCTGTGCTAGATATAGAACACATAACTACGTTTGCTGTTTTCAGTGATATGCTAGTTTCATCTATAGATATAGGCTGCTTAGATTCCCTACTAGCTATTTCTGTAGGTGATATACGTCCATTGCATAAGTTAATGCATTTAACTAGCTGTGATACTATAGCATCCCCATTCCTAGTGCATATTTTCATCCTAGTGCTACGTGATATAATTGTACTAATGCCTGTAGATAATTTAATGCCTGGCTCGTTTGTAGGTGATAATTTAGTGCCTGTAAAACATATACCTGAGTGCTCGTTGCGTGATAGTTCGTTCATGCATATACAACTAGGCTGCTGTGATATGGTCACTGCCCTTACTGTGCTACATATTACTGCGAGGGGGATGACGTATAAACCTGTTGTAAGTGATATGACGTATATAACTACTAGTGATATGACGTATAGGCTAGAACAACGTGATATGACGTATATGACTACTGTCCCAAACATCAGTGATATGACGTATACTATAATTTCTATAATAGTGATAAATAAACCTGGGCTAAATACGTTCCTGAATACGTGGCATAAACCTGGGCTAACGAGGAATACCCATAGTTTAGCAATAAGCTATAGTTCGTCATTTTTAA"

wm3 = "TTTAACCATATTTAAATATCATCCTGATTTTCACTGGCTCGTTGCGTGATATAGATTCTACTGTAGTGCTAGATAGTTCTGTACTAGGTGATACTATAGATTTCATAGATAGCACTACTGGCTTCATGCTAGGCATCCCAATAGCTAGTGATAGTTTAGTGCATACAACGTCATGTGATACAACGTTGCTGGCTGTAGATACAACGTCGTATTCTGTAAGTGATACAATAGCTATTGCTGTGCATAGGCCTATAGTGGCTGTAACTAGTGATATCACGTAACAACCATATAAGTTAGATTTAATGCCCCTGACTGAACGCTCGTTGCGTGATAGTTTAGGCTCGTTGCATACAACTGTGATTTTCATAAAACAACGTGATAATTTAGTGCTAGATAAGTTCCGCTTAGCAAGTGATAGTTTCCGCTTGACTGTGCATAGTTCGTTCATGCGCTCGTTGCGTGATAAACTAGGCAGCTTCACAACTGATAATTTAATTGCTGATATTGCTGGCTGTCTAGTGCTAGTGATCATAGTGCGTGATAGTTTAAGCTGCTCTGTTTTAGATATCACGTGCTTGATAATGAAACTAACTAGTGATACTACGTAGTTAACTATGAATAGGCCTACTGTAAATTCAATAGTGCGTGATATTGAACTAGATTCTGCAACTGCTAATATGCCGTGCTGCACGTTTGGTGATAGTTTAGCATGCTTCACTATAATAAATATGGTAGTTGTAACTACTGCGAATAGGGGGAGCTTAATAAATATGATCACTGTGCTACGCTATATGCCGTTGAATATAGGCTATATGATCATAACATATATAGCTATAAGTGATAAGTTCCTGAATATAGGCTATATGATCATAACATATACAACTGTACTCATGAATAAGTTAACGAGGA"

wm4 = "TTTCATTGCTGATCACTGTAGATATAGTGCATTCTATAAGTCGCTCCCACAGGCTAGTGCTGCGCACGTTTTTCAGTGATATTATCCTAGTGCTACATAACATCATAGTGCGTGATAAACCTGATACAATAGGTGATATCATAGCAACTGAACTGACGTTGCATAGCTCAACTGTGATCAGTGATATAGATTCTGATACTATAGCAACGTTGCGTGATATTTTCACTACTGGCTTGACTGTAGTGCATATGATAGTACGTCTAACTAGCATAACTAGTGATAGTTATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAGTGATATAATCCAACTAGATAGTCCTGAACTGATCCCTATGCTAACTAGTGATAAACTAACTGATACATCGTTCCTGCTACGTGATAGCTTCACTGAGTTCCATACATCGTCGTGCTTAAACATCAGTGATAACACTATAGAGTTCATAGATACTGCATTAACTAGTGATATGACTGCAAATAGCTTGACGTTTTGCAGTCTAAAACAACGTGATAATTCTGTAGTGCTAGATACTATAGATTTCCTGCTAAGTGATAAGTCTACTGATTTACTAATGAATAGCTTGGTTTTGGCATACACTGTGCGCTGCACTGGTGATAGCTTTTCGTTGATGAATAATTTCCCTAGCACTGTGCGTGATATGCTAGATTCTGTAGATAGGCTAAATTCGTCTACGTTTGTAGGTGATAGTTTAGTTGCTGTAACTAATATTATCCCTGTGCCGTTGCTAAGCTGTGATATCATAGTGCTGCTAGATATGATAAGCAAACTAATAGAGTCGAGGGGGAGTCTCATAGTGAATACTGATATTTTAGTGCTGCCGTTGAATAAGTTCCCTGAACATTGTGATACTGATATTTTAGTGCTGCCGTTGAATATCCTGCATTTAACTAGCTTGATAGTGCATTCGAGGAATACCCATACTACTGTTTTCATAGCTAATTATAGGCTAACATTGCCAATAGTGC"

wm_list = [ wm1, wm2, wm3, wm4 ]

hint_list = [
    "TO LIVE, TO ERR, TO FALL, TO TRIUMPH, TO RECREATE LIFE OUT OF LIFE.",
    "SEE THINGS NOT AS THEY ARE, BUT AS THEY MIGHT BE.",
    "WHAT I CANNOT BUILD, I CANNOT UNDERSTAND." ,
# If we add the hints below, the results look better.
# These hints are guesses made after looking at the initial
# results generated using only the hints above.
#    "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n 0123456789",
#    "<A HREF=\"MAILTO:MROQSTIZ@JCVI.ORG\">HERE!</A>",
]

# These are guesses for individual triplet-to-letter maps
# that are difficult to incorporate into hints (without
# making the hints so large as to make the exercise pointless).
known = {
#    "CCC":"-",
#    "GAA":"'"
}

# This a much simpler test case of "watermarks" and "hints"
# where we know the right answer
if 0:
    wm_list = [ "AAABBBAAAAAABBB" ]
    hint_list = [ "XYXX" ]

# Assumptions:
#   Letters are encoded by DNA triplets
#   Only uppercase letters are used (from the hints, may not be correct)
#   Simple substitution cipher
# Approach:
#   Each hint defines a certain pattern of appearance for letters.
#   If we find the same pattern for triplets in the watermarks, we
#   can create a triplet-to-letter mapping.  There may be multiple
#   mappings that satisfy the hints, so we want to construct all
#   mappings from all hints and then combine them if they are
#   consistent with each other.  Finally, we decode the watermarks
#   using the combined mapping.
#   Problems:
#   (a) There may be more than one mapping that satisfy the hint
#   patterns.  In this case, we will have multiple "plaintext" and
#   we have to figure out which one is correct by comparing them.
#   (b) There will be missing mappings, ie triplets for which we
#   do not have a corresponding letter.  This is because we only have
#   a subset of letters in the hints.  For example, J and K do not
#   appear in any hints, so there is no direct way to identify the
#   triplet patterns that correspond to J and K.  However, we can
#   examine the first-round results and guess where certain letters
#   should be; for example, if we see ABCDEFGHI__LMNOPQ, we can be
#   fairly certain that the missing letters are J and K.  We can then
#   expand the set of hints by including these guessed patterns; in
#   the above example, we would add a hint of ABCDEFGHIJKLMNOPQRSTUVWXYZ.
#   Note that any unmapped triplet that only appear once in the
#   watermarks cannot be mapped algorithmically; we can only guess
#   at what it is likely to be based on patterns that we recognize
#   using contexts other than the hints.
#   

def main():

    # Get possible mappings
    letter2triplet_mappings = get_mapping(hint_list, wm_list)

    # Decode watermarks using each possible mapping
    for letter2triplet in letter2triplet_mappings:
        triplet2letter = invert_mapping(letter2triplet)
        triplet2letter.update(known.items())
        for wm in wm_list:
            for r in apply_mapping(triplet2letter, wm):
                if r:
                    print(r)
            print()

def get_mapping(hint_list, wm_list):

    # Loop through hints to get the possible mappings,
    # combining possibilities from all hints.
    mappings = None
    for hint in hint_list:
        m = process_hint(hint, wm_list)
        if mappings is None:
            mappings = m
        else:
            mappings = reconcile(mappings, m)

    # Check whether results are reasonable
    if len(mappings) == 0:
        print("ERROR: cannot find any suitable mapping")
    elif len(mappings) > 1:
        print("WARNING: %d mappings found" % len(m))
    return mappings

def process_hint(hint, wm_list):

    # Use the pattern defined by "hint" and apply it to each
    # watermark in "wm_list" to see if we can generate
    # triplet-to-letter mappings.
    mappings = list()
    for wm in wm_list:
        mappings.extend(find_mappings(build_pattern(hint), hint, wm))

    # If the hint appears twice in a watermark, or it appears
    # in multiple watermarks, we will generate a mapping for
    # each time it appears.  If multiple mappings are identical,
    # we only need one copy.
    mappings = unique_mappings(mappings)
    return mappings

def unique_mappings(orig_mappings):

    # Remove any duplicate mappings in the list
    mappings = list()
    for om in orig_mappings:
        for m in mappings:
            if om == m:
                break
        else:
            mappings.append(om)
    return mappings

def build_pattern(hint):

    # A pattern is a list of indices of the same length as the hint.
    # The value of the pattern at index i is the index 
    # in "hint" where hint[i] last occurs.  For example, "AAB"
    # has a pattern of [ 1, 1, 2 ].  When we want to see if a
    # string matches our pattern, we check if string[i] ==
    # string[pattern[i]].  (There is a hole in this logic where
    # do not enforce that two different letters must _not_ have
    # the same mapping, but the mapping reconciliation code should
    # catch that and eliminate the bad mapping.)
    letters = dict( [ (letter, i)
                for i, letter in enumerate(hint) ] )
    pattern = [ letters[letter] for letter in hint ]
    return pattern

def find_mappings(pattern, hint, wm):

    # Dividing the watermark into triplets, find triplets that
    # exhibit the sequence defined by "pattern"
    mappings = list()
    unique_indices = set(pattern)
    triplet_length = len(pattern) * 3
    for i in range(0, len(wm) - triplet_length + 1):
        if pattern_match(pattern, wm, i):
            mappings.append(mapping(unique_indices, hint, wm, i))
    return mappings

def triplet(wm, start, i):

    # Return the triplet in "watermark" at position "i" when
    # the pattern begins at "start".
    wm_i = start + i * 3
    return wm[wm_i:wm_i+3]

def pattern_match(pattern, wm, start):

    # Get len(pattern) triplets from wm starting at offset "start"
    # and see if they match pattern.  (See comments in "build_pattern"
    # above for how pattern matching occurs.)
    for i, first_i in enumerate(pattern):
        if first_i == i:
            # Letter always equal to itself
            continue
        if triplet(wm, start, i) != triplet(wm, start, first_i):
            return False
    return True

def mapping(unique_indices, hint, wm, start):

    # Assuming that the pattern for "hint" matches watermark
    # "wm" at offset "start", create the letter-to-triplet
    # mapping.  Only indices in "unique_indices" need to be
    # processed because all other indices are redundant.
    return dict( [ (hint[i], triplet(wm, start, i))
                for i in unique_indices ] )

def reconcile(mappings1, mappings2):

    # Each group of mappings consist of letter-to-triplet mappings
    # Take all pairs of mapping, one from each group, and check
    # whether they are compatible.  If so, create and save a
    # combined mapping.
    mappings = list()
    for m1 in mappings1:
        for m2 in mappings2:
            m = combine(m1, m2)
            if m:
                mappings.append(m)
    return unique_mappings(mappings)

def combine(m1, m2):

    # Combine two mappings.  If any letter appears in both mappings
    # and map to different triplets, then we return None indicating
    # that the mappings are incompatible.  Otherwise, we create
    # a single mapping that includes the all the correspondences
    # from both mappings.
    m = dict()
    for k in list(m1.keys()) + list(m2.keys()):
        try:
            v1 = m1[k]
        except KeyError:
            m[k] = m2[k]
            continue
        try:
            v2 = m2[k]
        except KeyError:
            m[k] = m1[k]
            continue
        if v1 != v2:
            return None
        m[k] = v1
    return m

def invert_mapping(m):

    # Invert the mapping, making the values into keys and vice
    # versa.  This assumes that the mapping is one-to-one.
    return dict( [ (v, k) for k, v in m.items() ] )

def apply_mapping(m, wm):

    # Assume there is no frame shift and simply apply the
    # triplet-to-letter mapping "m" to watermark "wm"
    return [ substitute(m, wm) ]
    # UNUSED: We can handle frame shifts by returning multiple
    # "plaintext" for each watermark.
    # Return the three possible substitution results
    #return [ substitute(m, wm, offset) for offset in range(3) ]

def substitute(m, wm, offset=0):

    # Apply the correspondences in mapping "m" to watermark
    # "wm" starting at offset "offset".  
    letters = list()
    bad = 0
    for i in range(offset, len(wm) - 2, 3):
        triplet = wm[i:i+3]
        try:
            letters.append(m[triplet])
        except KeyError:
            letters.append('[%s]' % triplet)
            bad = bad + 1

    # If the number of untranslated triplets exceeds a certain
    # threshold, we assume the mapping is incorrect and do not
    # bother to return any plaintext
    if bad > len(wm) / 3 * 0.4:
        return None
    else:
        return ''.join(letters)

if __name__ == "__main__":
    # Run the program if we were invoked as a script
    main()