"""Example class for parsing SSAHA output and composing genomic locations from individual matches.""" from location import Match, GenomicLocation class Ssaha: def __init__(self): # If we were actually running SSAHA, we would require # arguments such as the genomic sequences and possibly # locations where index files may be stored. But for # this example, we're not really running SSAHA, so we # just don't do anything here. pass def localize(self, seq): "Find sequence matches using SSAHA and return GenomicLocation" # We're faking the SSAHA run and using canned output. matches = self._readSsaha("%s.ssaha.pf" % seq.description) return matches, self._findLocation(matches) def _readSsaha(self, filename): "Read SSAHA output and convert to list of Match instances" matches = [] f = open(filename) for line in f: parts = line.strip().split() if len(parts) != 9: # silently ignore bad lines continue # q = query (mRNA) # t = target (genomic) qId = parts[1] qStart = int(parts[2]) qEnd = int(parts[3]) tId = parts[4] tStart = int(parts[5]) tEnd = int(parts[6]) m = Match((qId, qStart, qEnd), (tId, tStart, tEnd)) matches.append(m) f.close() return matches def _findLocation(self, matches): "Find best set of matches to cover a sequence." # The algorithm is to sort all matches by their # start position on the mRNA sequence. We then # compute the "score" of the matches in order. # The "score" of a match is the number of bases # covered by a match and all "compatible" matches # that precede it. To determine "compatibility" # for each match, we look at all matches that # start before it. If the preceding match ends # before the current match on both the query and # on the target, then the two matches are # compatible, and the score of the current match # is the sum of the score of the preceding match # and the number of bases in the current match. # The final score of the current match is the # maximum score that we can find. # The best coverage of the query sequence is the # maximum score of all matches. # First we sort the matches in ascending order # of query start position. sortList = [ (m.srcStart, m) for m in matches ] sortList.sort() matches = [ t[1] for t in sortList ] # Then we compute the score for each match, # keeping track of which preceding match led # to the best score scores = [] topScore = 0 topMatch = None for m in matches: bestScore = 0 bestPrev = None matchScore = m.srcEnd - m.srcStart + 1 for s in scores: prevMatch, prevScore, prevPrev = s if not self._compatible(m, prevMatch): continue score = prevScore + matchScore if score > bestScore: bestScore = score bestPrev = prevMatch scores.insert(0, (m, bestScore, bestPrev)) if bestScore > topScore: topScore = bestScore topMatch = m # Then we recover the list of matches the led # to the top match and make it into a GenomicLocation. # Note that the matches are recovered in reverse order, # so we insert at the beginning of our results list # to reverse the reversal. bestMatches = [ topMatch ] for m, score, prev in scores: if m is topMatch: if prev is None: break else: bestMatches.insert(0, prev) topMatch = prev return GenomicLocation(bestMatches) def _compatible(self, m, prevMatch): # See comment in _findLocation for explanation return (m.srcStart > prevMatch.srcEnd and m.dstStart > prevMatch.dstEnd) if __name__ == "__main__": def main(): import pprint from sequence import Sequence seq = Sequence(">AF037454", "ATGC") matches, loc = Ssaha().localize(seq) print len(matches), "overall matches" pprint.pprint(matches) print len(loc.matches), "location matches" pprint.pprint(loc.matches) main()