"""Example class for parsing SSAHA output and composing genomic locations
from individual matches."""

from location import Match, GenomicLocation

class Ssaha:

	def __init__(self):
		# If we were actually running SSAHA, we would require
		# arguments such as the genomic sequences and possibly
		# locations where index files may be stored.  But for
		# this example, we're not really running SSAHA, so we
		# just don't do anything here.
		pass

	def localize(self, seq):
		"Find sequence matches using SSAHA and return GenomicLocation"

		# We're faking the SSAHA run and using canned output.
		matches = self._readSsaha("%s.ssaha.pf" % seq.description)
		return matches, self._findLocation(matches)

	def _readSsaha(self, filename):
		"Read SSAHA output and convert to list of Match instances"

		matches = []
		f = open(filename)
		for line in f:
			parts = line.strip().split()
			if len(parts) != 9:
				# silently ignore bad lines
				continue
			# q = query (mRNA)
			# t = target (genomic)
			qId = parts[1]
			qStart = int(parts[2])
			qEnd = int(parts[3])
			tId = parts[4]
			tStart = int(parts[5])
			tEnd = int(parts[6])
			m = Match((qId, qStart, qEnd), (tId, tStart, tEnd))
			matches.append(m)
		f.close()
		return matches

	def _findLocation(self, matches):
		"Find best set of matches to cover a sequence."

		# The algorithm is to sort all matches by their
		# start position on the mRNA sequence.  We then
		# compute the "score" of the matches in order.
		# The "score" of a match is the number of bases
		# covered by a match and all "compatible" matches
		# that precede it.  To determine "compatibility"
		# for each match, we look at all matches that
		# start before it.  If the preceding match ends
		# before the current match on both the query and
		# on the target, then the two matches are
		# compatible, and the score of the current match
		# is the sum of the score of the preceding match
		# and the number of bases in the current match.
		# The final score of the current match is the
		# maximum score that we can find.
		# The best coverage of the query sequence is the
		# maximum score of all matches.

		# First we sort the matches in ascending order
		# of query start position.
		sortList = [ (m.srcStart, m) for m in matches ]
		sortList.sort()
		matches = [ t[1] for t in sortList ]

		# Then we compute the score for each match,
		# keeping track of which preceding match led
		# to the best score
		scores = []
		topScore = 0
		topMatch = None
		for m in matches:
			bestScore = 0
			bestPrev = None
			matchScore = m.srcEnd - m.srcStart + 1
			for s in scores:
				prevMatch, prevScore, prevPrev = s
				if not self._compatible(m, prevMatch):
					continue
				score = prevScore + matchScore
				if score > bestScore:
					bestScore = score
					bestPrev = prevMatch
			scores.insert(0, (m, bestScore, bestPrev))
			if bestScore > topScore:
				topScore = bestScore
				topMatch = m
		# Then we recover the list of matches the led
		# to the top match and make it into a GenomicLocation.
		# Note that the matches are recovered in reverse order,
		# so we insert at the beginning of our results list
		# to reverse the reversal.
		bestMatches = [ topMatch ]
		for m, score, prev in scores:
			if m is topMatch:
				if prev is None:
					break
				else:
					bestMatches.insert(0, prev)
					topMatch = prev
		return GenomicLocation(bestMatches)

	def _compatible(self, m, prevMatch):
		# See comment in _findLocation for explanation
		return (m.srcStart > prevMatch.srcEnd
				and m.dstStart > prevMatch.dstEnd)

if __name__ == "__main__":
	def main():
		import pprint
		from sequence import Sequence
		seq = Sequence(">AF037454", "ATGC")
		matches, loc = Ssaha().localize(seq)
		print len(matches), "overall matches"
		pprint.pprint(matches)
		print len(loc.matches), "location matches"
		pprint.pprint(loc.matches)
	main()