# # This file demonstrates "polymorphism", where two unrelated classes # provide the same functionality through the same interface and may # be used by a single calling routine. # # Conrad Huang # October 8, 2004 # # The two sequence localization programs "MegaBLAST" and "SSAHA" can # identify matching segments from a query sequence with segments # from some database sequences (frequently genomic assemblies). # One of the output formats of MegaBLAST is tab-separated-values # with the following fields: # query id, subject id, % identity, alignment length, mismatches # gap openings, query start, query end, subject start, subject end # e-value, bit score # The output format of SSAHA is also tab-separated-values, but # with these fields: # strand directions, query id, query start, query end, subject id, # subject start, subject end, alignment size, alignment score # Clearly, the two formats have many fields in common. If only the # common fields are needed for a certain analysis (e.g., checking # if the two programs hit the same genomic segments), then a single # analysis routine may be used if data from the two output files # can be represented in a compatible manner in the program. # In the code below, SsahaParser and MegablastParser return # instances of classes SsahaAlignment and MegablastAlignment # respectively. Different class instances are desirable # because they contain format-specific data, e.g., % identity # for MegaBLAST. However, SsahaAlignment and MegablastAlignment # share a common method: hitSize. Since the analysis function # averageHitSize only use this method, it can compute its # result from a list of instances of either class. # This module may be tested by executing it as a Python script. # Command line flag "-s" and "-m" select SSAHA and MegaBLAST # output formats respectively. Each command line flag must be # followed by the name of a result file. class SsahaParser: def __init__(self, filename): alignments = [] f = open(filename) try: for line in f: alignments.append(SsahaAlignment(line)) finally: self.alignments = alignments f.close() class SsahaAlignment: def __init__(self, line): parts = line.strip().split() if len(parts) != 9: raise ValueError, "bad SSAHA output: %s" % line self.strands = parts[0] self.queryId = parts[1] self.queryStart = int(parts[2]) self.queryEnd = int(parts[3]) self.subjectId = parts[4] self.subjectStart = int(parts[5]) self.subjectEnd = int(parts[6]) self.alignmentSize = int(parts[7]) self.alignmentScore = float(parts[7]) def hitSize(self): return self.alignmentSize class MegablastParser: def __init__(self, filename): alignments = [] f = open(filename) try: for line in f: if line[0] == '#': # Skip comment lines continue alignments.append(MegablastAlignment(line)) finally: self.alignments = alignments f.close() class MegablastAlignment: def __init__(self, line): parts = line.strip().split() if len(parts) != 12: raise ValueError, "bad MegaBLAST output: %s" % line self.queryId = parts[0] self.subjectId = parts[1] self.percentIdentity = float(parts[2]) self.alignmentLength = int(parts[3]) self.mismatches = int(parts[4]) self.gapOpenings = int(parts[5]) self.queryStart = int(parts[6]) self.queryEnd = int(parts[7]) self.subjectStart = int(parts[8]) self.subjectEnd = int(parts[9]) self.alignmentSize = float(parts[10]) self.alignmentScore = float(parts[11]) def hitSize(self): return self.alignmentLength def averageHitSize(alignments): sum = 0.0 for a in alignments: # Note that "a" can be instances of either # SsahaAlignment or MegablastAlignment. A # list may contain instances from both classes # if both types of output files were specified # on the command line. The important aspect # of polymorophism is that none of the # aforementioned # complexity appears in the # code (just this comment :-). sum += a.hitSize() return sum / len(alignments) if __name__ == "__main__": def main(): import sys, getopt opts, args = getopt.getopt(sys.argv[1:], "s:m:") alignments = [] for opt, val in opts: if opt == "-s": parser = SsahaParser(val) elif opt == "-m": parser = MegablastParser(val) else: print >> sys.stderr, ("%s: unknown flag %s" % (sys.argv[0], opt)) raise SystemExit, 1 # Note that following is also polymorphism at work alignments += parser.alignments if not alignments: print >> sys.stderr, ("%s: no alignments found" % sys.argv[0]) raise SystemExit, 1 print "Number of alignments: %s" % len(alignments) print "Average hit size: %.3f" % averageHitSize(alignments) main()