#
# This file demonstrates "inheritance", where a "base class" provides
# basic functionality and a "derived class" modifies the base class
# behavior by "overriding" a method.
#
# Conrad Huang
# October 8, 2004
#

# A FASTA file contains a series of sequences, each of which
# consists of a one-line description followed by any number of
# lines of nucleotide data.  The description line begins with
# the '>' character.  The data lines consist of single-character
# nucleotide or amino acid symbols, but many programs (including
# this one) treat any lines that do not begin with '>' as data lines.

# An NCBI FASTA file is just like a normal FASTA file, but the
# description lines are in a specific format:
#     db1|dbid|db2|dbid|symbol name
# where db1 and db2 are names of databases such as "gi"; dbids
# are identifiers from the preceding database; symbol is a unique
# short name, usually without whitespace; and name is a full
# name that can include whitespace.

# In the code below, instances of the FastaSequence class represent
# a single sequence in a FASTA file.  The instances have two
# attributes: "description" and "sequence".  Instances of the
# NCBIFastaSequence class are specializations of FastaSequence.
# These instances have the two standard attributes but also three
# NCBI-specific attributes: "symbol", "name" and "ident".  The
# latter is a dictionary whose keys are database names and whose
# values are database identifiers.

# FastaClassic is an implementation of a FASTA file reader using
# classic constructs available in nearly all versions of Python.
# NCBIFastaMixin is a "mixin" class intended to be combined with
# base FASTA readers to provide an NCBI FASTA reader.
# NCBIFastaClassic inherits from both NCBIFastaMixin and
# FastaClassic to create such an NCBI FASTA reader.  Notice
# that NCBIFastaMixin is listed first in the list of base
# classes because its methods must have precedence over that
# of the base FASTA reader.
# FastaGenerator and FastaIterator are implementations that use
# the newer features of Python that first appeared in 2.2.
# NCBIFastaGenerator and NCBIFastaIterator are created by
# inheriting from NCBIFastaMixin and the corresponding base
# implementation.

# This module may be tested by executing it as a Python script.
# Command line flags "-c", "-g" and "-i" correspond to using
# FastaClassic, FastaGenerator and FastaIterator respectively.
# The uppercase version of the flags use the NCBI versions.
# Subsequent command line arguments are treated as the names
# of FASTA files.  If no FASTA file is specified, the default
# test case of "ncbi.fa" is used.

"""Read sequences in FASTA format."""

# The code below is written to run unchanged on Python 2.6+ and Python 3.
try:
    _STRING_TYPES = (basestring,)       # Python 2: str and unicode
except NameError:
    _STRING_TYPES = (str, bytes)        # Python 3: text or bytes paths


def _openSource(f):
    """Turn the constructor argument of a reader into a source of lines.

    f is either a filename (which is opened here) or an already open
    file / file-like object / any other iterable yielding lines.

    Returns a tuple (source, autoClose).  autoClose is True only when
    the file was opened here, so that a caller-supplied object is never
    closed behind the caller's back.

    Raises ValueError if f is neither a string nor iterable.
    """
    if isinstance(f, _STRING_TYPES):
        # Argument is a string, treat as filename
        return open(f), True
    try:
        iter(f)
    except TypeError:
        raise ValueError("expecting string or file object")
    # Argument is a file (or file-like) object owned by the caller
    return f, False


class FastaSequence(object):
    """A FASTA sequence and its description.

    Attributes:
        description -- the description line without the leading '>'
                       and without surrounding whitespace
        sequence    -- the sequence data as a single string
    """

    def __init__(self, desc, seq):
        # desc is the raw description line, including the '>' marker
        self.description = desc[1:].strip()
        self.sequence = seq


class NCBIFastaSequence(FastaSequence):
    """A FASTA sequence and its NCBI-format description parsed out.

    The description is expected to look like
        db1|dbid|db2|dbid|symbol name
    In addition to the FastaSequence attributes, instances have:
        symbol -- first word of the last '|'-separated field
        name   -- remainder of the last field ('' if there is none)
        ident  -- dictionary mapping database names to identifiers
    """

    def __init__(self, *args):
        FastaSequence.__init__(self, *args)
        parts = self.description.split('|')
        # The last field holds "symbol name".  partition() (rather than
        # unpacking a split()) tolerates a symbol without a name.
        symbol, _sep, name = parts.pop().partition(' ')
        self.symbol = symbol
        self.name = name
        # The remaining fields alternate database name, identifier.
        # A trailing database name without an identifier maps to ''.
        databases = parts[0::2]
        identifiers = parts[1::2]
        identifiers += [''] * (len(databases) - len(identifiers))
        self.ident = dict(zip(databases, identifiers))


class _FastaReaderBase(object):
    """Plumbing shared by all readers: opening, closing, record creation."""

    # Class-level defaults keep close() (and therefore __del__) safe
    # even when __init__ raised before the instance was set up.
    f = None
    autoClose = False

    def __init__(self, f):
        self.f, self.autoClose = _openSource(f)

    def __del__(self):
        self.close()

    def makeSequence(self, desc, seq):
        """Build the object returned for one record.

        Derived classes (or mixins) override this method to return
        a different kind of sequence object.
        """
        return FastaSequence(desc, seq)

    def close(self):
        """Release the input; only files opened by the reader are closed."""
        if self.f is not None and self.autoClose:
            self.f.close()
        self.f = None


class _LookaheadReader(_FastaReaderBase):
    """Record parsing shared by FastaClassic and FastaIterator.

    The reader always holds the description line of the record that
    will be returned next in self.description; self.finished becomes
    True once the input is exhausted.
    """

    finished = True
    description = None
    _lines = None

    def __init__(self, f):
        _FastaReaderBase.__init__(self, f)
        # Keep one iterator for the lifetime of the reader so that
        # successive reads continue where the previous one stopped,
        # whatever kind of iterable was supplied.
        self._lines = iter(self.f)
        # Skip anything that precedes the first description line, as
        # FastaGenerator does.  An input with no description at all
        # (including an empty one) leaves the reader finished.
        for line in self._lines:
            if line.startswith('>'):
                self.description = line
                self.finished = False
                break
        else:
            self.close()

    def _readSequence(self):
        """Return the next sequence object, or None at end of input."""
        if self.finished:
            return None
        # We have the sequence description from a previous
        # call or from __init__.  Now we read sequence data
        # until we hit the next description or the end of file.
        desc = self.description
        seq = []
        for line in self._lines:
            if line.startswith('>'):
                self.description = line
                break
            seq.append(line.strip())
        else:
            # Ran off the end of the input: this is the last record
            self.close()
        return self.makeSequence(desc, ''.join(seq))

    def close(self):
        """Release the input and mark the reader as finished."""
        _FastaReaderBase.close(self)
        self._lines = None
        self.finished = True


class FastaClassic(_LookaheadReader):
    """Read generic FASTA files.  (Classic version)

    Call nextSequence() repeatedly; it returns None when there are
    no more sequences.
    """

    def nextSequence(self):
        """Return the next sequence object, or None at end of input."""
        return self._readSequence()


class NCBIFastaMixin(object):
    """NCBI mixin for parsing FASTA descriptions"""

    def makeSequence(self, desc, seq):
        return NCBIFastaSequence(desc, seq)


class NCBIFastaClassic(NCBIFastaMixin, FastaClassic):
    """Read FASTA files with NCBI-style descriptions."""
    pass


class FastaGenerator(_FastaReaderBase):
    """Read generic FASTA files.  (Generator version)

    Iterate over sequences() to obtain the sequence objects.
    """

    def sequences(self):
        """Generate one sequence object per record in the input.

        Data lines that precede the first description line are
        ignored.  The input is released once it has been read to
        the end; a reader that is already closed yields nothing.
        """
        if self.f is None:
            return
        desc = None
        seq = []
        for line in self.f:
            if line.startswith('>'):
                # sequence description
                if desc is not None:
                    yield self.makeSequence(desc, "".join(seq))
                desc = line
                seq = []
            else:
                # sequence data
                seq.append(line.strip())
        if desc is not None:
            yield self.makeSequence(desc, "".join(seq))
        self.close()


class NCBIFastaGenerator(NCBIFastaMixin, FastaGenerator):
    """Read FASTA files with NCBI-style descriptions."""
    pass


class FastaIterator(_LookaheadReader):
    """Read generic FASTA files.  (Iterator version)

    Instances are their own iterators, so they can be used
    directly in a "for" loop.
    """

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next sequence object; raise StopIteration at the end."""
        fs = self._readSequence()
        if fs is None:
            raise StopIteration
        return fs

    # Python 2 spells the iterator method "next"
    next = __next__


class NCBIFastaIterator(NCBIFastaMixin, FastaIterator):
    """Read FASTA files with NCBI-style descriptions."""
    pass


if __name__ == "__main__":

    def readFasta(filename, optList):
        """Print a summary of filename using each reader named in optList.

        optList is a list of flags ("-c", "-g", "-i", "-C", "-G", "-I").
        For every sequence the description (or, for the NCBI readers,
        the name) and the sequence length are printed.
        """
        if "-c" in optList:
            print("-- FastaClassic")
            fa = FastaClassic(filename)
            while True:
                fs = fa.nextSequence()
                if fs is None:
                    break
                print("%s %d" % (fs.description, len(fs.sequence)))
            print("")
        if "-g" in optList:
            print("-- FastaGenerator")
            fa = FastaGenerator(filename)
            for fs in fa.sequences():
                print("%s %d" % (fs.description, len(fs.sequence)))
            print("")
        if "-i" in optList:
            print("-- FastaIterator")
            for fs in FastaIterator(filename):
                print("%s %d" % (fs.description, len(fs.sequence)))
            print("")
        if "-C" in optList:
            print("-- NCBIFastaClassic")
            fa = NCBIFastaClassic(filename)
            while True:
                fs = fa.nextSequence()
                if fs is None:
                    break
                print("%s %d" % (fs.name, len(fs.sequence)))
            print("")
        if "-G" in optList:
            print("-- NCBIFastaGenerator")
            fa = NCBIFastaGenerator(filename)
            for fs in fa.sequences():
                print("%s %d" % (fs.name, len(fs.sequence)))
            print("")
        if "-I" in optList:
            print("-- NCBIFastaIterator")
            for fs in NCBIFastaIterator(filename):
                print("%s %d" % (fs.name, len(fs.sequence)))
            print("")

    def main():
        """Parse the command line and summarize each FASTA file named."""
        import getopt
        import sys
        try:
            opts, args = getopt.getopt(sys.argv[1:], "cigCIG")
        except getopt.GetoptError as e:
            # Unknown flag: explain instead of dumping a traceback
            sys.stderr.write("%s\nusage: %s [-cgiCGI] [fasta-file ...]\n"
                             % (e, sys.argv[0]))
            sys.exit(2)
        # Without flags, default to the NCBI iterator reader
        optList = [opt for opt, val in opts] or ["-I"]
        # Without file names, fall back to the default test case
        for filename in (args or ["ncbi.fa"]):
            readFasta(filename, optList)

    main()