#!/usr/bin/python

def process(filename, verbose=True):
	"Process ATOM records from PDB file"
	atoms = read_pdb(filename)
	countMap = count_by_rtype(atoms)
	numPhe = countMap.get("PHE", 0)
	numTyr = countMap.get("TYR", 0)
	if verbose:
		print "%s: %d PHE and %d TYR" % (filename, numPhe, numTyr)
	return numPhe, numTyr

def count_by_rtype(atoms):
	"Group atoms by residues and residues by type"
	seen = set()
	countMap = {}
	for a in atoms:
		rid = (a[3], a[2])	# residue id = (sequence number, chain id)
		if rid in seen:
			continue
		seen.add(rid)
		try:
			countMap[a[1]] += 1
		except KeyError:
			countMap[a[1]] = 1
	return countMap

def read_pdb(filename):
	"Read ATOM records from PDB file"
	f = file(filename)
	atoms = []
	for line in f:
		if not line.startswith("ATOM"):
			continue
		# ATOM records are fixed-format, so we cannot use whitespace for parsing
		if line[16] != ' ' and line[16] != 'A':	# Alternate location
			continue
		atom = (line[12:16],			# Atom name
				line[17:20].strip(),	# Residue type
				line[21],		# Chain id
				int(line[22:26]),	# Sequence number
				)
#				float(line[30:38]), float(line[38:46]), float(line[46:54]))
		atoms.append(atom)
	f.close()
	return atoms

if __name__ == "__main__":
	def count_all():
		import os
		files = os.listdir(".")
		for filename in files:
			if filename.endswith(".ent"):
				process(filename)
	import cProfile as profile
	from pstats import Stats
	profile.run("count_all()", "optimize_v4.stats")
	s = Stats("optimize_v4.stats")
	s.sort_stats("cumulative", "time", "calls")
	s.print_stats(5)
