plasmid_list.py

import xlrd

#extract columns: plasmid_number, plasmid_description, and plasmid_selectionmarker
wb = xlrd.open_workbook("JSO_MasterList_121212.xls")
sh = wb.sheet_by_name(u'Plasmids')
plasmid_number = sh.col_values(0)
plasmid_number.pop(0) #float
plasmid_descrip= sh.col_values(1)
plasmid_descrip.pop(0) #unicode
plasmid_selection= sh.col_values(4)
plasmid_selection.pop(0) #unicode

#add JSO to plasmid_number and take the spaces out of all three columns
plasmidnum = []
plasmidsel = []
plasmiddesc = []
for value in plasmid_number:
	if value != '':
		value = int(value)
		JSOplasmid = "JSO"+str(value)
		plasmidnum.append(JSOplasmid)

		index = plasmid_number.index(value)
		JSOselection = plasmid_selection[index]
		JSOselection = str(JSOselection)
		plasmidsel.append(JSOselection)

		JSOdesc = plasmid_descrip[index]
		JSOdesc = str(JSOdesc)
		plasmiddesc.append(JSOdesc)

#make a dictionary dict2 of key = plasmid number and value = plasmid selection 
dict2 = dict(zip(plasmidnum, plasmidsel))

#make a dictionary dict3 of key = plasmid number and value = plasmid descriptions
dict3 = dict(zip(plasmidnum, plasmiddesc))

#make a dictionary dict1 of key = plasmid number and value = sequences
from Bio import SeqIO
handle = open("JSO_Plasmids.fasta", "rU")
id = []
seq = []
for record in SeqIO.parse(handle, "fasta") :
	temp1 = str(record.id)
	id.append(temp1)
	temp2 =str(record.seq)
	seq.append(temp2)
handle.close()

dict1 = dict(zip(id, seq))


#extract promoter information from plasmid descriptions
dict4 = {} #plasmid_promoter
dict5 = {} #plasmid_genes
for key in dict3:
	fullstring = dict3[key].lower()

	if "tdh3" in fullstring:
		#create stuff
		dict4[key] = "tdh3"
	if "adh1" in fullstring:
		dict4[key] = "adh1"
	if "gal1" in fullstring:
		dict4[key] = "gal1"
	if "tpk2" in fullstring:
		dict4[key] = "tpk2"
	if "cyc1" in fullstring:
		dict4[key] = "cyc1"
	if "nop7" in fullstring:
		dict4[key] = "nop7"

	if "msn2" in fullstring:
		dict5[key]= "msn2"
	if "crz1" in fullstring:
		dict5[key]= "crz1"
	if "tod6" in fullstring:
		dict5[key]= "tod6"
	if "dot6" in fullstring:
		dict5[key]= "dot6"
	if "sfp1" in fullstring:
		dict5[key]= "sfp1"
	if "stb3" in fullstring:
		dict5[key] = "stb3"


#note that some of the plasmids are dually tagged, so make that an option
#read in fasta file with fluorophore genes, and put into a dictionary called fluorescence
fluor = open("fluorescent_sequences.fasta", "rU")
fluorescence = {}
for record in SeqIO.parse(fluor, "fasta") :
	temp1 = str(record.id)
	temp2 = str(record.seq)
	fluorescence[temp1] = temp2
fluor.close

#loop through dict1 (with sequences) and search for each fluorophore in the dictionary fluorescence 	
dict6 = {} #plasmid_fluor
for key in dict1:
	for item in fluorescence:
		if fluorescence[item] in dict1[key]:
			dict6[key] = item

#merge the fasta and the xls information into a list. 
merged_list = []
for key in dict3:
	if key in dict1:
		seq = dict1[key]
	else:
		seq = "none"
	if key in dict4:
		prom = dict4[key]
	else:
		prom = "none"
	if key in dict5:
		gene = dict5[key]
	else:
		gene = "none"
	if key in dict6:
		fluores = dict6[key]
	else:
		fluores = "none"

	temp = [key, dict3[key], dict2[key], seq, prom, gene, fluores]
	merged_list.append(temp)

#this writes this information stored in a list to a csv file
import csv
resultfile = open("compiled_info_plasmid.txt", "wb")
wr = csv.writer(resultfile, dialect = "excel")

for item in merged_list:
	wr.writerow(item)