I have a list of dictionaries that contain bacterial name as keys, and as values a set of numbers identifying a DNA sequence. Unluckily, in some dictionaries there is a missing value, and the script fails to produce the csv. Can anyone give me an idea on how I can get around it? This is my script:
import glob, subprocess, sys, os, csv
from Bio import SeqIO, SearchIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def allele():
folders=sorted(glob.glob('path_to_files'))
dict_list=[]
for folder in folders:
fasta_file=glob.glob(folder +'/file.fa')[0]
input_handle=open(fasta_file ,'r')
records=list(SeqIO.parse(input_handle, 'fasta'))
namelist=[]
record_dict={}
sampleID = os.path.basename(folder)
record_dict['sampleid']=sampleID
for record in records:
name=record.description.split('\t')
gene=record.id.split('_')
geneID=gene[0] + '_' +gene[1]
allele=gene[2]
record_dict[geneID]=allele
dict_list.append(record_dict)
header = dict_list[0].keys()
with open('path_to_files/mycsv.csv', 'w') as csv_output:
writer=csv.DictWriter(csv_output,header,delimiter='\t')
writer.writeheader()
for samp in dict_list:
writer.writerow(samp)
print 'start'
allele()
Also can I get any suggestion on how to identify those dictionaries whose values sequence are the same? Thanks