I'm now trying to create a tool which can translate DNA sequences and then compare them to each other for deleting the repetitions!
I used this script to read my fastq
file:
def sequence_cleaner(fastq_file, min_length=0, por_n=100):
# Create our hash table to add the sequences
sequences={}
# Using the Biopython fastq parse we can read our fastq input
for seq_record in SeqIO.parse(fastq_file, "fastq"):
# Take the current sequence
sequence = str(seq_record.seq).upper()
# Check if the current sequence is according to the user parameters
if (len(sequence) >= min_length and
(float(sequence.count("N"))/float(len(sequence)))*100 <= por_n):
# If the sequence passed in the test "is it clean?" and it isn't in the
# hash table, the sequence and its id are going to be in the hash
if sequence not in sequences:
sequences[sequence] = seq_record.id
# If it is already in the hash table, we're just gonna concatenate the ID
# of the current sequence to another one that is already in the hash table
else:
sequences[sequence] += "_" + seq_record.id
print sequence
trans=translate( sequence )
# Write the clean sequences
# Create a file in the same directory where you ran this script
output_file = open("clear_" + fastq_file, "w+")
# Just read the hash table and write on the file as a fasta format
for sequence in sequences:
output_file.write("@" + sequences[sequence] +"\n" + sequence + "\n" + trans +"\n")
output_file.close()
print("\n YOUR SEQUENCES ARE CLEAN!!!\nPlease check clear_" + fastq_file + " on the same repository than " + rep + "\n")
and i used this one to translate it to amino acide sequences:
def translate( sequ ):
"""Return the translated protein from 'sequence' assuming +1 reading frame"""
gencode = {
'ATA':'Ile', 'ATC':'Ile', 'ATT':'Ile', 'ATG':'Met',
'ACA':'Thr', 'ACC':'Thr', 'ACG':'Thr', 'ACT':'Thr',
'AAC':'Asn', 'AAT':'Asn', 'AAA':'Lys', 'AAG':'Lys',
'AGC':'Ser', 'AGT':'Ser', 'AGA':'Arg', 'AGG':'Arg',
'CTA':'Leu', 'CTC':'Leu', 'CTG':'Leu', 'CTT':'Leu',
'CCA':'Pro', 'CCC':'Pro', 'CCG':'Pro', 'CCT':'Pro',
'CAC':'His', 'CAT':'His', 'CAA':'Gln', 'CAG':'Gln',
'CGA':'Arg', 'CGC':'Arg', 'CGG':'Arg', 'CGT':'Arg',
'GTA':'Val', 'GTC':'Val', 'GTG':'Val', 'GTT':'Val',
'GCA':'Ala', 'GCC':'Ala', 'GCG':'Ala', 'GCT':'Ala',
'GAC':'Asp', 'GAT':'Asp', 'GAA':'Glu', 'GAG':'Glu',
'GGA':'Gly', 'GGC':'Gly', 'GGG':'Gly', 'GGT':'Gly',
'TCA':'Ser', 'TCC':'Ser', 'TCG':'Ser', 'TCT':'Ser',
'TTC':'Phe', 'TTT':'Phe', 'TTA':'Leu', 'TTG':'Leu',
'TAC':'Tyr', 'TAT':'Tyr', 'TAA':'STOP', 'TAG':'STOP',
'TGC':'Cys', 'TGT':'Cys', 'TGA':'STOP', 'TGG':'Trp'}
return ''.join(gencode.get(sequ[3*i:3*i+3],'X') for i in range(len(sequ)//3))
The result is not what i expected:
@SRR797221.3
TCAGCCGCGCAGTAGTTAGCACAAGTAGTACGATACAAGAACACTATTTGTAAGTCTAAGGCATTGGCCGCTCGTCTGAGACTGCCAAGGCACACAGGGAGTAGNGNN
SerAlaAlaGlnValValProLeuSerSerValProAlaThrProThrProSerAsnAsnAlaAlaArgLeuArgLeuProArgHisThrGlyValGlu
@SRR797221.4
TCAGCCGCGCAGGTAGTTCCGTTATCATCAGTACCAGCAACTCCAACTCCATCCAACAATGCCGCTCGTCTGAGACTGCCAAGGCACACAGGAGTAGAG
SerAlaAlaGlnValValProLeuSerSerValProAlaThrProThrProSerAsnAsnAlaAlaArgLeuArgLeuProArgHisThrGlyValGlu
@SRR797221.2
TCAGCCGCGCAGGTTCTTGGTAACGGAACGCGCGTTAGACTTAAGACCAGTGAATGGAGCCACCATTGGCCGCTCGTCTGAGACTGCCCAAAGGGCACACAGGGGNGTAGNGN
SerAlaAlaGlnValValProLeuSerSerValProAlaThrProThrProSerAsnAsnAlaAlaArgLeuArgLeuProArgHisThrGlyValGlu
@SRR797221.1
TCAGCCGCGCAGGTAGATTAAGGATCAACGGTTCCTTGGCTCGCAAGTCAATTGGCCGCTCGTCTGAGACTGCCAAGGCACACAGGGAGTAGNG
SerAlaAlaGlnValValProLeuSerSerValProAlaThrProThrProSerAsnAsnAlaAlaArgLeuArgLeuProArgHisThrGlyValGlu
Firstly you can see that the sequences id
are not sorted from 1 to 4 like on the original file, and also it repeats the same 4th id translation for the three other sequences!