I tried this without Biopython, looking only at the forward strand (no reverse complement) and working on the translated sequence.
I found two ways. I am sure neither is optimal, and I am hoping somebody posts something better; the answers to "How to find a open reading frame in Python" are, I suppose, the fastest ways.
The first one reports ORFs even when there is no stop codon (the sequence ends before one is reached, so there is no trailing '_' marking a stop codon):
# Codon table: DNA triplet -> one-letter amino-acid code.
# '_' marks the three stop codons (TAA, TAG, TGA).
mappy = {
    "ATA": "I", "ATC": "I", "ATT": "I", "ATG": "M",
    "ACA": "T", "ACC": "T", "ACG": "T", "ACT": "T",
    "AAC": "N", "AAT": "N", "AAA": "K", "AAG": "K",
    "AGC": "S", "AGT": "S", "AGA": "R", "AGG": "R",
    "CTA": "L", "CTC": "L", "CTG": "L", "CTT": "L",
    "CCA": "P", "CCC": "P", "CCG": "P", "CCT": "P",
    "CAC": "H", "CAT": "H", "CAA": "Q", "CAG": "Q",
    "CGA": "R", "CGC": "R", "CGG": "R", "CGT": "R",
    "GTA": "V", "GTC": "V", "GTG": "V", "GTT": "V",
    "GCA": "A", "GCC": "A", "GCG": "A", "GCT": "A",
    "GAC": "D", "GAT": "D", "GAA": "E", "GAG": "E",
    "GGA": "G", "GGC": "G", "GGG": "G", "GGT": "G",
    "TCA": "S", "TCC": "S", "TCG": "S", "TCT": "S",
    "TTC": "F", "TTT": "F", "TTA": "L", "TTG": "L",
    "TAC": "Y", "TAT": "Y", "TAA": "_", "TAG": "_",
    "TGC": "C", "TGT": "C", "TGA": "_", "TGG": "W",
}

# Example sequence that the script scans for open reading frames.
DNA = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

# One (initially empty) codon list per forward reading frame.
DNAlist1, DNAlist2, DNAlist3 = [], [], []
def revProt(dna_list):
    """Translate the codons in *dna_list* and return the protein reversed.

    Chunks that are not exactly 3 characters long (the incomplete tail of a
    reading frame) are skipped. Every 3-character chunk is looked up in the
    module-level ``mappy`` table, so a chunk missing from it raises
    ``KeyError``. The residues are joined in reverse order: the last
    translated codon comes first in the returned string.
    """
    residues = [mappy[codon] for codon in dna_list if len(codon) == 3]
    residues.reverse()
    return ''.join(residues)
# Cut DNA into consecutive 3-character chunks, once per forward reading frame
# (start offsets 0, 1 and 2). The final chunk of a frame may be shorter than 3.
for offset, frame_codons in enumerate((DNAlist1, DNAlist2, DNAlist3)):
    frame_codons.extend(
        DNA[pos:pos + 3] for pos in range(offset, len(DNA), 3)
    )
# for i in [DNAlist1] : #, DNAlist2, DNAlist3]:
# Scan each forward reading frame for ORFs. revProt() hands back the frame's
# translation reversed, so the scan meets a stop codon ('_') before the
# residues that precede it and records a candidate at every 'M' it reaches.
for i in [DNAlist1, DNAlist2, DNAlist3]:
    protein = revProt(i)
    print(''.join(protein), type(''.join(protein)))
    seqs = []  # candidates found in this frame, re-reversed to forward order
    orf = []   # residues collected so far, still in reversed order
    for residue in protein:
        if residue == '_':
            # `orf` is empty when the reversed protein starts with a stop, or
            # right after the branch below has reset it. Indexing orf[0]
            # without this guard raised IndexError in those cases.
            if orf and orf[0] == 'M':
                orf.append('_')
                seqs.append(''.join(reversed(orf)))
                orf = []
            else:
                # Drop what was collected and start a new candidate that
                # ends with this stop codon.
                orf = ['_']
        elif residue == 'M':
            # Every 'M' yields a candidate. `orf` is kept, so an 'M' further
            # upstream extends it and nested ORFs are all reported.
            orf.append(residue)
            seqs.append(''.join(reversed(orf)))
        else:
            orf.append(residue)
    print(seqs, '\n')
output:
QSAVRIM_A_ELLSELGLRPTMGMYGSNAVHS <class 'str'>
['MIRVASQ', 'MTPRLGLESLLE_', 'MGMTPRLGLESLLE_'] -----> the first sequence runs to the end of the DNA, so it has no stop codon ('_')
LH_ES_EPKNWFLS_DLDRP_GWTVQTL_MA <class 'str'>
['M_']
SISSPDNLSIGFSVRIWTAPDDGHLRL_SCP <class 'str'>
[]
The second way is even more cumbersome:
import itertools
# Codon table: DNA triplet -> one-letter amino-acid code.
# '_' marks the three stop codons (TAA, TAG, TGA).
mappy = {
    "ATA": "I", "ATC": "I", "ATT": "I", "ATG": "M",
    "ACA": "T", "ACC": "T", "ACG": "T", "ACT": "T",
    "AAC": "N", "AAT": "N", "AAA": "K", "AAG": "K",
    "AGC": "S", "AGT": "S", "AGA": "R", "AGG": "R",
    "CTA": "L", "CTC": "L", "CTG": "L", "CTT": "L",
    "CCA": "P", "CCC": "P", "CCG": "P", "CCT": "P",
    "CAC": "H", "CAT": "H", "CAA": "Q", "CAG": "Q",
    "CGA": "R", "CGC": "R", "CGG": "R", "CGT": "R",
    "GTA": "V", "GTC": "V", "GTG": "V", "GTT": "V",
    "GCA": "A", "GCC": "A", "GCG": "A", "GCT": "A",
    "GAC": "D", "GAT": "D", "GAA": "E", "GAG": "E",
    "GGA": "G", "GGC": "G", "GGG": "G", "GGT": "G",
    "TCA": "S", "TCC": "S", "TCG": "S", "TCT": "S",
    "TTC": "F", "TTT": "F", "TTA": "L", "TTG": "L",
    "TAC": "Y", "TAT": "Y", "TAA": "_", "TAG": "_",
    "TGC": "C", "TGT": "C", "TGA": "_", "TGG": "W",
}

# Example sequence that the script scans for open reading frames.
DNA = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

# One (initially empty) codon list per forward reading frame.
DNAlist1, DNAlist2, DNAlist3 = [], [], []
def Prot(dna_list):
    """Translate the codons in *dna_list* into a list of one-letter residues.

    Chunks whose length is not exactly 3 are skipped. Each remaining chunk is
    looked up in the module-level ``mappy`` table, so a chunk missing from it
    raises ``KeyError``. The result keeps the input order and is a list, not
    a joined string.
    """
    return [mappy[codon] for codon in dna_list if len(codon) == 3]
def Met(protein):
    """Return the indices at which *protein* holds the residue ``"M"``.

    *protein* may be any iterable of one-letter residues (a list or a string);
    the indices are returned in ascending order as a list.
    """
    return [index for index, residue in enumerate(protein) if residue == "M"]
def Stop(protein):
    """Return the indices at which *protein* holds the stop marker ``"_"``.

    *protein* may be any iterable of one-letter residues (a list or a string);
    the indices are returned in ascending order as a list.
    """
    return [index for index, residue in enumerate(protein) if residue == "_"]
# Fill the three frame lists: frame k (k = 0, 1, 2) gets the consecutive
# 3-character chunks of DNA starting at offset k. A trailing chunk may be
# shorter than 3 characters.
for start, codon_list in zip(range(3), (DNAlist1, DNAlist2, DNAlist3)):
    for pos in range(start, len(DNA), 3):
        codon_list.append(DNA[pos:pos + 3])
# For each forward reading frame: translate it, pair every 'M' position with
# every later stop position, and keep only the slices that contain no stop.
for i in [DNAlist1, DNAlist2, DNAlist3]:
    protein = Prot(i)
    protein_text = ''.join(protein)
    print(protein_text, type(protein_text))

    met = Met(protein)
    stop = Stop(protein)

    # All (start, stop) index pairs where the start precedes the stop.
    orf = [pair for pair in itertools.product(met, stop) if pair[0] < pair[1]]
    print(orf)

    # The slice excludes the stop at its end, so any '_' still inside a
    # candidate means an earlier stop lies between that start and this stop.
    orf_p = [''.join(protein[begin:end]) for begin, end in orf]
    orf_pp = [candidate for candidate in orf_p if '_' not in candidate]

    print('orf_pp : ',orf_pp)
    print('______________')
output:
SHVANSGYMGMTPRLGLESLLE_A_MIRVASQ <class 'str'>
[(8, 22), (8, 24), (10, 22), (10, 24)]
orf_pp : ['MGMTPRLGLESLLE', 'MTPRLGLESLLE'] -----> these are the ORF sequences
______________
AM_LTQVTWG_PRDLD_SLFWNKPE_SE_HL <class 'str'>
[(1, 2), (1, 10), (1, 16), (1, 25), (1, 28)]
orf_pp : ['M']
______________
PCS_LRLHGDDPATWIRVSFGISLNDPSSIS <class 'str'>
[]
orf_pp : []
______________
A shorter (and probably faster) version, copied from "How to find a open reading frame in Python":
import re
# Codon table: DNA triplet -> one-letter amino-acid code.
# '_' marks the three stop codons (TAA, TAG, TGA).
mappy = {
    "ATA": "I", "ATC": "I", "ATT": "I", "ATG": "M",
    "ACA": "T", "ACC": "T", "ACG": "T", "ACT": "T",
    "AAC": "N", "AAT": "N", "AAA": "K", "AAG": "K",
    "AGC": "S", "AGT": "S", "AGA": "R", "AGG": "R",
    "CTA": "L", "CTC": "L", "CTG": "L", "CTT": "L",
    "CCA": "P", "CCC": "P", "CCG": "P", "CCT": "P",
    "CAC": "H", "CAT": "H", "CAA": "Q", "CAG": "Q",
    "CGA": "R", "CGC": "R", "CGG": "R", "CGT": "R",
    "GTA": "V", "GTC": "V", "GTG": "V", "GTT": "V",
    "GCA": "A", "GCC": "A", "GCG": "A", "GCT": "A",
    "GAC": "D", "GAT": "D", "GAA": "E", "GAG": "E",
    "GGA": "G", "GGC": "G", "GGG": "G", "GGT": "G",
    "TCA": "S", "TCC": "S", "TCG": "S", "TCT": "S",
    "TTC": "F", "TTT": "F", "TTA": "L", "TTG": "L",
    "TAC": "Y", "TAT": "Y", "TAA": "_", "TAG": "_",
    "TGC": "C", "TGT": "C", "TGA": "_", "TGG": "W",
}

# Example sequence that the script scans for open reading frames.
DNA = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"
def Prot(dna_list):
    """Return the one-letter residues for the complete codons in *dna_list*.

    Only chunks of exactly 3 characters are translated; shorter or longer
    chunks are ignored. Translation is a lookup in the module-level ``mappy``
    table (``KeyError`` if the chunk is not a key). The residues are returned
    as a list in input order.
    """
    complete_codons = (chunk for chunk in dna_list if len(chunk) == 3)
    return [mappy[chunk] for chunk in complete_codons]
# ORF finder. The whole expression sits inside a lookahead, so a match consumes
# no characters and overlapping candidates can be found. Group 1 captures
# "ATG" followed by the fewest whole 3-character triplets that put it directly
# in front of TAG, TGA or TAA; that stop codon is not part of the capture.
pattern = re.compile(r'(?=(ATG(?:...)*?)(?=TAG|TGA|TAA))')
def revcomp(dna_seq):
    """Return the reverse complement of *dna_seq*.

    The sequence is reversed and each base is swapped with its partner
    (A<->T, G<->C). Lower-case bases are complemented as well, keeping their
    case; previously they were reversed but left uncomplemented. Any other
    character (for example ``N``) is passed through unchanged.
    """
    complement = str.maketrans("ATGCatgc", "TACGtacg")
    return dna_seq[::-1].translate(complement)
def orfs(dna):
    """Return the distinct ORF candidates on *dna* and its reverse complement.

    The module-level ``pattern`` is applied to *dna* and to ``revcomp(dna)``.
    The strings returned by both ``findall`` calls are merged into one set,
    so duplicates collapse and the result carries no ordering.
    """
    forward_hits = pattern.findall(dna)
    reverse_hits = pattern.findall(revcomp(dna))
    return {*forward_hits, *reverse_hits}
# Translate every ORF candidate and print one protein per line, followed by a
# separator. orfs() returns a set, so the order of the lines is not fixed.
for j in orfs(DNA):
    DNAlistz = [j[start:start + 3] for start in range(0, len(j), 3)]
    print(''.join(Prot(DNAlistz)))
print('+++++++++++++')
Output, this time including the reverse-strand translation as well:
MGMTPRLGLESLLE
MTPRLGLESLLE
M
MLLGSFRLIPKETLIQVAGSSPCNLS
+++++++++++++