Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-2-27
From OpenWetWare
Jump to navigationJump to search
Program Code
#!/usr/bin/env python
import os
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from Bio import Clustalw
from Bio.Clustalw import MultipleAlignCL
from Bio.Align import AlignInfo
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
numseq=[]
orfs=[]
proteins=[]
seqs=[]
numseq=alignment.get_all_seqs()
t=0
for i in range(len(numseq)):
se=numseq[i]
seqs.append(se.seq.tostring())
sq=seqs[i]
start = sq.find('ATG')
orf = ''
c=start
for x in range(len(sq)-start-4):
orf = orf + sq[c]
c= c +1
length = c-start
remainder=length%3
if remainder == 0:
codon=sq[c]+sq[c+1]+sq[c+2]
if codon== 'TAA' or codon=='TAG' or codon=='TGA':
orf=orf+sq[c+1]+sq[c+2]
break
orfs.append(orf)
proteins.append(translate(orfs[t]))
t=t+1
star = seqs[0].find('ATG')
print "5'->3' ORF: Start Codon Position:", star+1, " Stop Codon Position:", c+1
for i in range(len(orfs[0])):
if orfs[0][i]==orfs[1][i]==orfs[2][i]:
continue
x=(i+1)//3
if not orfs[0][i]==orfs[1][i]:
if orfs[1][i-1]=='-':
break
elif orfs[1][i]=='-':
q=1
for q in range(len(orfs[0])):
if orfs[1][i+q]=='-':
continue
else:
break
if q%3==0:
print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n resulting in a non-frameshift mutation.'
else:
print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n resulting in a frameshift mutation.'
elif len(orfs[1])>len(orfs[0]):
dif=len(orfs[1])-len(orfs[0])
d=0
for y in range(dif):
if not orfs[1][i+y-dif+1]==orfs[0][i]:
continue
elif orfs[1][i+y-dif+1]==orfs[0][i]:
one=''.join([orfs[1][j] for j in range(i+y-dif+1, i+y-dif+11)])
zero=''.join([orfs[0][j] for j in range(i+y-dif+1, i+y-dif+11)])
if one==zero:
if y%3==0:
print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n resulting in a non-frameshift mutation.'
else:
print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n resulting in a frameshift mutation.'
break
elif proteins[0][x]==proteins[1][x]:
print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence one resulting in', proteins[0][x], x+1, proteins[2][x]
else:
print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in varable sequence one resulting in', proteins[0][x], x+1, proteins[2][x]
if not orfs[0][i]==orfs[2][i]:
if orfs[2][i-1]=='-':
break
elif orfs[2][i]=='-':
q=1
for q in range(len(orfs[0])):
if orfs[2][i+q]=='-':
continue
else:
break
if q%3==0:
print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n resulting in a non-frameshift mutation.'
else:
print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n resulting in a frameshift mutation.'
elif len(orfs[2])>len(orfs[0]):
dif=len(orfs[2])-len(orfs[0])
d=0
for y in range(dif):
if not orfs[2][i+y]==orfs[0][i]:
continue
elif orfs[2][i+y]==orfs[0][i]:
two=''.join([orfs[2][j] for j in range(i+y, i+y+10)])
zero=''.join([orfs[0][j] for j in range(i+y, i+y+10)])
if two==zero:
if y%3==0:
print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n resulting in a non-frameshift mutation.'
else:
print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n resulting in a frameshift mutation.'
elif proteins[0][x]==proteins[2][x]:
print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x]
else:
print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x]
Output of Program
5'->3' ORF: Start Codon Position: 61 Stop Codon Position: 1012
Non-silent point mutation G 599 A in variable sequence two resulting in G 200 E
5 base pair deletion in variable sequence one beginning at position 743
resulting in a frameshift mutation.
Non-silent point mutation A 743 C in variable sequence two resulting in D 248 A
Notes
- I considered the first sequence in the apoe.fasta file to be the reference sequence, but this could also be downloaded directly from GenBank.
- I realized there was an error in my earlier program output, so I decided to upload a corrected version. Previously, I did not take into account that the indexing started at zero, so all of my position values were off by 1. I think this new program should be correct though. Please let me know if you notice any issues with it. Thanks!