Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-2-27

From OpenWetWare
Jump to navigationJump to search

Program Code

#!/usr/bin/env python

import os
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from Bio import Clustalw
from Bio.Clustalw import MultipleAlignCL
from Bio.Align import AlignInfo

cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)

numseq=[]
orfs=[]
proteins=[]
seqs=[]
numseq=alignment.get_all_seqs()
t=0

for i in range(len(numseq)):
    se=numseq[i]
    seqs.append(se.seq.tostring())
    sq=seqs[i]
    start = sq.find('ATG')
    orf = ''
    c=start
    for x in range(len(sq)-start-4):
        orf = orf + sq[c]
        c= c +1
        length = c-start
        remainder=length%3
        if remainder == 0:
            codon=sq[c]+sq[c+1]+sq[c+2]
            if codon== 'TAA' or codon=='TAG' or codon=='TGA':
                orf=orf+sq[c+1]+sq[c+2]
                break
            
    orfs.append(orf)
    proteins.append(translate(orfs[t]))
    t=t+1

star = seqs[0].find('ATG')
print "5'->3' ORF:  Start Codon Position:", star+1, " Stop Codon Position:", c+1 
    
for i in range(len(orfs[0])):
    if orfs[0][i]==orfs[1][i]==orfs[2][i]:
        continue
    x=(i+1)//3
    if not orfs[0][i]==orfs[1][i]:
        if orfs[1][i-1]=='-':
            break
        elif orfs[1][i]=='-':
            q=1
            for q in range(len(orfs[0])):
                if orfs[1][i+q]=='-':
                    continue
                else:
                    break
            if q%3==0:
                print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n    resulting in a non-frameshift mutation.'
            else:
                print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n    resulting in a frameshift mutation.'
        elif len(orfs[1])>len(orfs[0]):
            dif=len(orfs[1])-len(orfs[0])
            d=0
            for y in range(dif):
                if not orfs[1][i+y-dif+1]==orfs[0][i]:
                    continue
                elif orfs[1][i+y-dif+1]==orfs[0][i]:
                    one=''.join([orfs[1][j] for j in range(i+y-dif+1, i+y-dif+11)])
                    zero=''.join([orfs[0][j] for j in range(i+y-dif+1, i+y-dif+11)])
                    if one==zero:
                        if y%3==0:                            
                            print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n    resulting in a non-frameshift mutation.'
                        else:
                            print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n    resulting in a frameshift mutation.'            
                        break
        elif proteins[0][x]==proteins[1][x]:
            print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence one resulting in', proteins[0][x], x+1, proteins[2][x]
        else:
            print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in varable sequence one resulting in', proteins[0][x], x+1, proteins[2][x]
    if not orfs[0][i]==orfs[2][i]:
        if orfs[2][i-1]=='-':
            break
        elif orfs[2][i]=='-':
            q=1
            for q in range(len(orfs[0])):
                if orfs[2][i+q]=='-':
                    continue
                else:
                    break
            if q%3==0:
                print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n    resulting in a non-frameshift mutation.'
            else:
                print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n    resulting in a frameshift mutation.'
        elif len(orfs[2])>len(orfs[0]):
            dif=len(orfs[2])-len(orfs[0])
            d=0
            for y in range(dif):
                if not orfs[2][i+y]==orfs[0][i]:
                    continue
                elif orfs[2][i+y]==orfs[0][i]:
                    two=''.join([orfs[2][j] for j in range(i+y, i+y+10)])
                    zero=''.join([orfs[0][j] for j in range(i+y, i+y+10)])
                    if two==zero:
                        if y%3==0:                            
                            print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n    resulting in a non-frameshift mutation.'
                        else:
                            print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n    resulting in a frameshift mutation.'            
        elif proteins[0][x]==proteins[2][x]:
            print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x]
        else:
            print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x]

Output of Program

5'->3' ORF:  Start Codon Position: 61  Stop Codon Position: 1012
Non-silent point mutation G 599 A in variable sequence two resulting in G 200 E
5 base pair deletion in variable sequence one beginning at position 743 
    resulting in a frameshift mutation.
Non-silent point mutation A 743 C in variable sequence two resulting in D 248 A

Notes

  • I considered the first sequence in the apoe.fasta file to be the reference sequence, but this could also be downloaded directly from GenBank.
  • I realized there was an error in my earlier program output, so I decided to upload a corrected version. Previously, I did not take into account that the indexing started at zero, so all of my position values were off by 1. I think this new program should be correct though. Please let me know if you notice any issues with it. Thanks!