Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-2-27
From OpenWetWare
Jump to navigationJump to search
Program Code
#!/usr/bin/env python import os from Bio import GenBank, Seq from Bio.Seq import Seq,translate from Bio import Clustalw from Bio.Clustalw import MultipleAlignCL from Bio.Align import AlignInfo cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) numseq=[] orfs=[] proteins=[] seqs=[] numseq=alignment.get_all_seqs() t=0 for i in range(len(numseq)): se=numseq[i] seqs.append(se.seq.tostring()) sq=seqs[i] start = sq.find('ATG') orf = '' c=start for x in range(len(sq)-start-4): orf = orf + sq[c] c= c +1 length = c-start remainder=length%3 if remainder == 0: codon=sq[c]+sq[c+1]+sq[c+2] if codon== 'TAA' or codon=='TAG' or codon=='TGA': orf=orf+sq[c+1]+sq[c+2] break orfs.append(orf) proteins.append(translate(orfs[t])) t=t+1 star = seqs[0].find('ATG') print "5'->3' ORF: Start Codon Position:", star+1, " Stop Codon Position:", c+1 for i in range(len(orfs[0])): if orfs[0][i]==orfs[1][i]==orfs[2][i]: continue x=(i+1)//3 if not orfs[0][i]==orfs[1][i]: if orfs[1][i-1]=='-': break elif orfs[1][i]=='-': q=1 for q in range(len(orfs[0])): if orfs[1][i+q]=='-': continue else: break if q%3==0: print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n resulting in a non-frameshift mutation.' else: print q, 'base pair deletion in variable sequence one beginning at position', i+1, '\n resulting in a frameshift mutation.' elif len(orfs[1])>len(orfs[0]): dif=len(orfs[1])-len(orfs[0]) d=0 for y in range(dif): if not orfs[1][i+y-dif+1]==orfs[0][i]: continue elif orfs[1][i+y-dif+1]==orfs[0][i]: one=''.join([orfs[1][j] for j in range(i+y-dif+1, i+y-dif+11)]) zero=''.join([orfs[0][j] for j in range(i+y-dif+1, i+y-dif+11)]) if one==zero: if y%3==0: print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n resulting in a non-frameshift mutation.' else: print y+dif, 'base pair insertion in variable sequence one beginning at position', i+1, '\n resulting in a frameshift mutation.' break elif proteins[0][x]==proteins[1][x]: print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence one resulting in', proteins[0][x], x+1, proteins[2][x] else: print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in varable sequence one resulting in', proteins[0][x], x+1, proteins[2][x] if not orfs[0][i]==orfs[2][i]: if orfs[2][i-1]=='-': break elif orfs[2][i]=='-': q=1 for q in range(len(orfs[0])): if orfs[2][i+q]=='-': continue else: break if q%3==0: print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n resulting in a non-frameshift mutation.' else: print q, 'base pair deletion in variable sequence two beginning at position', i+1, '\n resulting in a frameshift mutation.' elif len(orfs[2])>len(orfs[0]): dif=len(orfs[2])-len(orfs[0]) d=0 for y in range(dif): if not orfs[2][i+y]==orfs[0][i]: continue elif orfs[2][i+y]==orfs[0][i]: two=''.join([orfs[2][j] for j in range(i+y, i+y+10)]) zero=''.join([orfs[0][j] for j in range(i+y, i+y+10)]) if two==zero: if y%3==0: print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n resulting in a non-frameshift mutation.' else: print y, 'base pair insertion in variable sequence two beginning at position', i+1, '\n resulting in a frameshift mutation.' elif proteins[0][x]==proteins[2][x]: print 'Silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x] else: print 'Non-silent point mutation', orfs[0][i], i+1, orfs[2][i], 'in variable sequence two resulting in', proteins[0][x], x+1, proteins[2][x]
Output of Program
5'->3' ORF: Start Codon Position: 61 Stop Codon Position: 1012 Non-silent point mutation G 599 A in variable sequence two resulting in G 200 E 5 base pair deletion in variable sequence one beginning at position 743 resulting in a frameshift mutation. Non-silent point mutation A 743 C in variable sequence two resulting in D 248 A
Notes
- I considered the first sequence in the apoe.fasta file to be the reference sequence, but this could also be downloaded directly from GenBank.
- I realized there was an error in my earlier program output, so I decided to upload a corrected version. Previously, I did not take into account that the indexing started at zero, so all of my position values were off by 1. I think this new program should be correct though. Please let me know if you notice any issues with it. Thanks!