Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-2-20
From OpenWetWare
Jump to navigationJump to search
Script
#!/usr/bin/env python
import os
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from Bio import Clustalw
from Bio.Clustalw import MultipleAlignCL
from Bio.Align import AlignInfo
from sets import Set
mut = []
gap =[]
cold=[]
colm=[]
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary_align = AlignInfo.SummaryInfo(alignment)
for i in range(alignment.get_alignment_length()):
col = alignment.get_column(i)
s = Set() # create a new set
for c in range(len(col)):
s.add(col[c]) # add each column element to the set
if '-' in s:
gap.append(i)
cold.append(col)
ss=s.copy()
ss.remove('-')
if len(ss)>1:
mut.append(i)
colm.append(col)
elif len(s)>1:
mut.append(i)
colm.append(col)
for i in range(len(gap)):
print 'Deletion', cold[i], 'at', gap[i]
for i in range(len(mut)):
print 'Mismatch', colm[i], 'at', mut[i]
numseq=[]
orfs=[]
proteins=[]
seq=[]
numseq=alignment.get_all_seqs()
record_parser = GenBank.FeatureParser()
ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)
parsed_record = ncbi_dict['178850']
s = parsed_record.seq.tostring()
t=0
for i in range(len(numseq)):
se=numseq[i]
seq.append(se.seq.tostring())
start = seq[i].find('ATG')
orf =
c=start
for x in range(len(s)-start-4):
orf = orf + s[c]
c= c +1
length = c-start
remainder=length%3
if remainder == 0:
codon=s[c]+s[c+1]+s[c+2]
if codon== 'TAA' or codon=='TAG' or codon=='TGA':
orf=orf+s[c+1]+s[c+2]
break
orfs.append(orf)
proteins.append(translate(orfs[t]))
t=t+1
sil=0
cha=0
for p in range((len(proteins))-1):
if proteins[p+1]==proteins[0]:
sil=sil+1
else:
cha=cha+1
print 'There are', sil, 'amino acids that are contain either no mutations or only silent mutations.'
print 'There are', cha, 'nonsilent mutations that lead to amino acid changes.'
Output
Deletion A-C at 802 Deletion C-C at 803 Deletion G-G at 804 Deletion A-A at 805 Deletion G-G at 806 Mismatch GGA at 658 Mismatch A-C at 802 There are 5 amino acids that are contain either no mutations or only silent mutations. There are 2844 nonsilent mutations that lead to amino acid changes.
Notes
- There were also a few errors in the second half of my code which compares protein sequences, but I was not sure exactly what was causing these errors and would love any insight as to what went wrong. Thanks!
- Just so you know, I updated my program so there are no errors when it runs now, but there may be a few nonintuitive parts to it that could be improved. Please let me know!