Harvard:Biophysics 101/Notebook:ZS/2007-2-27
From OpenWetWare
Jump to navigationJump to search
Assignment 4
This program was extremely extremely painful to do... I really like Java/C++ better :D Anyways, it has the ability to:
- Find all open reading frames in a sequence of DNA
- Classify single and multiple mutations, deletions, and insertions
- Determine if the mutation is in exon or intron
- IF intron, determine that the mutation is harmful or not
THIS part is still being worked on, I can't get the protein translation table to work
There are some Achilles' heels in error-checking, however - hopefully will be implemented in further editions, but works in 95% of cases.
#Zachary Sun, Assignment 4
#2.26.07
#!/usr/bin/env python
#This program is able to:
##Find all open reading frames in a sequence of DNA
##Classify single and multiple mutations, deletions, and insertions
##Determine if the mutation is in exon or intron
##IF intron, determine that the mutation is harmful or not
###THIS part is still being worked on, I can't get the protein translation table to work
import os
import math
from Bio import Clustalw
from Bio import GenBank, Seq
from Bio.Seq import Seq,translate
from sets import Set
from string import *
#finds DNA complement (borrowed code!)
def complement(dna):
tab = maketrans("AGCTagct", "TCGAtcga")
return translate(dna, tab)
cmdline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'Apoe.fasta'))
cmdline.set_output('test.aln')
align = Clustalw.do_alignment(cmdline)
refSeqObj = align.get_all_seqs()
refSeq_forward = refSeqObj[0].seq.tostring()
refSeq_backward = complement(refSeq_forward)
readingFrame = [["",""],["",""],["",""],["",""],["",""],["",""]]
##NOTE: readingFrame stores in format [["frame0"]["frame1"]["frame2"]["rev_frame0"]...
## and in each frame [0] = startsite and [1] = endsite
#finds the site of a codon in the reference sequence passed
def codonFind(index, codon, refSeq):
for testCodon in codon:
if index+2 >= len(refSeq): #if out of bounds
return -1
elif refSeq[index] + refSeq[index+1] + refSeq[index+2] == testCodon: #equals codon
return index
return codonFind(index+3, codon, refSeq) #recursive call
#function to return a Set object of characters in a column
def colSetFunction(i):
col = align.get_column(i)
s = Set() # create a new set
for c in range(len(col)):
s.add(col[c]) # add each column element to the set
return s
#given that there is an abnormality at a single mutation, finds abnormality"
def findSingleMutation(i):
colSet = colSetFunction(i)
if '-' in colSet: #if there is a insertion or deletion
if refSeqObj[0].seq.tostring()[i] == "-": #test reference sequence
for z in range(len(refSeqObj)-1):
if refSeqObj[z+1].seq.tostring()[i] != "-":
print "***Intron abnormality (major)"
print "\tInsertion in sequence " , z+1, "of [",refSeqObj[z+1].seq.tostring()[i],"] at position " , i
print "\tFRAMESHIFT MUTATION!"
else: #there is a deletion
for z in range(len(refSeqObj)-1):
if refSeqObj[z+1].seq.tostring()[i] == "-":
print "***Intron abnormality (major)"
print "\tDeletion at sequence " , z+1, "of [",refSeqObj[0].seq.tostring()[i],"] at position " , i
print "\tFRAMESHIFT MUTATION!"
else: #there must be a mutation
for z in range(len(refSeqObj)-1):
if refSeqObj[0].seq.tostring()[i] != refSeqObj[z+1].seq.tostring()[i]:
print "*Intron abnormality (minor)"
print "\tMutation at sequence " , z+1, "of [",refSeqObj[0].seq.tostring()[i],"->", refSeqObj[z+1].seq.tostring()[i],"] at position " , i
proteinTranslate(i, 1, z+1)
#This function doesn't work yet, but it should translate a set of codons to protein
def proteinTranslate(i,toTrans, seq):#second value is number of codons to translate, #3rd value is sequence with mutation
origCode = ''
newCode = ''
for j in range(toTrans):
origCode = origCode + refSeqObj[0].seq.tostring()[i] +refSeqObj[0].seq.tostring()[i+1] + refSeqObj[0].seq.tostring()[i+2]
newCode = newCode + refSeqObj[seq].seq.tostring()[i] +refSeqObj[seq].seq.tostring()[i+1] + refSeqObj[seq].seq.tostring()[i+2]
i = i + 3
### print "\t Amino Acid change(or lack thereof) [",translate(origCode,'').tostring(),"]->[",translate(newCode, '').tostring(),"]"
### cannot get translation table to work
#given that there is an abnormality at a multiple mutation, finds abnormality"
#Note: cannot handle back to back muations/ins/del in different strands
#Note: cannot handle two back to back muations in same strand
def findMultipleMutation(i):
curIndex = i;
colSet = colSetFunction(i)
mutLength = 1
while(1):
curIndex = curIndex + 1
if len(colSetFunction(curIndex)) == 1: #no more mutation
break
else:
mutLength = mutLength+1
if refSeqObj[0].seq.tostring()[i] == "-": #test reference sequence shows insertion
for z in range(len(refSeqObj)-1):
if refSeqObj[z+1].seq.tostring()[i] != "-":
tempString = ''
for x in range(mutLength): #make the insertion strand
tempString = tempString + refSeqObj[z+1].seq.tostring()[i+x]
print "***Intron abnormality (major)"
print "\tMultiple Insertion in sequence " , z+1, "of [" ,tempString ,"] starting at position " , i
if mutLength%3 != 0: #if mutation not a multiple of 3
print "\tFRAMESHIFT MUTATION!"
else:
proteinTranslate(i,mutLength%3, z+1)
else: #there is a deletion
for z in range(len(refSeqObj)-1):
if refSeqObj[z+1].seq.tostring()[i] == "-":
tempString = ''
for x in range(mutLength): #make the insertion strand
tempString = tempString + refSeqObj[0].seq.tostring()[i+x]
print "***Intron abnormality (major)"
print "\tMultiple Deletion in sequence " , z+1, "of [" ,tempString ,"] starting at position " , i
if mutLength%3 != 0: #if mutation not a multiple of 3
print "\tFRAMESHIFT MUTATION!"
else:
proteinTranslate(i,mutLength%3, z+1)
return curIndex - 1
##Finding Start codons
readingFrame[0][0] = codonFind(0,["ATG"], refSeq_forward)
readingFrame[1][0] = codonFind(1,["ATG"], refSeq_forward)
readingFrame[2][0] = codonFind(2,["ATG"], refSeq_forward)
readingFrame[3][0] = codonFind(0,["ATG"], refSeq_backward)
readingFrame[4][0] = codonFind(1,["ATG"], refSeq_backward)
readingFrame[5][0] = codonFind(2,["ATG"], refSeq_backward)
##Finding Stop codons
for z in range(3):
w = z
while(1):
if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) == -1: #did not find start
readingFrame[z][1] = '-1'
break
elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) < codonFind(z, ["ATG"], refSeq_forward): #stop before start
w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_forward)
else:
readingFrame[z][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward)
break
for z in range(3):
w = z
while(1):
if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) == -1: #did not find start
readingFrame[z+3][1] = '-1'
break
elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) < codonFind(z, ["ATG"], refSeq_backward): #stop before start
w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_backward)
else:
readingFrame[z+3][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward)
break
##Output information for reading frames
print "***Reading Frame Information (Using Longest Length)***"
print "5->3 Frame 0:"
if readingFrame[0][0] == -1 or readingFrame[0][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[0][0], "\n\tEnd: ", readingFrame[0][1], "\n\tLength: " ,readingFrame[0][1]-readingFrame[0][0]
print "5->3 Frame 1:"
if readingFrame[1][0] == -1 or readingFrame[1][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[1][0], "\n\tEnd: ", readingFrame[1][1], "\n\tLength: ", readingFrame[1][1]-readingFrame[1][0]
print "5->3 Frame 2:"
if readingFrame[2][0] == -1 or readingFrame[2][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[2][0], "\n\tEnd: ", readingFrame[2][1], "\n\tLength: ", readingFrame[2][1]-readingFrame[2][0]
print "3->5 Frame 0 (reversed):"
if readingFrame[3][0] == -1 or readingFrame[3][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[3][0], "\n\tEnd: ", readingFrame[3][1], "\n\tLength: ", readingFrame[3][1]-readingFrame[3][0]
print "3->5 Frame 1 (reversed):"
if readingFrame[4][0] == -1 or readingFrame[4][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[4][0], "\n\tEnd: ", readingFrame[4][1], "\n\tLength: ", readingFrame[4][1]-readingFrame[4][0]
print "3->5 Frame 2 (reversed):"
if readingFrame[5][0] == -1 or readingFrame[5][1] == -1: #if DNE
print "\tNonexistant!"
else:
print "\tStart: " , readingFrame[5][0], "\n\tEnd: ", readingFrame[5][1], "\n\tLength: ", readingFrame[5][1]-readingFrame[5][0]
#Determining longest frame and assigning to finalRefSeq - assuming 5-3 directionality for now
longest=-1;
for r in range(6):
if readingFrame[r][0] != -1 and readingFrame[r][1] != -1:
if readingFrame[r][1]-readingFrame[r][0] > longest:
longest = readingFrame[r][1]-readingFrame[r][0]
longIndex = r
startSite = readingFrame[longIndex][0]
stopSite = readingFrame[longIndex][1]
#Main block to determine deviations from std.
i=0
print "\n***Information about allignment***"
while(i <align.get_alignment_length()):
colSet = colSetFunction(i)
if len(colSet) > 1: # multiple elements in s indicate a mismatch
if i<startSite or i>stopSite: #If it is considered an Exon
print "*Exon abnormality (minor)\n\tPosition ", i
elif len(colSetFunction(i+1)) == 1: #if the next position is normal
findSingleMutation(i)
else:
i = findMultipleMutation(i) #returns a new value for i
i = i + 1
The modified Apoe.fasta dataset can be found File:Apoe ZS.fasta.
The output for this modified dataset is:
***Reading Frame Information (Using Longest Length)*** 5->3 Frame 0: Start: 60 End: 246 Length: 186 5->3 Frame 1: Start: 304 End: 1015 Length: 711 5->3 Frame 2: Start: 104 End: 293 Length: 189 3->5 Frame 0 (reversed): Nonexistant! 3->5 Frame 1 (reversed): Start: 220 End: 646 Length: 426 3->5 Frame 2 (reversed): Start: 725 End: 1010 Length: 285 ***Information about allignment*** *Exon abnormality (minor) Position 152 ***Intron abnormality (major) Multiple Deletion in sequence 1 of [ TG ] starting at position 458 FRAMESHIFT MUTATION! ***Intron abnormality (major) Multiple Insertion in sequence 2 of [ TTG ] starting at position 588 *Intron abnormality (minor) Mutation at sequence 2 of [ G -> A ] at position 662 ***Intron abnormality (major) Multiple Deletion in sequence 1 of [ ACGAG ] starting at position 806 FRAMESHIFT MUTATION!