Harvard:Biophysics 101/Notebook:ZS/2007-2-27

Assignment 4
This program was extremely extremely painful to do... I really like Java/C++ better :D Anyways, it has the ability to: THIS part is still being worked on, I can't get the protein translation table to work
 * Find all open reading frames in a sequence of DNA
 * Classify single and multiple mutations, deletions, and insertions
 * Determine if the mutation is in exon or intron
 * IF intron, determine that the mutation is harmful or not

There are some Achilles' heels in error-checking, however - hopefully will be implemented in further editions, but works in 95% of cases.


 * 1) Zachary Sun, Assignment 4
 * 2) 2.26.07
 * 3) !/usr/bin/env python
 * 4) This program is able to:
 * 5) Find all open reading frames in a sequence of DNA
 * 6) Classify single and multiple mutations, deletions, and insertions
 * 7) Determine if the mutation is in exon or intron
 * 8) IF intron, determine that the mutation is harmful or not
 * 9) THIS part is still being worked on, I can't get the protein translation table to work

import os import math from Bio import Clustalw from Bio import GenBank, Seq from Bio.Seq import Seq,translate from sets import Set from string import *

def complement(dna): tab = maketrans("AGCTagct", "TCGAtcga") return translate(dna, tab)
 * 1) finds DNA complement (borrowed code!)

cmdline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'Apoe.fasta')) cmdline.set_output('test.aln') align = Clustalw.do_alignment(cmdline)

refSeqObj = align.get_all_seqs refSeq_forward = refSeqObj[0].seq.tostring refSeq_backward = complement(refSeq_forward) readingFrame = "",""],["",""],["",""],["",""],["",""],["",""
 * 1) NOTE: readingFrame stores in format [["frame0"]["frame1"]["frame2"]["rev_frame0"]...
 * 2) and in each frame [0] = startsite and [1] = endsite

def codonFind(index, codon, refSeq): for testCodon in codon: if index+2 >= len(refSeq): #if out of bounds return -1 elif refSeq[index] + refSeq[index+1] + refSeq[index+2] == testCodon: #equals codon return index return codonFind(index+3, codon, refSeq) #recursive call
 * 1) finds the site of a codon in the reference sequence passed

def colSetFunction(i): col = align.get_column(i) s = Set # create a new set for c in range(len(col)): s.add(col[c]) # add each column element to the set return s
 * 1) function to return a Set object of characters in a column

def findSingleMutation(i): colSet = colSetFunction(i) if '-' in colSet: #if there is a insertion or deletion if refSeqObj[0].seq.tostring[i] == "-": #test reference sequence for z in range(len(refSeqObj)-1): if refSeqObj[z+1].seq.tostring[i] != "-": print "***Intron abnormality (major)" print "\tInsertion in sequence ", z+1, "of [",refSeqObj[z+1].seq.tostring[i],"] at position " , i                    print "\tFRAMESHIFT MUTATION!" else: #there is a deletion for z in range(len(refSeqObj)-1): if refSeqObj[z+1].seq.tostring[i] == "-": print "***Intron abnormality (major)" print "\tDeletion at sequence ", z+1, "of [",refSeqObj[0].seq.tostring[i],"] at position " , i                    print "\tFRAMESHIFT MUTATION!" else: #there must be a mutation for z in range(len(refSeqObj)-1): if refSeqObj[0].seq.tostring[i] != refSeqObj[z+1].seq.tostring[i]: print "*Intron abnormality (minor)" print "\tMutation at sequence ", z+1, "of [",refSeqObj[0].seq.tostring[i],"->", refSeqObj[z+1].seq.tostring[i],"] at position " , i               proteinTranslate(i, 1, z+1)
 * 1) given that there is an abnormality at a single mutation, finds abnormality"

def proteinTranslate(i,toTrans, seq):#second value is number of codons to translate, #3rd value is sequence with mutation origCode = '' newCode = '' for j in range(toTrans): origCode = origCode + refSeqObj[0].seq.tostring[i] +refSeqObj[0].seq.tostring[i+1] + refSeqObj[0].seq.tostring[i+2] newCode = newCode + refSeqObj[seq].seq.tostring[i] +refSeqObj[seq].seq.tostring[i+1] + refSeqObj[seq].seq.tostring[i+2] i = i + 3
 * 1) This function doesn't work yet, but it should translate a set of codons to protein

### print "\t Amino Acid change(or lack thereof) [",translate(origCode,).tostring,"]->[",translate(newCode, ).tostring,"]" ### cannot get translation table to work def findMultipleMutation(i): curIndex = i;   colSet = colSetFunction(i) mutLength = 1 while(1): curIndex = curIndex + 1 if len(colSetFunction(curIndex)) == 1: #no more mutation break else: mutLength = mutLength+1
 * 1) given that there is an abnormality at a multiple mutation, finds abnormality"
 * 2) Note: cannot handle back to back muations/ins/del in different strands
 * 3) Note: cannot handle two back to back muations in same strand

if refSeqObj[0].seq.tostring[i] == "-": #test reference sequence shows insertion for z in range(len(refSeqObj)-1): if refSeqObj[z+1].seq.tostring[i] != "-": tempString = '' for x in range(mutLength): #make the insertion strand tempString = tempString + refSeqObj[z+1].seq.tostring[i+x] print "***Intron abnormality (major)" print "\tMultiple Insertion in sequence ", z+1, "of [" ,tempString ,"] starting at position " , i               if mutLength%3 != 0: #if mutation not a multiple of 3 print "\tFRAMESHIFT MUTATION!" else: proteinTranslate(i,mutLength%3, z+1) else: #there is a deletion for z in range(len(refSeqObj)-1): if refSeqObj[z+1].seq.tostring[i] == "-": tempString = '' for x in range(mutLength): #make the insertion strand tempString = tempString + refSeqObj[0].seq.tostring[i+x] print "***Intron abnormality (major)" print "\tMultiple Deletion in sequence ", z+1, "of [" ,tempString ,"] starting at position " , i               if mutLength%3 != 0: #if mutation not a multiple of 3 print "\tFRAMESHIFT MUTATION!" else: proteinTranslate(i,mutLength%3, z+1) return curIndex - 1

readingFrame[0][0] = codonFind(0,["ATG"], refSeq_forward) readingFrame[1][0] = codonFind(1,["ATG"], refSeq_forward) readingFrame[2][0] = codonFind(2,["ATG"], refSeq_forward) readingFrame[3][0] = codonFind(0,["ATG"], refSeq_backward) readingFrame[4][0] = codonFind(1,["ATG"], refSeq_backward) readingFrame[5][0] = codonFind(2,["ATG"], refSeq_backward)
 * 1) Finding Start codons

for z in range(3): w = z   while(1): if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) == -1: #did not find start readingFrame[z][1] = '-1' break elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) < codonFind(z, ["ATG"], refSeq_forward): #stop before start w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_forward) else: readingFrame[z][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_forward) break for z in range(3): w = z   while(1): if codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) == -1: #did not find start readingFrame[z+3][1] = '-1' break elif codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) < codonFind(z, ["ATG"], refSeq_backward): #stop before start w = 3 + codonFind(w, ["TAA","TAG","TGA"], refSeq_backward) else: readingFrame[z+3][1] = codonFind(w, ["TAA", "TAG", "TGA"], refSeq_backward) break
 * 1) Finding Stop codons

print "***Reading Frame Information (Using Longest Length)***" print "5->3 Frame 0:" if readingFrame[0][0] == -1 or readingFrame[0][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[0][0], "\n\tEnd: ", readingFrame[0][1], "\n\tLength: " ,readingFrame[0][1]-readingFrame[0][0] print "5->3 Frame 1:" if readingFrame[1][0] == -1 or readingFrame[1][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[1][0], "\n\tEnd: ", readingFrame[1][1], "\n\tLength: ", readingFrame[1][1]-readingFrame[1][0] print "5->3 Frame 2:" if readingFrame[2][0] == -1 or readingFrame[2][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[2][0], "\n\tEnd: ", readingFrame[2][1], "\n\tLength: ", readingFrame[2][1]-readingFrame[2][0] print "3->5 Frame 0 (reversed):" if readingFrame[3][0] == -1 or readingFrame[3][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[3][0], "\n\tEnd: ", readingFrame[3][1], "\n\tLength: ", readingFrame[3][1]-readingFrame[3][0] print "3->5 Frame 1 (reversed):" if readingFrame[4][0] == -1 or readingFrame[4][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[4][0], "\n\tEnd: ", readingFrame[4][1], "\n\tLength: ", readingFrame[4][1]-readingFrame[4][0] print "3->5 Frame 2 (reversed):" if readingFrame[5][0] == -1 or readingFrame[5][1] == -1: #if DNE print "\tNonexistant!" else: print "\tStart: ", readingFrame[5][0], "\n\tEnd: ", readingFrame[5][1], "\n\tLength: ", readingFrame[5][1]-readingFrame[5][0]
 * 1) Output information for reading frames

longest=-1; for r in range(6): if readingFrame[r][0] != -1 and readingFrame[r][1] != -1: if readingFrame[r][1]-readingFrame[r][0] > longest: longest = readingFrame[r][1]-readingFrame[r][0] longIndex = r startSite = readingFrame[longIndex][0] stopSite = readingFrame[longIndex][1] i=0 print "\n***Information about allignment***" while(i  1: # multiple elements in s indicate a mismatch if istopSite: #If it is considered an Exon print "*Exon abnormality (minor)\n\tPosition ", i       elif len(colSetFunction(i+1)) == 1: #if the next position is normal findSingleMutation(i) else: i = findMultipleMutation(i) #returns a new value for i   i = i + 1
 * 1) Determining longest frame and assigning to finalRefSeq - assuming 5-3 directionality for now
 * 1) Main block to determine deviations from std.

The modified Apoe.fasta dataset can be found.

The output for this modified dataset is: 5->3 Frame 0: Start: 60 End: 246 Length: 186 5->3 Frame 1: Start: 304 End: 1015 Length: 711 5->3 Frame 2: Start: 104 End: 293 Length: 189 3->5 Frame 0 (reversed): Nonexistant! 3->5 Frame 1 (reversed): Start: 220 End: 646 Length: 426 3->5 Frame 2 (reversed): Start: 725 End: 1010 Length: 285
 * Reading Frame Information (Using Longest Length)***

Position 152 Multiple Deletion in sequence 1 of [ TG ] starting at position  458 FRAMESHIFT MUTATION! Multiple Insertion in sequence 2 of [ TTG ] starting at position  588 Mutation at sequence 2 of [ G -> A ] at position  662 Multiple Deletion in sequence 1 of [ ACGAG ] starting at position  806 FRAMESHIFT MUTATION!
 * Information about allignment***
 * Exon abnormality (minor)
 * Intron abnormality (major)
 * Intron abnormality (major)
 * Intron abnormality (minor)
 * Intron abnormality (major)