Difference between revisions of "Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-27"

Notes: I've decided to make an ORF class that indexes into a list of codon objects. Mutation detection will be built into the comparison operators of that class. It's taking me a while to figure out exactly how all of these pieces work...

```#find all orfs within the sequences
# Regular expressions are used below to locate start and stop codons.
import re
# Biopython transcription support; `transcriber` is the shared object the
# codon class calls to turn DNA into mRNA.
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
# Biopython translation support plus the alphabet and sequence wrapper that
# the codon class passes to the transcriber/translator.
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
# Translator looked up under id 1 -- presumably the standard genetic code
# table, as the variable name suggests; confirm against the Biopython docs.
standard_translator = Translate.unambiguous_dna_by_id[1]
class codon:
    """One codon: its DNA text plus the transcribed and translated forms.

    Attributes set by the constructor:
      sequence -- the DNA string exactly as it was passed in
      mRNA     -- whatever transcriber.transcribe() returns for that DNA
      protein  -- element 0 of the standard_translator translation
    """

    def __init__(self, sequence=""):
        # A single Seq wrapper is shared by both conversions below.
        dna = Seq(sequence, IUPAC.unambiguous_dna)
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(dna)
        # NOTE(review): the [0] index presumably fails when the translation
        # is empty (e.g. the default sequence="") -- confirm before relying
        # on the default argument.
        self.protein = standard_translator.translate(dna)[0]

class mutation:
    """Plain record describing one mutation.

    Attributes:
      type -- label for the kind of mutation (stored as given)
      span -- (start, stop) tuple built from the two position arguments
    """

    def __init__(self, type="", start=0, stop=0):
        # `type` shadows the builtin, but it is the keyword name callers can
        # pass, so it is kept as-is.
        self.span = (start, stop)
        self.type = type

class orf:
    """An open reading frame stored as a list of codon objects.

    Attributes:
      sequence -- the full DNA string passed to the constructor, including
                  any trailing bases that do not fill a whole codon
      codons   -- list of codon objects, one per complete triplet, in order
    """

    def __init__(self, sequence=""):
        """Split `sequence` into triplets and wrap each one in a codon.

        If the ORF has no stop codon its length may not be a multiple of
        three; the one or two excess bases at the end are ignored when
        separating into codons (they remain available in self.sequence).
        """
        self.sequence = sequence
        # Floor division keeps this an int on Python 3 as well as Python 2.
        whole = (len(sequence) // 3) * 3
        self.codons = [codon(sequence[i:i + 3]) for i in range(0, whole, 3)]

    def __getitem__(self, index):
        """Return the codon at `index`; ORFs are indexed by codon, not base.

        Iterating over an orf (`for c in some_orf`) also goes through this
        method: the underlying list raises IndexError past the last codon,
        which ends the loop.
        """
        return self.codons[index]

    def __eq__(self, other):
        """Placeholder for ORF comparison.

        The plan for this method is that comparing two ORFs yields a list of
        mutations, but that logic has not been written yet. The original
        stub had no colon and no body, which made the whole file a syntax
        error. Until the comparison is implemented, return NotImplemented so
        Python falls back to its default comparison instead of reporting a
        made-up result.
        """
        # TODO: build and return the list of mutation objects here.
        return NotImplemented

def findORFS(sequence, startpos=0):
    """Find open reading frames in a DNA string and return their spans.

    Every 'ATG' in `sequence` is treated as a start. Each start is paired
    with the first stop codon (TAA, TGA or TAG) that lies after it and in
    the same reading frame.

    Parameters:
      sequence -- DNA string to scan (upper-case, no whitespace)
      startpos -- accepted for compatibility but currently unused
                  NOTE(review): the intended meaning is not clear from this
                  file; confirm before wiring it in.

    Returns:
      A list with one (begin, end) tuple per start codon, in order of
      appearance. `begin` is the index of the 'A' of ATG; `end` is the index
      just past the stop codon, or -1 when no in-frame stop exists. Callers
      must treat -1 as a sentinel, not as a slice index.

    Progress is printed to stdout. Each print uses a single pre-formatted
    argument so the output is identical under the Python 2 print statement
    and the Python 3 print function.
    """
    # Distinct names keep the compiled patterns from being overwritten by
    # the loop variables further down.
    start_codon = re.compile('ATG')
    stop_codon = re.compile('(TAA|TGA|TAG)')

    print("Infunction:  %s" % (sequence,))

    print("starts:")
    all_starts_list = []
    for match in start_codon.finditer(sequence):
        all_starts_list.append(match.span())
        print(match.span())

    print("stops:")
    all_stops_list = []
    for match in stop_codon.finditer(sequence):
        all_stops_list.append(match.span())
        print(match.span())

    all_orfs = []
    for start_span in all_starts_list:
        for stop_span in all_stops_list:
            print("checking %s and %s" % (start_span[0], stop_span[0]))
            diff = stop_span[0] - start_span[0]
            # In frame means the stop begins a whole number of codons
            # after the start.
            if diff > 0 and diff % 3 == 0:
                print("orf at: %s   %s" % (start_span[0], stop_span[1]))
                all_orfs.append((start_span[0], stop_span[1]))
                break
        else:
            # No in-frame stop was found for this start.
            all_orfs.append((start_span[0], -1))

    return all_orfs

#Main Program starts here
# Each print below passes one pre-formatted argument so the output is the
# same under the Python 2 print statement and the Python 3 print function.
teststring = "ATG GGG GGG AAT GAT TAA CGT CGT TAA AGT ATG TTT TTT GTA G"
print(teststring)
# Remove the spaces that were only there to make the codons readable.
teststring = re.sub(r"\s+", "", teststring)
print('chomped: %s' % teststring)
allspans = findORFS(teststring)
allorfs = []
print("spans")
print(allspans)
for begin, end in allspans:
    if end == -1:
        # findORFS reports -1 when a start has no in-frame stop. Used as a
        # slice end, -1 would silently drop the final base, so take
        # everything to the end of the sequence instead.
        temp_orf = orf(teststring[begin:])
    else:
        temp_orf = orf(teststring[begin:end])
    allorfs.append(temp_orf)
print(allorfs)
for current in allorfs:
    print("orfseq: %s" % current.sequence)
    # Iterating an orf yields its codon objects one by one.
    for triplet in current:
        print(triplet.sequence)
        print(triplet.protein)

```