Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-5-1
From OpenWetWare
Jump to navigationJump to search
Here's the code for a function that compares a query and reference sequence, returning a list of mutation objects that specify its position in both sequences. Its based on the program wrote a long time ago. The actual useful function is called:
def getAllMutations (referenceSequence, querySequence):
Here is the mutation object:
class mutation:
def __init__(self,type="",ref_span=[0,0],other_span=[0,0]):
self.type = type
self.ref_span = ref_span
self.other_span = other_span
And here is the code:
#find all orfs within the sequences
import re
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
import os
from Bio import Clustalw
standard_translator = Translate.unambiguous_dna_by_id[1]
class codon:
def __init__(self, sequence=""):
self.sequence = sequence
self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]
class raw_mutation:
def __init__(self,type="",ref_location=0,other_location=0):
self.type = type
self.ref_location=ref_location
self.other_location = other_location
class mutation:
def __init__(self,type="",ref_span=[0,0],other_span=[0,0]):
self.type = type
self.ref_span = ref_span
self.other_span = other_span
class orf:
#On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will
#be ignored on the conversion to codons
def __init__(self, source="", source_span=[]):
self.source = source
self.codons = []
self.sequence = source[source_span[0]:source_span[1]]
for i in range(len(self.sequence)/3):
temp_codon = codon(self.sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases
self.codons.append(temp_codon)
#orfs are indexed by codons
def __getitem__(self,index):
return self.codons[index]
def find_mutations(self,other):
refPos = 0
seqPos = 0
#print "Ref:",self
#print "Other:",other
all_mutations = []
for i in range(len(self)):
#print self[i], " ", other[i], ' ', refPos, ' ', seqPos
if self[i] != other[i]:
if self[i] == '-':
new_mutation = raw_mutation("insertion",refPos,seqPos)
elif other[i] == '-':
new_mutation = raw_mutation("deletion",refPos,seqPos)
else:
new_mutation = raw_mutation("point",refPos,seqPos)
all_mutations.append(new_mutation)
#print "Mutation!"
print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location
if self[i] != '-':
refPos += 1
if other[i] != '-':
seqPos += 1
return all_mutations
def consolidate_mutations(raw_mutations):
#I think this will have a bug for insertions before the first base of the refseq because
#in every other case, ref_loc will refer to the base before the insertion
consolidated_mutations =[]
i = 0
while i < len(raw_mutations):
#print "outer loop"
current_type = raw_mutations[i].type
if(current_type == "point"):
ref_loc = raw_mutations[i].ref_location
other_loc = raw_mutations[i].other_location
new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc])
consolidated_mutations.append(new_mutation)
i += 1
elif(current_type == "deletion") or (current_type == "insertion"):
ref_start = raw_mutations[i].ref_location
other_start = raw_mutations[i].other_location
ref_end = raw_mutations[i].ref_location
other_end = raw_mutations[i].other_location
i += 1
#It would have been nice to have a do while here...
while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type):
#hopefully it exits after the first condition so it doesn't go past array length
if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)):
new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end])
consolidated_mutations.append(new_mutation)
print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end
break
elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start):
other_end += 1
i += 1
#print "extending insertion"
elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start):
ref_end += 1
i += 1
#print "extending deletion"
else:
print "I shouldn't be here!!!"
break
return consolidated_mutations
def findORFS(sequence, startpos=0):
all_orfs = []
start = re.compile('ATG')
stop = re.compile('(TAA|TGA|TAG)')
all_starts = start.finditer(sequence)
all_stops = stop.finditer(sequence)
all_starts_list = []
for match in all_starts:
all_starts_list.append(match.span())
all_stops_list = []
for stops in all_stops:
all_stops_list.append(stops.span())
#print stops.span()
for start in all_starts_list:
found = 0
for stop in all_stops_list:
diff = (stop[0]-start[0])
if ((diff>0) and ((diff%3) == 0)):
print "orf at:", start[0]," ",stop[1]
all_orfs.append((start[0],stop[1]))
found = 1
break
if found ==0:
all_orfs.append((start[0],None))
return all_orfs
def getAllMutations (referenceSequence, querySequence):
teststring = re.sub("\s+", "", referenceSequence)
teststring2 = re.sub("\s+", "", querySequence)
allspans = findORFS(teststring)
allspans2 = findORFS(teststring2)
allorfs = []
allorfs2 = []
print "spans"
print allspans2
for i in allspans:
x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int
y = i[1]
temp_orf = orf(teststring,i)
allorfs.append(temp_orf)
print "orfs2"
for i in allspans2:
x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int
y = i[1]
temp_orf = orf(teststring2,i)
allorfs2.append(temp_orf)
print temp_orf.sequence
temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w")
temp_file.write(">gi|23509994|ref|NC_004318.1| Plasmodium\n ")
temp_file.write(teststring)
temp_file.write("\n\n")
temp_file.write(">gi|239809994|ref|NC_004318.1| Plasmodium\n ")
temp_file.write(teststring2)
temp_file.close()
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
all_records = alignment.get_all_seqs()
print alignment
mutations = find_mutations(all_records[0].seq,all_records[1].seq)
all_real = consolidate_mutations (mutations)
return all_real
#print mutations
all_real = consolidate_mutations (mutations)
#def is_silent(mutation, ref_orfs):
#for i in ref_orfs:
#def is_frameshift(mutation, all_orfs):
#Main Program starts here
teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT"
teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG"
all_real = getAllMutations(teststring,teststring2)
for i in all_real:
print i.type, "at ref:", i.ref_span, " other ", i.other_span