Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-3-6
From OpenWetWare
Jump to navigationJump to search
Notes: This isn't as clean as the sample output, but it compares two hardcoded strings and does mutation compatison. The functions package mutations and orfs into objects that could be exported for further manipulation. Though this version doesn't implement it, it would be fairly easy to load in multiple sequences to do pairwise alignments and call the main script (This also goes for the reverse strand comparisons). Also missing in this version is the final output proteins for the insertions and deletions. I couldn't quite figure out exactly we're supposed to handle out of fram mutations in getting the codons to line up.
#find all orfs within the sequences
import re, string
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
import os
from Bio import Clustalw
standard_translator = Translate.unambiguous_dna_by_id[1]
class codon:
def __init__(self, sequence=""):
self.sequence = sequence
self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]
class raw_mutation:
def __init__(self,type="",ref_location=0,other_location=0, original="", mutant=""):
self.type = type
self.ref_location=ref_location
self.other_location = other_location
self.original = original
self.mutant = mutant
class mutation:
def __init__(self,type="",ref_span=[0,0],other_span=[0,0],original="", mutant=""):
self.type = type
self.ref_span = ref_span
self.other_span = other_span
self.original = original
self.mutant = mutant
class orf:
#On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will
#be ignored on the conversion to codons
def __init__(self, source="", source_span=[]):
self.source = source
self.codons = []
self.sequence = source[source_span[0]:source_span[1]]
self.source_span = source_span
for i in range(len(self.sequence)/3):
temp_codon = codon(self.sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases
self.codons.append(temp_codon)
#orfs are indexed by codons
def __getitem__(self,index):
return self.codons[index]
def find_mutations(self,other):
refPos = 0
seqPos = 0
#print "Ref:",self
#print "Other:",other
all_mutations = []
for i in range(len(self)):
#print self[i], " ", other[i], ' ', refPos, ' ', seqPos
if self[i] != other[i]:
if self[i] == '-':
new_mutation = raw_mutation("insertion",refPos,seqPos)
elif other[i] == '-':
new_mutation = raw_mutation("deletion",refPos,seqPos)
else:
new_mutation = raw_mutation("point",refPos,seqPos, self[i],other[i])
all_mutations.append(new_mutation)
#print "Mutation!"
#print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location
if self[i] != '-':
refPos += 1
if other[i] != '-':
seqPos += 1
return all_mutations
def consolidate_mutations(raw_mutations):
#I think this will have a bug for insertions before the first base of the refseq because
#in every other case, ref_loc will refer to the base before the insertion
consolidated_mutations =[]
i = 0
while i < len(raw_mutations):
#print "outer loop"
current_type = raw_mutations[i].type
if(current_type == "point"):
ref_loc = raw_mutations[i].ref_location
other_loc = raw_mutations[i].other_location
new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc],raw_mutations[i].original, raw_mutations[i].mutant)
consolidated_mutations.append(new_mutation)
i += 1
elif(current_type == "deletion") or (current_type == "insertion"):
ref_start = raw_mutations[i].ref_location
other_start = raw_mutations[i].other_location
ref_end = raw_mutations[i].ref_location
other_end = raw_mutations[i].other_location
i += 1
#It would have been nice to have a do while here...
while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type):
#hopefully it exits after the first condition so it doesn't go past array length
if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)):
new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end])
consolidated_mutations.append(new_mutation)
#print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end
break
elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start):
other_end += 1
i += 1
#print "extending insertion"
elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start):
ref_end += 1
i += 1
#print "extending deletion"
else:
print "I shouldn't be here!!!"
break
return consolidated_mutations
def findORFS(sequence, startpos=0):
print "For",sequence
all_orfs = []
start = re.compile('ATG')
stop = re.compile('(TAA|TGA|TAG)')
all_starts = start.finditer(sequence)
all_stops = stop.finditer(sequence)
all_starts_list = []
for match in all_starts:
all_starts_list.append(match.span())
all_stops_list = []
for stops in all_stops:
all_stops_list.append(stops.span())
#print stops.span()
for start in all_starts_list:
found = 0
for stop in all_stops_list:
diff = (stop[0]-start[0])
if ((diff>0) and ((diff%3) == 0)):
print "orf at:", start[0]," ",stop[1]
temp_orf = orf(sequence,(start[0],stop[1]))
all_orfs.append(temp_orf)
#all_orfs.append((start[0],stop[1]))
found = 1
break
if found ==0:
print "open orf",
temp_orf = orf(sequence,(start[0],None))
all_orfs.append(temp_orf)
print "starting at:", start[0]
#all_orfs.append((start[0],None))
return all_orfs
def check_point(mutation, ref_orfs):
mut_pos = mutation.ref_span[0] #ref_span[0] and [1] are the same
print "For ", mutation.type, "at position", mutation.ref_span[0], ':'
for i in ref_orfs:
if (mut_pos>=i.source_span[0] and mut_pos<=i.source_span[1]): #does the mutation affect the orf
print "Point affects ", i.sequence
ref_codon_position = (mut_pos - i.source_span[0])/3
base_position = (mut_pos-i.source_span[0])%3
original_aa = i[ref_codon_position].protein
mut_codon = i[ref_codon_position].sequence
# print "Original Codon:", mut_codon
mut_codon = list(mut_codon)
mut_codon[base_position] = mutation.mutant
mut_codon = "".join(mut_codon)
# print "Mutant Codon",mut_codon
mutant_aa = standard_translator.translate(Seq(mut_codon, IUPAC.unambiguous_dna))[0]
if (mutant_aa == original_aa):
print "Silent Point Mutation detected: ",
else:
print "Non-silent Point Mutation detected",
print (mutation.original+str(mut_pos)+mutation.mutant),"Protein result:",original_aa+str(mut_pos)+mutant_aa
def formatted_seq_print (sequence):
for i in range(len(sequence)/3):
start_base = i*3
print_codon = sequence.data[start_base:start_base+3]
print print_codon,
print ""
def formatted_codon_print (sequence):
for i in range(len(sequence)/3):
start_base = i*3
prot = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]
print_codon = sequence.data[start_base:start_base+3]
#if currently in the deletion region, don't print
print print_codon,
print ""
def check_frame(mutation, all_orfs, all_records):
if ((mutation.ref_span[1]-mutation.ref_span[0])%3 !=0) or ((mutation.other_span[1]-mutation.other_span[0])%3 !=0):
print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is a frameshift"
else:
print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is in frame"
print "odna:",
formatted_seq_print (all_records[0].seq)
#for i in range(len(something)/3):
# print " ",some_protein," "
print "mdna:",
formatted_seq_print (all_records[1].seq)
#for i in range(len(something)/3):
# print " ",some_protein," "
#Main Program starts here
teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT"
teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG"
print teststring
teststring = re.sub("\s+", "", teststring)
teststring2 = re.sub("\s+", "", teststring2)
print "Forward orfs"
allorfs = findORFS(teststring)
allorfs2 = findORFS(teststring2)
temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w")
temp_file.write(">temp1|\n ")
temp_file.write(teststring)
temp_file.write("\n\n")
temp_file.write(">gtemp2|\n ")
temp_file.write(teststring2)
temp_file.close()
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
all_records = alignment.get_all_seqs()
#print alignment
mutations = find_mutations(all_records[0].seq,all_records[1].seq)
#print mutations
all_real = consolidate_mutations (mutations)
for i in all_real:
print i.type, "at ref:", i.ref_span, " other ", i.other_span
for i in all_real:
if (i.type == "point"):
check_point(i, allorfs)
else:
check_frame(i,allorfs,all_records)
reverse_list1 = list(teststring)[:]
reverse_list2 = list(teststring2)[:]
teststring = "".join(teststring)
teststring2 = "".join(teststring2)
reverse_list1.reverse()
reverse_list2.reverse()
reverse_string1 = "".join(reverse_list1)
reverse_string2 = "".join(reverse_list2)
table = string.maketrans("ATGC","TACG")
reverse_string1 = reverse_string1.translate(table)
reverse_string2 = reverse_string2.translate(table)
print "reverse orfs"
revallorfs = findORFS(reverse_string1)
revallorfs2 = findORFS(reverse_string2)
#print "orfs2"
#for i in allorfs2:
# print i.sequence
Output:
A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT Forward For ATGGGGGGGAATGATTAACGTCGTTAAAGTATGTTTTTT orf at: 0 18 open orf starting at: 10 open orf starting at: 30 For CGAATGGGGGCGATGATTAACCGTTAAAGTATGTTTTTTGTAG orf at: 3 27 orf at: 12 27 open orf starting at: 30 insertion at ref: [0, 0] other [0, 1] point at ref: [1, 1] other [3, 3] point at ref: [2, 2] other [4, 4] point at ref: [8, 8] other [10, 10] point at ref: [9, 9] other [11, 11] deletion at ref: [19, 20] other [21, 21] insertion at ref: [39, 39] other [39, 42] insertion from 0 to 0 is a frameshift odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG For point at position 1 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected T1A Protein result: M1K For point at position 2 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected G2T Protein result: M2I For point at position 8 : Point affects ATGGGGGGGAATGATTAA Silent Point Mutation detected: G8C Protein result: G8G For point at position 9 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected A9G Protein result: N9D deletion from 19 to 20 is a frameshift odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG insertion from 39 to 39 is in frame odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG reverse For AAAAAACATACTTTAACGACGTTAATCATTCCCCCCCAT For CTACAAAAAACATACTTTAACGGTTAATCATCGCCCCCATTCG