Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-3-6

Notes: This isn't as clean as the sample output, but it compares two hardcoded strings and does mutation compatison. The functions package mutations and orfs into objects that could be exported for further manipulation. Though this version doesn't implement it, it would be fairly easy to load in multiple sequences to do pairwise alignments and call the main script (This also goes for the reverse strand comparisons). Also missing in this version is the final output proteins for the insertions and deletions. I couldn't quite figure out exactly we're supposed to handle out of fram mutations in getting the codons to line up.

import re, string from Bio import Transcribe transcriber = Transcribe.unambiguous_transcriber from Bio import Translate from Bio.Alphabet import IUPAC from Bio.Seq import Seq import os from Bio import Clustalw standard_translator = Translate.unambiguous_dna_by_id[1]
 * 1) find all orfs within the sequences

class codon: def __init__(self, sequence=""): self.sequence = sequence self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna)) self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]

class raw_mutation: def __init__(self,type="",ref_location=0,other_location=0, original="", mutant=""): self.type = type self.ref_location=ref_location self.other_location = other_location self.original = original self.mutant = mutant

class mutation: def __init__(self,type="",ref_span=[0,0],other_span=[0,0],original="", mutant=""): self.type = type self.ref_span = ref_span self.other_span = other_span self.original = original self.mutant = mutant class orf: #On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will #be ignored on the conversion to codons def __init__(self, source="", source_span=[]): self.source = source self.codons = [] self.sequence = source[source_span[0]:source_span[1]] self.source_span = source_span for i in range(len(self.sequence)/3): temp_codon = codon(self.sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases self.codons.append(temp_codon) #orfs are indexed by codons def __getitem__(self,index): return self.codons[index]

def find_mutations(self,other): refPos = 0 seqPos = 0 #print "Ref:",self #print "Other:",other all_mutations = [] for i in range(len(self)): #print self[i], " ", other[i], ' ', refPos, ' ', seqPos if self[i] != other[i]: if self[i] == '-': new_mutation = raw_mutation("insertion",refPos,seqPos) elif other[i] == '-': new_mutation = raw_mutation("deletion",refPos,seqPos) else: new_mutation = raw_mutation("point",refPos,seqPos, self[i],other[i]) all_mutations.append(new_mutation) #print "Mutation!" #print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location if self[i] != '-': refPos += 1 if other[i] != '-': seqPos += 1 return all_mutations def consolidate_mutations(raw_mutations): #I think this will have a bug for insertions before the first base of the refseq because #in every other case, ref_loc will refer to the base before the insertion consolidated_mutations =[] i = 0 while i < len(raw_mutations): #print "outer loop" current_type = raw_mutations[i].type if(current_type == "point"): ref_loc = raw_mutations[i].ref_location other_loc = raw_mutations[i].other_location new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc],raw_mutations[i].original, raw_mutations[i].mutant) consolidated_mutations.append(new_mutation) i += 1 elif(current_type == "deletion") or (current_type == "insertion"): ref_start = raw_mutations[i].ref_location other_start = raw_mutations[i].other_location ref_end = raw_mutations[i].ref_location other_end = raw_mutations[i].other_location i += 1 #It would have been nice to have a do while here... while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type): #hopefully it exits after the first condition so it doesn't go past array length if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)): new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end]) consolidated_mutations.append(new_mutation) #print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end break elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start): other_end += 1 i += 1 #print "extending insertion" elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start): ref_end += 1 i += 1 #print "extending deletion" else: print "I shouldn't be here!!!" break

return consolidated_mutations def findORFS(sequence, startpos=0): print "For",sequence all_orfs = [] start = re.compile('ATG') stop = re.compile('(TAA|TGA|TAG)') all_starts = start.finditer(sequence) all_stops = stop.finditer(sequence) all_starts_list = [] for match in all_starts: all_starts_list.append(match.span) all_stops_list = [] for stops in all_stops: all_stops_list.append(stops.span) #print stops.span for start in all_starts_list: found = 0 for stop in all_stops_list: diff = (stop[0]-start[0]) if ((diff>0) and ((diff%3) == 0)): print "orf at:", start[0]," ",stop[1] temp_orf = orf(sequence,(start[0],stop[1])) all_orfs.append(temp_orf) #all_orfs.append((start[0],stop[1])) found = 1 break if found ==0: print "open orf", temp_orf = orf(sequence,(start[0],None)) all_orfs.append(temp_orf) print "starting at:", start[0] #all_orfs.append((start[0],None)) return all_orfs

def check_point(mutation, ref_orfs): mut_pos = mutation.ref_span[0] #ref_span[0] and [1] are the same print "For ", mutation.type, "at position", mutation.ref_span[0], ':' for i in ref_orfs: if (mut_pos>=i.source_span[0] and mut_pos<=i.source_span[1]): #does the mutation affect the orf print "Point affects ", i.sequence ref_codon_position = (mut_pos - i.source_span[0])/3 base_position = (mut_pos-i.source_span[0])%3 original_aa = i[ref_codon_position].protein mut_codon = i[ref_codon_position].sequence #  print "Original Codon:", mut_codon mut_codon = list(mut_codon) mut_codon[base_position] = mutation.mutant mut_codon = "".join(mut_codon) #  print "Mutant Codon",mut_codon mutant_aa = standard_translator.translate(Seq(mut_codon, IUPAC.unambiguous_dna))[0] if (mutant_aa == original_aa): print "Silent Point Mutation detected: ", else: print "Non-silent Point Mutation detected", print (mutation.original+str(mut_pos)+mutation.mutant),"Protein result:",original_aa+str(mut_pos)+mutant_aa

def formatted_seq_print (sequence): for i in range(len(sequence)/3): start_base = i*3 print_codon = sequence.data[start_base:start_base+3] print print_codon, print ""

def formatted_codon_print (sequence): for i in range(len(sequence)/3): start_base = i*3 prot = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0] print_codon = sequence.data[start_base:start_base+3] #if currently in the deletion region, don't print print print_codon, print ""

def check_frame(mutation, all_orfs, all_records): if ((mutation.ref_span[1]-mutation.ref_span[0])%3 !=0) or ((mutation.other_span[1]-mutation.other_span[0])%3 !=0): print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is a frameshift" else: print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is in frame" print "odna:", formatted_seq_print (all_records[0].seq) #for i in range(len(something)/3): #   print " ",some_protein," " print "mdna:", formatted_seq_print (all_records[1].seq) #for i in range(len(something)/3): #   print " ",some_protein," "

teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT" teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG" print teststring teststring = re.sub("\s+", "", teststring) teststring2 = re.sub("\s+", "", teststring2) print "Forward orfs" allorfs = findORFS(teststring) allorfs2 = findORFS(teststring2) temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w") temp_file.write(">temp1|\n ") temp_file.write(teststring) temp_file.write("\n\n") temp_file.write(">gtemp2|\n ") temp_file.write(teststring2) temp_file.close cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) all_records = alignment.get_all_seqs mutations = find_mutations(all_records[0].seq,all_records[1].seq) all_real = consolidate_mutations (mutations) for i in all_real: print i.type, "at ref:", i.ref_span, " other ", i.other_span for i in all_real: if (i.type == "point"): check_point(i, allorfs) else: check_frame(i,allorfs,all_records)
 * 1) Main Program starts here
 * 1) print alignment
 * 1) print mutations

reverse_list1 = list(teststring)[:] reverse_list2 = list(teststring2)[:] teststring = "".join(teststring) teststring2 = "".join(teststring2) reverse_list1.reverse reverse_list2.reverse reverse_string1 = "".join(reverse_list1) reverse_string2 = "".join(reverse_list2) table = string.maketrans("ATGC","TACG") reverse_string1 = reverse_string1.translate(table) reverse_string2 = reverse_string2.translate(table) print "reverse orfs" revallorfs = findORFS(reverse_string1) revallorfs2 = findORFS(reverse_string2)
 * 1) print "orfs2"
 * 2) for i in allorfs2:
 * 3)    print i.sequence

Output:  A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT Forward For ATGGGGGGGAATGATTAACGTCGTTAAAGTATGTTTTTT orf at: 0  18 open orf starting at: 10 open orf starting at: 30 For CGAATGGGGGCGATGATTAACCGTTAAAGTATGTTTTTTGTAG orf at: 3  27 orf at: 12  27 open orf starting at: 30 insertion at ref: [0, 0] other  [0, 1] point at ref: [1, 1] other  [3, 3] point at ref: [2, 2] other  [4, 4] point at ref: [8, 8] other  [10, 10] point at ref: [9, 9] other  [11, 11] deletion at ref: [19, 20] other  [21, 21] insertion at ref: [39, 39] other  [39, 42] insertion from 0 to 0 is a frameshift odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG For point at position 1 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected T1A Protein result: M1K For point at position 2 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected G2T Protein result: M2I For point at position 8 : Point affects ATGGGGGGGAATGATTAA Silent Point Mutation detected: G8C Protein result: G8G For point at position 9 : Point affects ATGGGGGGGAATGATTAA Non-silent Point Mutation detected A9G Protein result: N9D deletion from 19 to 20 is a frameshift odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG insertion from 39 to 39 is in frame odna: --A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT- --- mdna: CGA ATG GGG GCG ATG ATT AAC --C GTT AAA GTA TGT TTT TTG TAG reverse For AAAAAACATACTTTAACGACGTTAATCATTCCCCCCCAT For CTACAAAAAACATACTTTAACGGTTAATCATCGCCCCCATTCG