Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-28

I've tried to design this program in a somehwat formulaic oo design. Clearly, I'm a bit rusty as it's taking alot longer than I thought it would. The program is still not yet complete, but at this point, it can compare two sequences, extracting insertions and deletions and it extracts orfs from individual sequences with translations built into the orf objects. All that remains is to take the detected mutation objects and map them to the orf objects to contextualize them...

import re from Bio import Transcribe transcriber = Transcribe.unambiguous_transcriber from Bio import Translate from Bio.Alphabet import IUPAC from Bio.Seq import Seq import os from Bio import Clustalw standard_translator = Translate.unambiguous_dna_by_id[1]
 * 1) find all orfs within the sequences

class codon: def __init__(self, sequence=""): self.sequence = sequence self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna)) self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]

class raw_mutation: def __init__(self,type="",ref_location=0,other_location=0): self.type = type self.ref_location=ref_location self.other_location = other_location

class mutation: def __init__(self,type="",ref_span=[0,0],other_span=[0,0]): self.type = type self.ref_span = ref_span self.other_span = other_span class orf: #On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will #be ignored on the conversion to codons def __init__(self, sequence=""): self.codons = [] for i in range(len(sequence)/3): temp_codon = codon(sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases self.codons.append(temp_codon) self.sequence = sequence #orfs are indexed by codons def __getitem__(self,index): return self.codons[index]

def find_mutations(self,other): refPos = 0 seqPos = 0 #print "Ref:",self #print "Other:",other all_mutations = [] for i in range(len(self)): #print self[i], " ", other[i], ' ', refPos, ' ', seqPos if self[i] != other[i]: if self[i] == '-': new_mutation = raw_mutation("insertion",refPos,seqPos) elif other[i] == '-': new_mutation = raw_mutation("deletion",refPos,seqPos) else: new_mutation = raw_mutation("point",refPos,seqPos) all_mutations.append(new_mutation) #print "Mutation!" print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location if self[i] != '-': refPos += 1 if other[i] != '-': seqPos += 1 return all_mutations def consolidate_mutations(raw_mutations): #I think this will have a bug for insertions before the first base of the refseq because #in every other case, ref_loc will refer to the base before the insertion consolidated_mutations =[] i = 0 while i < len(raw_mutations): #print "outer loop" current_type = raw_mutations[i].type if(current_type == "point"): ref_loc = raw_mutations[i].ref_location other_loc = raw_mutations[i].other_location new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc]) consolidated_mutations.append(new_mutation) i += 1 elif(current_type == "deletion") or (current_type == "insertion"): ref_start = raw_mutations[i].ref_location other_start = raw_mutations[i].other_location ref_end = raw_mutations[i].ref_location other_end = raw_mutations[i].other_location i += 1 #It would have been nice to have a do while here... while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type): #hopefully it exits after the first condition so it doesn't go past array length if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)): new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end]) consolidated_mutations.append(new_mutation) print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end break elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start): other_end += 1 i += 1 #print "extending insertion" elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start): ref_end += 1 i += 1 #print "extending deletion" else: print "I shouldn't be here!!!" break

return consolidated_mutations def findORFS(sequence, startpos=0): all_orfs = [] start = re.compile('ATG') stop = re.compile('(TAA|TGA|TAG)') all_starts = start.finditer(sequence) all_stops = stop.finditer(sequence) all_starts_list = [] for match in all_starts: all_starts_list.append(match.span) all_stops_list = [] for stops in all_stops: all_stops_list.append(stops.span) #print stops.span for start in all_starts_list: found = 0 for stop in all_stops_list: diff = (stop[0]-start[0]) if ((diff>0) and ((diff%3) == 0)): print "orf at:", start[0]," ",stop[1] all_orfs.append((start[0],stop[1])) found = 1 break if found ==0: all_orfs.append((start[0],-1)) return all_orfs

teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT" teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG" print teststring teststring = re.sub("\s+", "", teststring) teststring2 = re.sub("\s+", "", teststring2) allspans = findORFS(teststring) allspans2 = findORFS(teststring2) allorfs = [] allorfs2 = [] print "spans" print allspans for i in allspans: x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int y = i[1] temp_orf = orf(teststring[x:y]) allorfs.append(temp_orf) for i in allspans2: x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int y = i[1] temp_orf = orf(teststring2[x:y]) allorfs2.append(temp_orf)
 * 1) Main Program starts here

temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w") temp_file.write(">gi|23509994|ref|NC_004318.1| Plasmodium\n ") temp_file.write(teststring) temp_file.write("\n\n") temp_file.write(">gi|239809994|ref|NC_004318.1| Plasmodium\n ") temp_file.write(teststring2) temp_file.close cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) all_records = alignment.get_all_seqs print alignment mutations = find_mutations(all_records[0].seq,all_records[1].seq) all_real = consolidate_mutations (mutations) for i in all_real: print i.type, "at ref:", i.ref_span, " other ", i.other_span
 * 1) print mutations