Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-3-2
From OpenWetWare
Jump to navigationJump to search
The basic functions now all work: I just need to format the output and tidy it up. The object containment wasn't as tight as I originally hoped. I think perhaps the most logical all-encompasing objct would have been a comparison between a reference sequence and a mutant.
#find all orfs within the sequences import re from Bio import Transcribe transcriber = Transcribe.unambiguous_transcriber from Bio import Translate from Bio.Alphabet import IUPAC from Bio.Seq import Seq import os from Bio import Clustalw standard_translator = Translate.unambiguous_dna_by_id[1] class codon: def __init__(self, sequence=""): self.sequence = sequence self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna)) self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0] class raw_mutation: def __init__(self,type="",ref_location=0,other_location=0, original="", mutant=""): self.type = type self.ref_location=ref_location self.other_location = other_location self.original = original self.mutant = mutant class mutation: def __init__(self,type="",ref_span=[0,0],other_span=[0,0],original="", mutant=""): self.type = type self.ref_span = ref_span self.other_span = other_span self.original = original self.mutant = mutant class orf: #On initiation, the orf is stored as a list of codons. If the orf has no stop, any excess bases will #be ignored on the conversion to codons def __init__(self, source="", source_span=[]): self.source = source self.codons = [] self.sequence = source[source_span[0]:source_span[1]] self.source_span = source_span for i in range(len(self.sequence)/3): temp_codon = codon(self.sequence[i*3:3+(i*3)]) #this algorithm of seperating into codons ignores any excess bases self.codons.append(temp_codon) #orfs are indexed by codons def __getitem__(self,index): return self.codons[index] def find_mutations(self,other): refPos = 0 seqPos = 0 #print "Ref:",self #print "Other:",other all_mutations = [] for i in range(len(self)): #print self[i], " ", other[i], ' ', refPos, ' ', seqPos if self[i] != other[i]: if self[i] == '-': new_mutation = raw_mutation("insertion",refPos,seqPos) elif other[i] == '-': new_mutation = raw_mutation("deletion",refPos,seqPos) else: new_mutation = raw_mutation("point",refPos,seqPos, self[refPos],other[seqPos]) all_mutations.append(new_mutation) #print "Mutation!" print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location if self[i] != '-': refPos += 1 if other[i] != '-': seqPos += 1 return all_mutations def consolidate_mutations(raw_mutations): #I think this will have a bug for insertions before the first base of the refseq because #in every other case, ref_loc will refer to the base before the insertion consolidated_mutations =[] i = 0 while i < len(raw_mutations): #print "outer loop" current_type = raw_mutations[i].type if(current_type == "point"): ref_loc = raw_mutations[i].ref_location other_loc = raw_mutations[i].other_location new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc],raw_mutations[i].original, raw_mutations[i].mutant) consolidated_mutations.append(new_mutation) i += 1 elif(current_type == "deletion") or (current_type == "insertion"): ref_start = raw_mutations[i].ref_location other_start = raw_mutations[i].other_location ref_end = raw_mutations[i].ref_location other_end = raw_mutations[i].other_location i += 1 #It would have been nice to have a do while here... while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type): #hopefully it exits after the first condition so it doesn't go past array length if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)): new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end]) consolidated_mutations.append(new_mutation) print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end break elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start): other_end += 1 i += 1 #print "extending insertion" elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start): ref_end += 1 i += 1 #print "extending deletion" else: print "I shouldn't be here!!!" break return consolidated_mutations def findORFS(sequence, startpos=0): print sequence all_orfs = [] start = re.compile('ATG') stop = re.compile('(TAA|TGA|TAG)') all_starts = start.finditer(sequence) all_stops = stop.finditer(sequence) all_starts_list = [] for match in all_starts: all_starts_list.append(match.span()) all_stops_list = [] for stops in all_stops: all_stops_list.append(stops.span()) #print stops.span() for start in all_starts_list: found = 0 for stop in all_stops_list: diff = (stop[0]-start[0]) if ((diff>0) and ((diff%3) == 0)): print "orf at:", start[0]," ",stop[1] temp_orf = orf(sequence,(start[0],stop[1])) all_orfs.append(temp_orf) #all_orfs.append((start[0],stop[1])) found = 1 break if found ==0: print "open orf" temp_orf = orf(sequence,(start[0],None)) all_orfs.append(temp_orf) #all_orfs.append((start[0],None)) return all_orfs def is_silent(mutation, ref_orfs): mut_pos = mutation.ref_span[0] #ref_span[0] and [1] are the same print "For ", mutation.type, "at position", mutation.ref_span[0], ':' for i in ref_orfs: if (mut_pos>=i.source_span[0] and mut_pos<=i.source_span[1]): #does the mutation affect the orf print "Point affects ", i.sequence ref_codon_position = (mut_pos - i.source_span[0])/3 base_position = (mut_pos-i.source_span[0])%3 original_aa = i[ref_codon_position].protein mut_codon = i[ref_codon_position].sequence print "Original Codon:", mut_codon mut_codon = list(mut_codon) mut_codon[base_position] = mutation.mutant mut_codon = "".join(mut_codon) print "Mutant Codon",mut_codon mutant_aa = standard_translator.translate(Seq(mut_codon, IUPAC.unambiguous_dna))[0] if (mutant_aa == original_aa): print "Silent" else: print "not silent" def is_frameshift(mutation, all_orfs): if ((mutation.ref_span[1]-mutation.ref_span[0])%3 !=0) or ((mutation.other_span[1]-mutation.other_span[0])%3 !=0): print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is a frameshift" else: print mutation.type, "from", mutation.ref_span[0]," to", mutation.ref_span[1], "is in frame" #Main Program starts here teststring = " A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT" teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG" print teststring teststring = re.sub("\s+", "", teststring) teststring2 = re.sub("\s+", "", teststring2) allorfs = findORFS(teststring) allorfs2 = findORFS(teststring2) #allorfs = [] #allorfs2 = [] #print "spans" #print allspans2 #for i in allspans: # x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int # y = i[1] # temp_orf = orf(teststring,i) # allorfs.append(temp_orf) print "orfs2" for i in allorfs2: # x = i[0] #I don't know why it won't let me use i[0] and i[1] directly as an int # y = i[1] # temp_orf = orf(teststring2,i) # allorfs2.append(temp_orf) print i.sequence temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w") temp_file.write(">gi|23509994|ref|NC_004318.1| Plasmodium\n ") temp_file.write(teststring) temp_file.write("\n\n") temp_file.write(">gi|239809994|ref|NC_004318.1| Plasmodium\n ") temp_file.write(teststring2) temp_file.close() cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) all_records = alignment.get_all_seqs() print alignment mutations = find_mutations(all_records[0].seq,all_records[1].seq) #print mutations all_real = consolidate_mutations (mutations) for i in all_real: print i.type, "at ref:", i.ref_span, " other ", i.other_span for i in all_real: if (i.type == "point"): is_silent(i, allorfs) else: is_frameshift(i,allorfs)