Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-28

I've tried to design this program in a somehwat formulaic oo design. Clearly, I'm a bit rusty as it's taking alot longer than I thought it would. The program is still not yet complete, but at this point, it can compare two sequences, extracting insertions and deletions and it extracts orfs from individual sequences with translations built into the orf objects. All that remains is to take the detected mutation objects and map them to the orf objects to contextualize them...
#find all orfs within the sequences
import re
from Bio import Transcribe
transcriber = Transcribe.unambiguous_transcriber
from Bio import Translate
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
import os
from Bio import Clustalw
standard_translator = Translate.unambiguous_dna_by_id[1]

class codon:
    def  __init__(self, sequence=""):
        self.sequence = sequence
        self.mRNA = transcriber.transcribe(Seq(sequence, IUPAC.unambiguous_dna))
        self.protein = standard_translator.translate(Seq(sequence, IUPAC.unambiguous_dna))[0]

class raw_mutation:
    def __init__(self,type="",ref_location=0,other_location=0):
        self.type = type
        self.ref_location=ref_location
        self.other_location = other_location

class mutation:
    def __init__(self,type="",ref_span=[0,0],other_span=[0,0]):
        self.type = type
        self.ref_span = ref_span
        self.other_span = other_span
    
class orf:
    #On initiation, the orf is stored as a list of codons.  If the orf has no stop, any excess bases will
    #be ignored on the conversion to codons
    def __init__(self, sequence=""):
        self.codons = []
        for i in range(len(sequence)/3):
            temp_codon = codon(sequence[i*3:3+(i*3)])  #this algorithm of seperating into codons ignores any excess bases
            self.codons.append(temp_codon)
        self.sequence = sequence
        
    #orfs are indexed by codons
    def __getitem__(self,index):
        return self.codons[index]

   
def find_mutations(self,other):
    refPos = 0
    seqPos = 0
    #print "Ref:",self
    #print "Other:",other
    all_mutations = []
    for i in range(len(self)):
        #print self[i], " ", other[i], ' ', refPos, ' ', seqPos
        if self[i] != other[i]:
            if self[i] == '-':
                new_mutation = raw_mutation("insertion",refPos,seqPos)
            elif other[i] == '-':
                new_mutation = raw_mutation("deletion",refPos,seqPos)
            else:
                new_mutation = raw_mutation("point",refPos,seqPos)
            all_mutations.append(new_mutation)
            #print "Mutation!"
            print new_mutation.type," ",new_mutation.ref_location," ",new_mutation.other_location
             
        if self[i] != '-':
            refPos += 1
        if other[i] != '-':
            seqPos += 1
    return all_mutations
            
def consolidate_mutations(raw_mutations):
    #I think this will have a bug for insertions before the first base of the refseq because
    #in every other case, ref_loc will refer to the base before the insertion
    consolidated_mutations =[]
    i = 0
    while i < len(raw_mutations):
        #print "outer loop"
        current_type = raw_mutations[i].type
        if(current_type == "point"):
            ref_loc = raw_mutations[i].ref_location
            other_loc = raw_mutations[i].other_location
            new_mutation = mutation("point",[ref_loc,ref_loc],[other_loc,other_loc])
            consolidated_mutations.append(new_mutation)
            i += 1
        elif(current_type == "deletion") or (current_type == "insertion"):
            
            ref_start = raw_mutations[i].ref_location
            other_start = raw_mutations[i].other_location
            ref_end = raw_mutations[i].ref_location
            other_end = raw_mutations[i].other_location 
            i += 1
            #It would have been nice to have a do while here...
            while (i<=len(raw_mutations)): #and (raw_mutations[i].type == current_type):
                #hopefully it exits after the first condition so it doesn't go past array length
                if (i==len(raw_mutations)) or ((raw_mutations[i].ref_location!=ref_start) and (raw_mutations[i].other_location!=other_start)):
                    new_mutation = mutation(current_type,[ref_start,ref_end],[other_start,other_end])
                    consolidated_mutations.append(new_mutation)
                    print current_type, "found at ", "ref", ref_start, ",",ref_end," seq ", other_start, ",", other_end
                    break
                elif (current_type == "insertion"): #and (raw_mutations[i].ref_location==ref_start):
                    other_end += 1
                    i += 1
                    #print "extending insertion"
                elif (current_type == "deletion"): #and (raw_mutations[i].ref_location==other_start):
                    ref_end += 1
                    i += 1
                    #print "extending deletion"
                else:
                    print "I shouldn't be here!!!"
                    break

    return consolidated_mutations
        
                
    
        
def findORFS(sequence, startpos=0):
    all_orfs = []
    start = re.compile('ATG')
    stop = re.compile('(TAA|TGA|TAG)')
    all_starts = start.finditer(sequence)
    all_stops = stop.finditer(sequence)
    all_starts_list = []
    for match in all_starts:
        all_starts_list.append(match.span())
    all_stops_list = []
    for stops in all_stops:
        all_stops_list.append(stops.span())
        #print stops.span()
    for start in all_starts_list:
        found = 0
        for stop in all_stops_list:
            diff = (stop[0]-start[0])
            if ((diff>0) and ((diff%3) == 0)):
                print "orf at:", start[0]," ",stop[1]
                all_orfs.append((start[0],stop[1]))
                found = 1
                break
        if found ==0:
            all_orfs.append((start[0],-1))
        
    return all_orfs

#Main Program starts here          
teststring = "  A TGG GGG GGA ATG ATT AAC GTC GTT AAA GTA TGT TTT TT"
teststring2= "CGA ATG GGG GCG ATG ATT AAC C GTT AAA GTA TGT TTT TTG TAG"
print teststring
teststring = re.sub("\s+", "", teststring)
teststring2 = re.sub("\s+", "", teststring2)
allspans = findORFS(teststring)
allspans2 = findORFS(teststring2)
allorfs = []
allorfs2 = []
print "spans"
print allspans
for i in allspans:
    x = i[0]  #I don't know why it won't let me use i[0] and i[1] directly as an int
    y = i[1]
    temp_orf = orf(teststring[x:y])
    allorfs.append(temp_orf)
for i in allspans2:
    x = i[0]  #I don't know why it won't let me use i[0] and i[1] directly as an int
    y = i[1]
    temp_orf = orf(teststring2[x:y])
    allorfs2.append(temp_orf)

temp_file = open(os.path.join(os.curdir, 'temp.txt'),"w")
temp_file.write(">gi|23509994|ref|NC_004318.1| Plasmodium\n ")
temp_file.write(teststring)
temp_file.write("\n\n")
temp_file.write(">gi|239809994|ref|NC_004318.1| Plasmodium\n ")
temp_file.write(teststring2)
temp_file.close()
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'temp.txt'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
all_records = alignment.get_all_seqs()
print alignment
mutations = find_mutations(all_records[0].seq,all_records[1].seq)
#print mutations
all_real = consolidate_mutations (mutations)
for i in all_real:
    print i.type, "at ref:", i.ref_span, " other ", i.other_span
Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-28

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools