Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-2-27
# Katie Fifer # asst3 - sequence comparison #! /usr/bin/env python import os import re from Bio import Clustalw from Bio import Translate from Bio.Align import AlignInfo from Bio.Alphabet import IUPAC from sets import Set from sys import * from Bio.Seq import translate cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'Apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) # generate simple consensus sequence summary_align = AlignInfo.SummaryInfo(alignment) consensus = summary_align.dumb_consensus() print "Consensus: ", consensus.tostring() # Single nucleotide analysis for i in range(alignment.get_alignment_length()): col = alignment.get_column(i) s = Set() # creat a new set for c in range(len(col)): s.add(col[c]) # add each column element to the set if len(s) > 1: # multiple elements in s indicate a mismatch # print i, col # determine the type of mutation # see if it's a deletion p = re.compile("\w-+\w") m = p.match(col) if m: print 'Deletion at %d: %s' % (i, col) # see if it's a point mutation else: print 'Point Mutation at %d: %s' %(i, col) # Codon analysis: figure out what the codons are and compare # them. note any protein changes (not totally finished). # set up a list of lists of all the codons of each sequence. big_list =  for seq in alignment.get_all_seqs(): seq_codons =  index = 0 num_codons = ((alignment.get_alignment_length()) / 3) for j in range(num_codons): new_codon = ''.join([seq.seq[index + i] for i in range(3)]) index = index + 3 seq_codons.append(new_codon) big_list.append(seq_codons) # using the big list that was just generated, do codon comparison and # print out any that are different. for i in range(len(big_list)): curr_codon = (big_list)[i] curr_list =  for j in range(len(big_list)): # make the list of the codon's we're comparing at the # moment. This may seem like wasted extra work for only # comparing a few codons, but we'll see that with filter # (below) we'll easily be able to pick out the mismatches from # a variable number of sequence comparisons. curr_list.append((big_list[j])[i]) new_list = filter(lambda x: x != curr_codon, curr_list) # if the codons ended up being different, print out what the # different proteins are if(new_list): print "a mismatch! Should have been %s. Instead is %s" % (curr_codon, new_list) # Figure out what the change was in terms of proteins. # Figure out how significant this change is. # Note: map would be way sweeter here, but whatevs, it doesn't # seem to work right. mutated_proteins =  for new_codon in new_list: mutated_proteins.append(translate(new_codon)) print "Protein Change: %s became %s" % (translate(curr_codon), mutated_proteins) # Notes about implementation: # 1. The next step will be to combine this # codon analysis with a protein library so that an analysis of wether # mutations are silent or not can be done. # 2. I'm not sure this is the # best implementation in that when the alignment did something like # place '-' as a placeholder to get the sequence to line up better, i # left that and didn't actually treat it like it could have caused a # frameshift. it would be pretty straightforward to make it a # frameshift, though (just by not including it when making the codon # list) and the rest of the implementation can stay the same. # Helper function to determine ORF (reading frame). will return a # list with two numbers - the start nucleotide number and the length of # the open reading frame (number of nucleotides). def find_frame(seq): # try it starting with the first nucleotide, then with the second, # then with the third. return whichever is found first. (this # could be improved later. start = 'AUG' stop = ['UAA', 'UAG', 'UGA'] start_num = -1 # this will do all three trials in order (changing the seq by one # each time. chopped =  for i in range(2): # chopped will be a list in which each element is a codon in # the list. print "len chopped: ", len(chopped) #FIX THIS HERE for counter in range((len(seq) / 3) - (len(chopped)%3)): print "Counter: ", counter print "Length: ", ((len(seq) / 3) - (len(chopped)%3)) chopped = split_len(seq, 3)[i:] return chopped # compare with the start codon. note: i is the offset so # that we return the correct numbers based on the # numbering of the full sequence. #if chopped[counter] is 'AUG': #start_num = counter + i #if (start_num != -1) and ((chopped[counter] == stop) or (chopped[counter] == stop) or (chopped[counter] == stop)): #return [start_num, (counter + i) - (start_num + 1)] def split_len(seq, length): return [seq[i:i+length] for i in range(0, len(seq), length)] print split_len('ABCABCABCABC', 3) word = 'hello' print word[2:4] print translate('AUG') list = find_frame('AUGATGUAG') print list
The second half of this has some issues but it's getting there... i need to go back and put in some fixes so it all works together nicely. Most importantly, I need to figure out how to hook it up with a library that can tell you info about specific proteins.