Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-2-27
From OpenWetWare
Jump to navigationJump to search
# Katie Fifer
# asst3 - sequence comparison
#! /usr/bin/env python
import os
import re
from Bio import Clustalw
from Bio import Translate
from Bio.Align import AlignInfo
from Bio.Alphabet import IUPAC
from sets import Set
from sys import *
from Bio.Seq import translate
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'Apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
# generate simple consensus sequence
summary_align = AlignInfo.SummaryInfo(alignment)
consensus = summary_align.dumb_consensus()
print "Consensus: ", consensus.tostring()
# Single nucleotide analysis
for i in range(alignment.get_alignment_length()):
col = alignment.get_column(i)
s = Set() # creat a new set
for c in range(len(col)):
s.add(col[c]) # add each column element to the set
if len(s) > 1: # multiple elements in s indicate a mismatch
# print i, col
# determine the type of mutation
# see if it's a deletion
p = re.compile("\w-+\w")
m = p.match(col)
if m:
print 'Deletion at %d: %s' % (i, col)
# see if it's a point mutation
else:
print 'Point Mutation at %d: %s' %(i, col)
# Codon analysis: figure out what the codons are and compare
# them. note any protein changes (not totally finished).
# set up a list of lists of all the codons of each sequence.
big_list = []
for seq in alignment.get_all_seqs():
seq_codons = []
index = 0
num_codons = ((alignment.get_alignment_length()) / 3)
for j in range(num_codons):
new_codon = ''.join([seq.seq[index + i] for i in range(3)])
index = index + 3
seq_codons.append(new_codon)
big_list.append(seq_codons)
# using the big list that was just generated, do codon comparison and
# print out any that are different.
for i in range(len(big_list[0])):
curr_codon = (big_list[0])[i]
curr_list = []
for j in range(len(big_list)):
# make the list of the codon's we're comparing at the
# moment. This may seem like wasted extra work for only
# comparing a few codons, but we'll see that with filter
# (below) we'll easily be able to pick out the mismatches from
# a variable number of sequence comparisons.
curr_list.append((big_list[j])[i])
new_list = filter(lambda x: x != curr_codon, curr_list)
# if the codons ended up being different, print out what the
# different proteins are
if(new_list):
print "a mismatch! Should have been %s. Instead is %s" % (curr_codon, new_list)
# Figure out what the change was in terms of proteins.
# Figure out how significant this change is.
# Note: map would be way sweeter here, but whatevs, it doesn't
# seem to work right.
mutated_proteins = []
for new_codon in new_list:
mutated_proteins.append(translate(new_codon))
print "Protein Change: %s became %s" % (translate(curr_codon), mutated_proteins)
# Notes about implementation:
# 1. The next step will be to combine this
# codon analysis with a protein library so that an analysis of wether
# mutations are silent or not can be done.
# 2. I'm not sure this is the
# best implementation in that when the alignment did something like
# place '-' as a placeholder to get the sequence to line up better, i
# left that and didn't actually treat it like it could have caused a
# frameshift. it would be pretty straightforward to make it a
# frameshift, though (just by not including it when making the codon
# list) and the rest of the implementation can stay the same.
# Helper function to determine ORF (reading frame). will return a
# list with two numbers - the start nucleotide number and the length of
# the open reading frame (number of nucleotides).
def find_frame(seq):
# try it starting with the first nucleotide, then with the second,
# then with the third. return whichever is found first. (this
# could be improved later.
start = 'AUG'
stop = ['UAA', 'UAG', 'UGA']
start_num = -1
# this will do all three trials in order (changing the seq by one
# each time.
chopped = []
for i in range(2):
# chopped will be a list in which each element is a codon in
# the list.
print "len chopped: ", len(chopped)
#FIX THIS HERE
for counter in range((len(seq) / 3) - (len(chopped)%3)):
print "Counter: ", counter
print "Length: ", ((len(seq) / 3) - (len(chopped)%3))
chopped = split_len(seq, 3)[i:]
return chopped
# compare with the start codon. note: i is the offset so
# that we return the correct numbers based on the
# numbering of the full sequence.
#if chopped[counter] is 'AUG':
#start_num = counter + i
#if (start_num != -1) and ((chopped[counter] == stop[0]) or (chopped[counter] == stop[1]) or (chopped[counter] == stop[2])):
#return [start_num, (counter + i) - (start_num + 1)]
def split_len(seq, length):
return [seq[i:i+length] for i in range(0, len(seq), length)]
print split_len('ABCABCABCABC', 3)
word = 'hello'
print word[2:4]
print translate('AUG')
list = find_frame('AUGATGUAG')
print list
The second half of this has some issues but it's getting there... i need to go back and put in some fixes so it all works together nicely. Most importantly, I need to figure out how to hook it up with a library that can tell you info about specific proteins.