Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20
From OpenWetWare
Jump to navigationJump to search
The script:
#!/usr/bin/env python
import os
from Bio import Clustalw, Seq, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter
# Do nucleotide alignment
print "Preparing initial alignment data"
print '-' * 40
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary = AlignInfo.SummaryInfo(alignment)
# consensus = summary.dumb_consensus()
# pssmatrix = summary.pos_specific_score_matrix(consensus)
# Try protein alignment
print "Preparing protein data"
print '-' * 40
prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w')
for seq in alignment.get_all_seqs():
s = seq.seq.tostring()
# Translate!
p = Seq.translate(s).split('*') # Split a stop codon
q = Seq.translate('C' + s).split('*') # Try a different frame
r = Seq.translate('CC' + s).split('*') # Try another
# Group them all
pqr = p
pqr.extend(q)
pqr.extend(r)
# Find start codon for each
def trim(x): return x[x.find('M'):]
p_trimmed = map(trim, pqr)
# Find longest of them
def max(x, y):
if len(x) < len(y): return y
else: return x
p_max = reduce(max, p_trimmed)
# Create file
prot.write('>')
prot.write(seq.description)
prot.write('\n')
prot.write(p_max)
prot.write('\n')
prot.close()
prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta'))
prot_cline.set_output('.test_prot.aln')
prot_alignment = Clustalw.do_alignment(prot_cline)
os.remove(os.path.join(os.curdir, '.apoe_prot.fasta'))
os.remove(os.path.join(os.curdir, '.test_prot.aln'))
# Use clumsy algo to find occurrences of deletions, insertions, rearrangements
seq_copy = alignment.get_all_seqs()
ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one)
print "Comparing against", ref_seq.description
print '-' * 40
counter = 0 # clumsy counter...to be used later
for seq in seq_copy:
counter = counter + 1
# Write a temporary file for one-on-one alignment
s = seq.seq.tostring()
working_file = os.path.join(os.curdir, '.apoe_working.fasta')
temp = open(working_file, 'w')
temp.write('>')
temp.write(ref_seq.description)
temp.write('\n')
temp.write(ref_seq.seq.tostring())
temp.write('\n')
temp.write('>')
temp.write(seq.description)
temp.write('\n')
temp.write(s)
temp.close()
# Do the aligning
cline_temp = Clustalw.MultipleAlignCL(working_file)
cline_temp.set_output('.test_working.aln')
alignment_temp = Clustalw.do_alignment(cline_temp)
# Analyse it
realigned_ref = alignment_temp.get_seq_by_num(0).tostring()
realigned_test = alignment_temp.get_seq_by_num(1).tostring()
summary_temp = AlignInfo.SummaryInfo(alignment_temp)
consensus_temp = summary_temp.dumb_consensus().tostring()
# Find dashes in the reference sequence: those are the insertions
insertions_num = 0
insertions_find = realigned_ref.find('-', 0)
insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most
while not insertions_find == -1:
insertions_num = insertions_num + 1
insertions_find = realigned_ref.find('-', insertions_find + 1)
# Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence
insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1
# Find dashes in the test sequence: those are the deletions
deletions_num = 0
deletions_find = realigned_test.find('-', 0)
deletions_count = dict(A=0, C=0, G=0, T=0)
while not deletions_find == -1:
deletions_num = deletions_num + 1
deletions_find = realigned_test.find('-', deletions_find + 1)
deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1
# Find point mutations in consensus sequence (X's)
point_mutations_num = consensus_temp.count('X')
print seq.description
print "Number of inserted bases:", insertions_num,
print str(insertions_count)
print "Number of deleted bases:", deletions_num,
print str(deletions_count)
print "Number of point mutations:", point_mutations_num
# Find any frameshifts
# Replace all occurrences of '---' from both reference and test sequences
processed_ref = realigned_ref.replace('---', '***').rstrip('-')
processed_test = realigned_test.replace('---', '***').rstrip('-')
# Find all occurrences of '--' still left
ref_frameshift_count = processed_ref.count('--', 0)
test_frameshift_count = processed_test.count('--', 0)
# Replace these
reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat!
reprocessed_test = processed_test.replace('--', '**')
# Find all remaining occurrences of '-' still left
ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0)
test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0)
total_frameshifts = ref_frameshift_count + test_frameshift_count
print "Possible frameshifts:", total_frameshifts
# Do more aligning, but this time with proteins!
ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring()
test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring()
# Write a temporary file for one-on-one alignment
working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta')
temp2 = open(working_prot_file, 'w')
temp2.write('>')
temp2.write(ref_seq.description)
temp2.write('\n')
temp2.write(ref_prot_seq)
temp2.write('\n')
temp2.write('>')
temp2.write(seq.description)
temp2.write('\n')
temp2.write(test_prot_seq)
temp2.close()
# Do actual aligning
cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file)
cline_temp2.set_output('.test_working_prot.aln')
alignment_prot_temp = Clustalw.do_alignment(cline_temp2)
print '-' * 40
os.remove(working_file)
os.remove(os.path.join(os.curdir, '.test_working.aln'))
os.remove(working_prot_file)
os.remove(os.path.join(os.curdir, '.test_working_prot.aln'))
# converter = FormatConverter(alignment)
print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"
File output (raw alignment, test.aln):
CLUSTAL W (1.83) multiple sequence alignment
gi|178850|gb|K00396.1|HUMAPOE3 CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
lcl|HUMAPOE3_with_deletion CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
lcl|HUMAPOE3_with_mutation CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
lcl|HUMAPOE3_with_deletion AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
lcl|HUMAPOE3_with_mutation AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
lcl|HUMAPOE3_with_deletion CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
lcl|HUMAPOE3_with_mutation CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
lcl|HUMAPOE3_with_deletion GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
lcl|HUMAPOE3_with_mutation GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
lcl|HUMAPOE3_with_deletion ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
lcl|HUMAPOE3_with_mutation ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
lcl|HUMAPOE3_with_deletion AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
lcl|HUMAPOE3_with_mutation AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
lcl|HUMAPOE3_with_deletion CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
lcl|HUMAPOE3_with_mutation CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
lcl|HUMAPOE3_with_deletion GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
lcl|HUMAPOE3_with_mutation GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
lcl|HUMAPOE3_with_deletion AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
lcl|HUMAPOE3_with_mutation AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
lcl|HUMAPOE3_with_deletion GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
lcl|HUMAPOE3_with_mutation GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
lcl|HUMAPOE3_with_deletion CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
lcl|HUMAPOE3_with_mutation CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
lcl|HUMAPOE3_with_deletion AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
lcl|HUMAPOE3_with_mutation AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
lcl|HUMAPOE3_with_deletion CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
lcl|HUMAPOE3_with_mutation CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
lcl|HUMAPOE3_with_deletion GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
lcl|HUMAPOE3_with_mutation GCGCCTGGAGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG
******** *****************************************
gi|178850|gb|K00396.1|HUMAPOE3 GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
lcl|HUMAPOE3_with_deletion GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
lcl|HUMAPOE3_with_mutation GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
lcl|HUMAPOE3_with_deletion CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
lcl|HUMAPOE3_with_mutation CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
lcl|HUMAPOE3_with_deletion GG-----GTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
lcl|HUMAPOE3_with_mutation GGCCGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC
** *******************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
lcl|HUMAPOE3_with_deletion AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
lcl|HUMAPOE3_with_mutation AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
lcl|HUMAPOE3_with_deletion AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
lcl|HUMAPOE3_with_mutation AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
lcl|HUMAPOE3_with_deletion GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
lcl|HUMAPOE3_with_mutation GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
lcl|HUMAPOE3_with_deletion GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
lcl|HUMAPOE3_with_mutation GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
lcl|HUMAPOE3_with_deletion CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
lcl|HUMAPOE3_with_mutation CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
lcl|HUMAPOE3_with_deletion CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
lcl|HUMAPOE3_with_mutation CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT
**************************************************
gi|178850|gb|K00396.1|HUMAPOE3 TCACGC
lcl|HUMAPOE3_with_deletion TCACGC
lcl|HUMAPOE3_with_mutation TCACGC
******
Command line output:
Preparing initial alignment data
----------------------------------------
Preparing protein data
----------------------------------------
Comparing against gi|178850|gb|K00396.1|HUMAPOE3
----------------------------------------
lcl|HUMAPOE3_with_deletion
Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of deleted bases: 5 {'A': 1, 'C': 2, 'T': 0, 'G': 2}
Number of point mutations: 0
Possible frameshifts: 1
----------------------------------------
lcl|HUMAPOE3_with_mutation
Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of deleted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0}
Number of point mutations: 2
Possible frameshifts: 0
----------------------------------------
Xiaodi Wu, Biophysics 101, 2007-02-20
Still to do: code a (fairly repetitive and mundane) part to examine the protein sequences that arise, and compare with the changes observed in nucleotide sequence.