Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-20

The script:


 * 1) !/usr/bin/env python

import os from Bio import Clustalw, Seq, SeqRecord, SubsMat from Bio.Align import AlignInfo from Bio.Align.FormatConvert import FormatConverter


 * 1) Do nucleotide alignment

print "Preparing initial alignment data" print '-' * 40 cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline)

summary = AlignInfo.SummaryInfo(alignment)
 * 1) consensus = summary.dumb_consensus
 * 2) pssmatrix = summary.pos_specific_score_matrix(consensus)


 * 1) Try protein alignment

print "Preparing protein data" print '-' * 40 prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w') for seq in alignment.get_all_seqs: s = seq.seq.tostring # Translate! p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another # Group them all pqr = p	pqr.extend(q) pqr.extend(r) # Find start codon for each def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr) # Find longest of them def max(x, y): if len(x) < len(y): return y		else: return x	p_max = reduce(max, p_trimmed) # Create file prot.write('>') prot.write(seq.description) prot.write('\n') prot.write(p_max) prot.write('\n') prot.close prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta')) prot_cline.set_output('.test_prot.aln') prot_alignment = Clustalw.do_alignment(prot_cline) os.remove(os.path.join(os.curdir, '.apoe_prot.fasta')) os.remove(os.path.join(os.curdir, '.test_prot.aln'))


 * 1) Use clumsy algo to find occurrences of deletions, insertions, rearrangements

seq_copy = alignment.get_all_seqs ref_seq = seq_copy.pop(0) # pop out reference sequence (the first one) print "Comparing against", ref_seq.description print '-' * 40 counter = 0 # clumsy counter...to be used later for seq in seq_copy: counter = counter + 1 # Write a temporary file for one-on-one alignment s = seq.seq.tostring working_file = os.path.join(os.curdir, '.apoe_working.fasta') temp = open(working_file, 'w') temp.write('>') temp.write(ref_seq.description) temp.write('\n') temp.write(ref_seq.seq.tostring) temp.write('\n') temp.write('>') temp.write(seq.description) temp.write('\n') temp.write(s) temp.close # Do the aligning cline_temp = Clustalw.MultipleAlignCL(working_file) cline_temp.set_output('.test_working.aln') alignment_temp = Clustalw.do_alignment(cline_temp) # Analyse it	realigned_ref = alignment_temp.get_seq_by_num(0).tostring realigned_test = alignment_temp.get_seq_by_num(1).tostring summary_temp = AlignInfo.SummaryInfo(alignment_temp) consensus_temp = summary_temp.dumb_consensus.tostring # Find dashes in the reference sequence: those are the insertions insertions_num = 0 insertions_find = realigned_ref.find('-', 0) insertions_count = dict(A=0, C=0, G=0, T=0) # start a count of which nucleotides are inserted most while not insertions_find == -1: insertions_num = insertions_num + 1 insertions_find = realigned_ref.find('-', insertions_find + 1) # Increment the appropriate nucleotide count by 1 by looking at what's in the test sequence insertions_count[realigned_test[insertions_find]] = insertions_count[realigned_test[insertions_find]] + 1 # Find dashes in the test sequence: those are the deletions deletions_num = 0 deletions_find = realigned_test.find('-', 0) deletions_count = dict(A=0, C=0, G=0, T=0) while not deletions_find == -1: deletions_num = deletions_num + 1 deletions_find = realigned_test.find('-', deletions_find + 1) deletions_count[realigned_ref[deletions_find]] = deletions_count[realigned_ref[deletions_find]] + 1 # Find point mutations in consensus sequence (X's)	point_mutations_num = consensus_temp.count('X') print seq.description print "Number of inserted bases:", insertions_num, print str(insertions_count) print "Number of deleted bases:", deletions_num, print str(deletions_count) print "Number of point mutations:", point_mutations_num # Find any frameshifts # Replace all occurrences of '---' from both reference and test sequences processed_ref = realigned_ref.replace('---', '***').rstrip('-') processed_test = realigned_test.replace('---', '***').rstrip('-') # Find all occurrences of '--' still left ref_frameshift_count = processed_ref.count('--', 0) test_frameshift_count = processed_test.count('--', 0) # Replace these reprocessed_ref = processed_ref.replace('--', '**') # it's like reprocessed meat! reprocessed_test = processed_test.replace('--', '**') # Find all remaining occurrences of '-' still left ref_frameshift_count = ref_frameshift_count + reprocessed_ref.count('-', 0) test_frameshift_count = test_frameshift_count + reprocessed_test.count('-', 0) total_frameshifts = ref_frameshift_count + test_frameshift_count print "Possible frameshifts:", total_frameshifts # Do more aligning, but this time with proteins! ref_prot_seq = prot_alignment.get_seq_by_num(0).tostring test_prot_seq = prot_alignment.get_seq_by_num(counter).tostring # Write a temporary file for one-on-one alignment working_prot_file = os.path.join(os.curdir, '.apoe_working_prot.fasta') temp2 = open(working_prot_file, 'w') temp2.write('>') temp2.write(ref_seq.description) temp2.write('\n') temp2.write(ref_prot_seq) temp2.write('\n') temp2.write('>') temp2.write(seq.description) temp2.write('\n') temp2.write(test_prot_seq) temp2.close # Do actual aligning cline_temp2 = Clustalw.MultipleAlignCL(working_prot_file) cline_temp2.set_output('.test_working_prot.aln') alignment_prot_temp = Clustalw.do_alignment(cline_temp2) print '-' * 40 os.remove(working_file) os.remove(os.path.join(os.curdir, '.test_working.aln')) os.remove(working_prot_file) os.remove(os.path.join(os.curdir, '.test_working_prot.aln'))

print "Xiaodi Wu, Biophysics 101, 2007-02-20\n"
 * 1) converter = FormatConverter(alignment)

File output (raw alignment, test.aln):

CLUSTAL W (1.83) multiple sequence alignment

gi|178850|gb|K00396.1|HUMAPOE3     CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC lcl|HUMAPOE3_with_deletion         CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC lcl|HUMAPOE3_with_mutation         CGCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGACTGGCCAATCAC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG lcl|HUMAPOE3_with_deletion         AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG lcl|HUMAPOE3_with_mutation         AGGCAGGAAGATGAAGGTTCTGTGGGCTGCGTTGCTGGTCACATTCCTGG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC lcl|HUMAPOE3_with_deletion         CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC lcl|HUMAPOE3_with_mutation         CAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC lcl|HUMAPOE3_with_deletion         GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC lcl|HUMAPOE3_with_mutation         GAGCTGCGCCAGCAGACCGAGTGGCAGAGCGGCCAGCGCTGGGAACTGGC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC lcl|HUMAPOE3_with_deletion         ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC lcl|HUMAPOE3_with_mutation         ACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGACACTGTCTGAGC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG lcl|HUMAPOE3_with_deletion         AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG lcl|HUMAPOE3_with_mutation         AGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGGCG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA lcl|HUMAPOE3_with_deletion         CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA lcl|HUMAPOE3_with_mutation         CTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGA **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG lcl|HUMAPOE3_with_deletion         GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG lcl|HUMAPOE3_with_mutation         GGAACAACTGACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC lcl|HUMAPOE3_with_deletion         AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC lcl|HUMAPOE3_with_mutation         AGCTGCAGGCGGCGCAGGCCCGGCTGGGCGCGGACATGGAGGACGTGTGC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG lcl|HUMAPOE3_with_deletion         GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG lcl|HUMAPOE3_with_mutation         GGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGAG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA lcl|HUMAPOE3_with_deletion         CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA lcl|HUMAPOE3_with_mutation         CACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTA **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC lcl|HUMAPOE3_with_deletion         AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC lcl|HUMAPOE3_with_mutation         AGCGGCTCCTCCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTAC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA lcl|HUMAPOE3_with_deletion         CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA lcl|HUMAPOE3_with_mutation         CAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGCGCCATCCGCGA **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG lcl|HUMAPOE3_with_deletion         GCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG lcl|HUMAPOE3_with_mutation         GCGCCTGGAGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGG ******** *****************************************

gi|178850|gb|K00396.1|HUMAPOE3     GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG lcl|HUMAPOE3_with_deletion         GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG lcl|HUMAPOE3_with_mutation         GCTCCCTGGCCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT lcl|HUMAPOE3_with_deletion         CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT lcl|HUMAPOE3_with_mutation         CGGCTGCGCGCGCGGATGGAGGAGATGGGCAGCCGGACCCGCGACCGCCT **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC lcl|HUMAPOE3_with_deletion         GG-GTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC lcl|HUMAPOE3_with_mutation         GGCCGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAGC **    *******************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG lcl|HUMAPOE3_with_deletion         AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG lcl|HUMAPOE3_with_mutation         AGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAG **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT lcl|HUMAPOE3_with_deletion         AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT lcl|HUMAPOE3_with_mutation         AGCTGGTTCGAGCCCCTGGTGGAAGACATGCAGCGCCAGTGGGCCGGGCT **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA lcl|HUMAPOE3_with_deletion         GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA lcl|HUMAPOE3_with_mutation         GGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGCCCCTGTGCCCA **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC lcl|HUMAPOE3_with_deletion         GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC lcl|HUMAPOE3_with_mutation         GCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC lcl|HUMAPOE3_with_deletion         CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC lcl|HUMAPOE3_with_mutation         CCGTGCCTCCTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCC **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT lcl|HUMAPOE3_with_deletion         CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT lcl|HUMAPOE3_with_mutation         CCAGCCGTCCTCCTGGGGTGGACCCTAGTTTAATAAAGATTCACCAAGTT **************************************************

gi|178850|gb|K00396.1|HUMAPOE3     TCACGC lcl|HUMAPOE3_with_deletion         TCACGC lcl|HUMAPOE3_with_mutation         TCACGC ******

Command line output:

Preparing initial alignment data

Preparing protein data

Comparing against gi|178850|gb|K00396.1|HUMAPOE3

lcl|HUMAPOE3_with_deletion Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of deleted bases: 5 {'A': 1, 'C': 2, 'T': 0, 'G': 2} Number of point mutations: 0 Possible frameshifts: 1

lcl|HUMAPOE3_with_mutation Number of inserted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of deleted bases: 0 {'A': 0, 'C': 0, 'T': 0, 'G': 0} Number of point mutations: 2 Possible frameshifts: 0

Xiaodi Wu, Biophysics 101, 2007-02-20

Still to do: code a (fairly repetitive and mundane) part to examine the protein sequences that arise, and compare with the changes observed in nucleotide sequence.