Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-27

The code...(a bit brute force in the end, but...)


 * 1) !/usr/bin/env python

import os, re from Bio import Clustalw, Seq, SeqRecord, SubsMat from Bio.Align import AlignInfo from Bio.Align.FormatConvert import FormatConverter

print '-' * 40


 * 1) (1) Do nucleotide alignment


 * 1) print "Preparing initial alignment data"
 * 2) print '-' * 40

cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta')) cline.set_output('test.aln') alignment = Clustalw.do_alignment(cline) summary = AlignInfo.SummaryInfo(alignment)


 * 1) consensus = summary.dumb_consensus
 * 2) pssmatrix = summary.pos_specific_score_matrix(consensus)


 * 1) (2) Make ourselves a copy of the reference sequence; very useful really

seq_copy = alignment.get_all_seqs ref_seq = seq_copy[0]


 * 1) TODO: ClustalW does something very nasty and reverses the order of inputted
 * 2) sequences on occasion: what is needed is an algorithm to compare the description
 * 3) fields of each of the sequences, finding their location in the original FASTA
 * 4) file and taking the topmost as the reference


 * 1) (3) Try orf determination on main sequence
 * 2)     all others will be presumed mutants so we don't have to go through this
 * 3)     analysis more than once; we of course need to presume that there are no
 * 4)     frameshift mutations before the beginning of the orf that will throw these
 * 5)     calculations off

print "ORF DETERMINATION" print '-' * 40

orf_start = -1 # for now orf_end = -1 # for now orf_frame = 0 # for now s = ref_seq.seq.tostring

p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another pc = Seq.translate(Seq.reverse_complement(s)).split('*') # And yet another qc = Seq.translate('C' + Seq.reverse_complement(s)).split('*') # And another rc = Seq.translate('CC' + Seq.reverse_complement(s)).split('*') # And another
 * 1) Translate!

pqr = Seq.translate(s).split('*') pqr.extend(q) pqr.extend(r) pqr.extend(pc) pqr.extend(qc) pqr.extend(rc)
 * 1) Group them all

def trim(x): if x.find('M') >= 0: return x[x.find('M'):] else: return '' p_trimmed = map(trim, pqr)
 * 1) Find start codon for each

def max(x, y): if len(x) < len(y): return y	else: return x p_max = reduce(max, p_trimmed)
 * 1) Find longest of them


 * 1) CREATE PRETTY OUTPUT (consumes extra cycles)
 * 2) and actually pinpoints the frame -- otherwise a pretty useless algo

px = reduce(max, map(trim, p)) px2 = Seq.translate(s).find(px) * 3 + 1 # Start codon position (add one b/c zero-based) px3 = len(px) * 3 # Length px4 = px2 + px3 # Stop codon position

if px3 == 0: # The pathetic minimal case that otherwise kills the algo px2 = 0 px4 = 0

print "5'->3' frame 1:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (px2, px4, px3), if px == p_max: print "(** using this one **)", orf_frame = 1 orf_start = px2 orf_end = px4 print "\n",

qx = reduce(max, map(trim, q)) qx2 = Seq.translate('C' + s).find(qx) * 3 # Start codon position qx3 = len(qx) * 3 # Length qx4 = qx2 + qx3 # Stop codon position

if qx3 == 0: # The pathetic minimal case that otherwise kills the algo qx2 = 0 qx4 = 0

print "5'->3' frame 2:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qx2, qx4, qx3), if qx == p_max: print "(** using this one **)", orf_frame = 2 orf_start = qx2 orf_end = qx4 print "\n",

rx = reduce(max, map(trim, r)) rx2 = Seq.translate('CC' + s).find(rx) * 3 - 1 # Start codon position rx3 = len(rx) * 3 # Length rx4 = rx2 + rx3 # Stop codon position

if rx3 == 0: # The pathetic minimal case that otherwise kills the algo rx2 = 0 rx4 = 0

print "5'->3' frame 3:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rx2, rx4, rx3), if rx == p_max: print "(** using this one **)", orf_frame = 3 orf_start = rx2 orf_end = rx4 print "\n",

pcx = reduce(max, map(trim, pc)) pcx2 = len(s) + 1 - (Seq.translate(Seq.reverse_complement(s)).find(pcx) * 3 + 1) # Start codon position pcx3 = len(pcx) * 3 # Length pcx4 = pcx2 - pcx3 # Stop codon position

if pcx3 == 0: # The pathetic minimal case that otherwise kills the algo pcx2 = 0 pcx4 = 0

print "3'->5' frame 1:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (pcx2, pcx4, pcx3), if pcx == p_max: print "(** using this one **)", orf_frame = 4 orf_start = pcx2 orf_end = pcx4 print "\n",

qcx = reduce(max, map(trim, qc)) qcx2 = len(s) + 1 - (Seq.translate('C' + Seq.reverse_complement(s)).find(qcx) * 3) # Start codon position qcx3 = len(qcx) * 3 # Length qcx4 = qcx2 - qcx3 # Stop codon position

if qcx3 == 0: # The pathetic minimal case that otherwise kills the algo qcx2 = 0 qcx4 = 0

print "3'->5' frame 2:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qcx2, qcx4, qcx3), if qcx == p_max: print "(** using this one **)", orf_frame = 5 orf_start = qcx2 orf_end = qcx4 print "\n",

rcx = reduce(max, map(trim, rc)) rcx2 = len(s) + 1 - (Seq.translate('CC' + Seq.reverse_complement(s)).find(rcx) * 3 - 1) # Start codon position rcx3 = len(rcx) * 3 # Length rcx4 = rcx2 - rcx3 # Stop codon position

if rcx3 == 0: # The pathetic minimal case that otherwise kills the algo rcx2 = 0 rcx4 = 0

print "3'->5' frame 3:" print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rcx2, rcx4, rcx3), if rcx == p_max: print "(** using this one **)", orf_frame = 6 orf_start = rcx2 orf_end = rcx4 print "\n",

print '-' * 40


 * 1) END OF USELESS OUTPUT-GENERATING SEGMENT


 * 1) We need to find a way to align all the sequences, including mutations
 * 2) our current orf_start and orf_end do not take into account insertions, etc.
 * 3) which we must examine before we can do any additional processing

s_copy = re.sub('[^\-]', '*', s) # well, the insertions we need to account for are all '-' orf_start_normalized = s_copy.replace('*', '^', orf_start).rfind('^') # zero-based orf start orf_end_normalized = s_copy.replace('*', '^', orf_end).rfind('^') # zero-based orf end

trimmed_seq_descriptions = [] trimmed_seqs = [] for seq in alignment.get_all_seqs: s = seq.seq.tostring trimmed_seq_descriptions.append(seq.description) if orf_start_normalized < orf_end_normalized: start = orf_start_normalized length = orf_end_normalized - orf_start_normalized + 3 # include stop codon trimmed_seqs.append(s[start:start + length]) else: temp = Seq.reverse_complement(s) start = len(s) - orf_start_normalized - 1 length = orf_start_normalized - orf_end_normalized + 3 # include stop codon trimmed_seqs.append(temp[start:start + length])
 * 1) Now, we need to trim everything down to this size


 * 1) (4) Work on the protein side of things (get an alignment in a hidden file)

prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w') counter = 0 # The dumb counter method of synching two lists raw_translated_seqs = [] for seq in trimmed_seqs: translated_seq = Seq.translate(seq.replace('-','')) raw_translated_seqs.append(translated_seq) # Create input file prot.write('>') prot.write(trimmed_seq_descriptions[counter]) prot.write('\n') prot.write(translated_seq) prot.write('\n') # Increment counter = counter + 1 prot.close

prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta')) prot_cline.set_output('.test_prot.aln') prot_alignment = Clustalw.do_alignment(prot_cline) os.remove(os.path.join(os.curdir, '.apoe_prot.fasta')) os.remove(os.path.join(os.curdir, '.test_prot.aln'))
 * 1) Figure out protein alignment


 * 1) (5) Use clumsy algo to find occurrences of deletions, insertions, rearrangements

seq_copy.pop(0) # pop out reference sequence (the first one) ref_trimmed = trimmed_seqs.pop(0) ref_trimmed_desc = trimmed_seq_descriptions.pop(0) print "COMPARING AGAINST", ref_seq.description print "[all nucleotide positions given relative to start codon]" print '-' * 40 counter = 0 # clumsy counter again...to be used later

for seq in seq_copy: # Write a temporary file for one-on-one alignment working_file = os.path.join(os.curdir, '.apoe_working.fasta') temp = open(working_file, 'w') temp.write('>') temp.write(ref_seq.description) temp.write('\n') temp.write(ref_trimmed) temp.write('\n') temp.write('>') temp.write(seq.description) temp.write('\n') temp.write(trimmed_seqs[counter]) temp.close # Do the aligning cline_temp = Clustalw.MultipleAlignCL(working_file) cline_temp.set_output('.test_working.aln') alignment_temp = Clustalw.do_alignment(cline_temp) # Analyse it	realigned_ref = alignment_temp.get_seq_by_num(0).tostring realigned_test = alignment_temp.get_seq_by_num(1).tostring summary_temp = AlignInfo.SummaryInfo(alignment_temp) consensus_temp = summary_temp.dumb_consensus.tostring # HEAVY DUTY OUTPUT TIME! print seq.description print '-' * 40 print '\n', # Find point mutations in consensus sequence (X's)	point_mutations_num = consensus_temp.count('X') point_mutations_find = consensus_temp.find('X',0) point_mutations_count = dict(A=0, C=0, G=0, T=0) while not point_mutations_find == -1: old = realigned_ref[point_mutations_find] new = realigned_test[point_mutations_find] # (Take into account any insertions or deletions for frame) index_adjusted = point_mutations_find - realigned_ref.count('-', 0, point_mutations_find) offset = index_adjusted % 3 # (Work backwards to reconstitute the codon) original_codon = realigned_ref[point_mutations_find] j = 1 for i in range(0, offset): char_temp = realigned_ref[point_mutations_find - j]			while not char_temp.isalpha: j = j + 1 char_temp = realigned_ref[point_mutations_find - j]			if char_temp.isalpha: original_codon = char_temp + original_codon j = j + 1 # (Work forwards to fill out the codon) j = 1 for i in range(0, 2 - offset): char_temp = realigned_ref[point_mutations_find + j]			while not char_temp.isalpha: j = j + 1 char_temp = realigned_ref[point_mutations_find + j]			if char_temp.isalpha: original_codon = original_codon + char_temp j = j + 1 # (Do the same for the mutated codon) # (Work backwards to reconstitute the codon) mutated_codon = realigned_test[point_mutations_find] j = 1 for i in range(0, offset): char_temp = realigned_test[point_mutations_find - j]			while not char_temp.isalpha: j = j + 1 char_temp = realigned_test[point_mutations_find - j]			if char_temp.isalpha: mutated_codon = char_temp + mutated_codon j = j + 1 # (Work forwards to fill out the codon) j = 1 for i in range(0, 2 - offset): char_temp = realigned_test[point_mutations_find + j]			while not char_temp.isalpha: j = j + 1 char_temp = realigned_test[point_mutations_find + j]			if char_temp.isalpha: mutated_codon = mutated_codon + char_temp j = j + 1 a = Seq.translate(original_codon) b = Seq.translate(mutated_codon) if a == b: print "Silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new), print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b)		else: print "Non-silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new), print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b) # Increment point_mutations_find = consensus_temp.find('X', point_mutations_find + 1)

print "*** Number of point mutations:", point_mutations_num, "***" print '\n', # Find dashes in the reference sequence: those are the insertions insertions_num = 0 insertions_find = realigned_ref.find('-', 0) while not insertions_find == -1: contig_count = 1 while realigned_ref[insertions_find + contig_count] == '-': contig_count = contig_count + 1 # (Take into account any earlier insertions) index_adjusted = insertions_find - realigned_ref.count('-', 0, insertions_find) offset = index_adjusted % 3 countback = insertions_find - offset - 6 translation_snippet_a = realigned_ref[countback:insertions_find + contig_count + 6] a = Seq.translate(translation_snippet_a.replace('-','')) translation_snippet_b = realigned_test[countback:insertions_find + contig_count + 6] b = Seq.translate(translation_snippet_b.replace('-','')) if contig_count % 3 == 0: print "Non-frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1) print "DNA Reference: %s" % (translation_snippet_a) print "Prot Reference: %s" % (a), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) print "DNA Result:    %s" % (translation_snippet_b) print "Prot Result:   %s" % (b), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) else: print "Frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1) print "DNA Reference: %s" % (translation_snippet_a) print "Prot Reference: %s" % (a), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) print "DNA Result:    %s" % (translation_snippet_b) print "Prot Result:   %s" % (b), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) insertions_find = realigned_ref.find('-', insertions_find + contig_count) print '\n',

# Find dashes in the test sequence: those are the deletions deletions_num = 0 deletions_find = realigned_test.find('-', 0) while not deletions_find == -1: contig_count = 1 while realigned_test[deletions_find + contig_count] == '-': contig_count = contig_count + 1 # (Take into account any earlier insertions) index_adjusted = deletions_find - realigned_ref.count('-', 0, deletions_find) offset = index_adjusted % 3 countback = deletions_find - offset - 6 translation_snippet_a = realigned_ref[countback:deletions_find + contig_count + 6] a = Seq.translate(translation_snippet_a.replace('-','')) translation_snippet_b = realigned_test[countback:deletions_find + contig_count + 6] b = Seq.translate(translation_snippet_b.replace('-','')) if contig_count % 3 == 0: print "Non-frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1) print "DNA Reference: %s" % (translation_snippet_a) print "Prot Reference: %s" % (a), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) print "DNA Result:    %s" % (translation_snippet_b) print "Prot Result:   %s" % (b), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) else: print "Frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1) print "DNA Reference: %s" % (translation_snippet_a) print "Prot Reference: %s" % (a), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) print "DNA Result:    %s" % (translation_snippet_b) print "Prot Result:   %s" % (b), if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame) else: print "(3'->5' frame %d)" % (orf_frame - 3) deletions_find = realigned_test.find('-', deletions_find + contig_count) print '\n', print '-' * 40 # Increment and clean up	counter = counter + 1 os.remove(working_file) os.remove(os.path.join(os.curdir, '.test_working.aln'))

print "Xiaodi Wu, Biophysics 101, 2007-02-27\n"

...and the output...

ORF DETERMINATION

5'->3' frame 1: start codon position: 61. stop codon position: 1012. transcript length: 951. (** using this one **) 5'->3' frame 2: start codon position: 105. stop codon position: 246. transcript length: 141. 5'->3' frame 3: start codon position: 566. stop codon position: 809. transcript length: 243. 3'->5' frame 1: start codon position: 94. stop codon position: 91. transcript length: 3. 3'->5' frame 2: start codon position: 929. stop codon position: 335. transcript length: 594. 3'->5' frame 3: start codon position: 0. stop codon position: 0. transcript length: 0.

COMPARING AGAINST gi|178850|gb|K00396.1|HUMAPOE3 [all nucleotide positions given relative to start codon]

lcl|HUMAPOE3_with_deletion


 * Number of point mutations: 0 ***

Frameshift deletion found: 5 base deletion, position 743 to 748 DNA Reference: CGCCTGGACGAGGTGAAG Prot Reference: RLDEVK (5'->3' frame 1) DNA Result:    CGCCTGG-GTGAAG Prot Result:   RLGE (5'->3' frame 1)

lcl|HUMAPOE3_with_mutation

Non-silent point mutation found: G599A. Amino acid result: G200E. Non-silent point mutation found: A743C. Amino acid result: D248A.
 * Number of point mutations: 2 ***

Xiaodi Wu, Biophysics 101, 2007-02-27