Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-27

From OpenWetWare
Jump to navigationJump to search

The code...(a bit brute force in the end, but...)

#!/usr/bin/env python

import os, re
from Bio import Clustalw, Seq, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter

print '-' * 40

# (1) Do nucleotide alignment

# print "Preparing initial alignment data"
# print '-' * 40

cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary = AlignInfo.SummaryInfo(alignment)

# consensus = summary.dumb_consensus()
# pssmatrix = summary.pos_specific_score_matrix(consensus)

# (2) Make ourselves a copy of the reference sequence; very useful really

seq_copy = alignment.get_all_seqs()
ref_seq = seq_copy[0]

# TODO: ClustalW does something very nasty and reverses the order of inputted
# sequences on occasion: what is needed is an algorithm to compare the description
# fields of each of the sequences, finding their location in the original FASTA
# file and taking the topmost as the reference

# (3) Try orf determination on main sequence
#     all others will be presumed mutants so we don't have to go through this
#     analysis more than once; we of course need to presume that there are no
#     frameshift mutations before the beginning of the orf that will throw these
#     calculations off

print "ORF DETERMINATION"
print '-' * 40

orf_start = -1 # for now
orf_end = -1 # for now
orf_frame = 0 # for now
s = ref_seq.seq.tostring()

# Translate!
p = Seq.translate(s).split('*') # Split a stop codon
q = Seq.translate('C' + s).split('*') # Try a different frame
r = Seq.translate('CC' + s).split('*') # Try another
pc = Seq.translate(Seq.reverse_complement(s)).split('*') # And yet another
qc = Seq.translate('C' + Seq.reverse_complement(s)).split('*') # And another
rc = Seq.translate('CC' + Seq.reverse_complement(s)).split('*') # And another

# Group them all
pqr = Seq.translate(s).split('*')
pqr.extend(q)
pqr.extend(r)
pqr.extend(pc)
pqr.extend(qc)
pqr.extend(rc)

# Find start codon for each
def trim(x):
	if x.find('M') >= 0: return x[x.find('M'):]
	else: return ''
p_trimmed = map(trim, pqr)

# Find longest of them
def max(x, y):
	if len(x) < len(y): return y
	else: return x
p_max = reduce(max, p_trimmed)

# CREATE PRETTY OUTPUT (consumes extra cycles)
# and actually pinpoints the frame -- otherwise a pretty useless algo

px = reduce(max, map(trim, p))
px2 = Seq.translate(s).find(px) * 3 + 1 # Start codon position (add one b/c zero-based)
px3 = len(px) * 3 # Length
px4 = px2 + px3 # Stop codon position

if px3 == 0: # The pathetic minimal case that otherwise kills the algo
	px2 = 0
	px4 = 0

print "5'->3' frame 1:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (px2, px4, px3),
if px == p_max:
	print "(** using this one **)",
	orf_frame = 1
	orf_start = px2
	orf_end = px4
print "\n",

qx = reduce(max, map(trim, q))
qx2 = Seq.translate('C' + s).find(qx) * 3 # Start codon position
qx3 = len(qx) * 3 # Length
qx4 = qx2 + qx3 # Stop codon position

if qx3 == 0: # The pathetic minimal case that otherwise kills the algo
	qx2 = 0
	qx4 = 0

print "5'->3' frame 2:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qx2, qx4, qx3),
if qx == p_max:
	print "(** using this one **)",
	orf_frame = 2
	orf_start = qx2
	orf_end = qx4
print "\n",

rx = reduce(max, map(trim, r))
rx2 = Seq.translate('CC' + s).find(rx) * 3 - 1 # Start codon position
rx3 = len(rx) * 3 # Length
rx4 = rx2 + rx3 # Stop codon position

if rx3 == 0: # The pathetic minimal case that otherwise kills the algo
	rx2 = 0
	rx4 = 0

print "5'->3' frame 3:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rx2, rx4, rx3),
if rx == p_max:
	print "(** using this one **)",
	orf_frame = 3
	orf_start = rx2
	orf_end = rx4
print "\n",

pcx = reduce(max, map(trim, pc))
pcx2 = len(s) + 1 - (Seq.translate(Seq.reverse_complement(s)).find(pcx) * 3 + 1) # Start codon position
pcx3 = len(pcx) * 3 # Length
pcx4 = pcx2 - pcx3 # Stop codon position

if pcx3 == 0: # The pathetic minimal case that otherwise kills the algo
	pcx2 = 0
	pcx4 = 0

print "3'->5' frame 1:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (pcx2, pcx4, pcx3),
if pcx == p_max:
	print "(** using this one **)",
	orf_frame = 4
	orf_start = pcx2
	orf_end = pcx4
print "\n",

qcx = reduce(max, map(trim, qc))
qcx2 = len(s) + 1 - (Seq.translate('C' + Seq.reverse_complement(s)).find(qcx) * 3) # Start codon position
qcx3 = len(qcx) * 3 # Length
qcx4 = qcx2 - qcx3 # Stop codon position

if qcx3 == 0: # The pathetic minimal case that otherwise kills the algo
	qcx2 = 0
	qcx4 = 0

print "3'->5' frame 2:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (qcx2, qcx4, qcx3),
if qcx == p_max:
	print "(** using this one **)",
	orf_frame = 5
	orf_start = qcx2
	orf_end = qcx4
print "\n",

rcx = reduce(max, map(trim, rc))
rcx2 = len(s) + 1 - (Seq.translate('CC' + Seq.reverse_complement(s)).find(rcx) * 3 - 1) # Start codon position
rcx3 = len(rcx) * 3 # Length
rcx4 = rcx2 - rcx3 # Stop codon position

if rcx3 == 0: # The pathetic minimal case that otherwise kills the algo
	rcx2 = 0
	rcx4 = 0

print "3'->5' frame 3:"
print "start codon position: %d. stop codon position: %d. transcript length: %d." % (rcx2, rcx4, rcx3),
if rcx == p_max:
	print "(** using this one **)",
	orf_frame = 6
	orf_start = rcx2
	orf_end = rcx4
print "\n",

print '-' * 40

# END OF USELESS OUTPUT-GENERATING SEGMENT

# We need to find a way to align all the sequences, including mutations
# our current orf_start and orf_end do not take into account insertions, etc.
# which we must examine before we can do any additional processing

s_copy = re.sub('[^\-]', '*', s) # well, the insertions we need to account for are all '-'
orf_start_normalized = s_copy.replace('*', '^', orf_start).rfind('^') # zero-based orf start
orf_end_normalized = s_copy.replace('*', '^', orf_end).rfind('^') # zero-based orf end

# Now, we need to trim everything down to this size
trimmed_seq_descriptions = []
trimmed_seqs = []
for seq in alignment.get_all_seqs():
	s = seq.seq.tostring()
	trimmed_seq_descriptions.append(seq.description)
	if orf_start_normalized < orf_end_normalized:
		start = orf_start_normalized
		length = orf_end_normalized - orf_start_normalized + 3 # include stop codon
		trimmed_seqs.append(s[start:start + length])
	else:
		temp = Seq.reverse_complement(s)
		start = len(s) - orf_start_normalized - 1
		length = orf_start_normalized - orf_end_normalized + 3 # include stop codon
		trimmed_seqs.append(temp[start:start + length])

# (4) Work on the protein side of things (get an alignment in a hidden file)

prot = open(os.path.join(os.curdir, '.apoe_prot.fasta'), 'w')
counter = 0 # The dumb counter method of synching two lists
raw_translated_seqs = []
for seq in trimmed_seqs:
	translated_seq = Seq.translate(seq.replace('-',''))
	raw_translated_seqs.append(translated_seq)
	# Create input file
	prot.write('>')
	prot.write(trimmed_seq_descriptions[counter])
	prot.write('\n')
	prot.write(translated_seq)
	prot.write('\n')
	# Increment
	counter = counter + 1
prot.close()

# Figure out protein alignment
prot_cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, '.apoe_prot.fasta'))
prot_cline.set_output('.test_prot.aln')
prot_alignment = Clustalw.do_alignment(prot_cline)
os.remove(os.path.join(os.curdir, '.apoe_prot.fasta'))
os.remove(os.path.join(os.curdir, '.test_prot.aln'))

# (5) Use clumsy algo to find occurrences of deletions, insertions, rearrangements

seq_copy.pop(0) # pop out reference sequence (the first one)
ref_trimmed = trimmed_seqs.pop(0)
ref_trimmed_desc = trimmed_seq_descriptions.pop(0)
print "COMPARING AGAINST", ref_seq.description
print "[all nucleotide positions given relative to start codon]"
print '-' * 40
counter = 0 # clumsy counter again...to be used later

for seq in seq_copy:
	# Write a temporary file for one-on-one alignment
	working_file = os.path.join(os.curdir, '.apoe_working.fasta')
	temp = open(working_file, 'w')
	temp.write('>')
	temp.write(ref_seq.description)
	temp.write('\n')
	temp.write(ref_trimmed)
	temp.write('\n')
	temp.write('>')
	temp.write(seq.description)
	temp.write('\n')
	temp.write(trimmed_seqs[counter])
	temp.close()
	# Do the aligning
	cline_temp = Clustalw.MultipleAlignCL(working_file)
	cline_temp.set_output('.test_working.aln')
	alignment_temp = Clustalw.do_alignment(cline_temp)	
	# Analyse it
	realigned_ref = alignment_temp.get_seq_by_num(0).tostring()
	realigned_test = alignment_temp.get_seq_by_num(1).tostring()
	summary_temp = AlignInfo.SummaryInfo(alignment_temp)
	consensus_temp = summary_temp.dumb_consensus().tostring()
	
	# HEAVY DUTY OUTPUT TIME!
	
	print seq.description
	print '-' * 40
	print '\n',
	
	# Find point mutations in consensus sequence (X's)
	point_mutations_num = consensus_temp.count('X')
	point_mutations_find = consensus_temp.find('X',0)
	point_mutations_count = dict(A=0, C=0, G=0, T=0)
	while not point_mutations_find == -1:
		old = realigned_ref[point_mutations_find]
		new = realigned_test[point_mutations_find]
		# (Take into account any insertions or deletions for frame)
		index_adjusted = point_mutations_find - realigned_ref.count('-', 0, point_mutations_find)
		offset = index_adjusted % 3
		# (Work backwards to reconstitute the codon)
		original_codon = realigned_ref[point_mutations_find]
		j = 1
		for i in range(0, offset):
			char_temp = realigned_ref[point_mutations_find - j]
			while not char_temp.isalpha():
				j = j + 1
				char_temp = realigned_ref[point_mutations_find - j]
			if char_temp.isalpha():
				original_codon = char_temp + original_codon
				j = j + 1
		# (Work forwards to fill out the codon)
		j = 1
		for i in range(0, 2 - offset):
			char_temp = realigned_ref[point_mutations_find + j]
			while not char_temp.isalpha():
				j = j + 1
				char_temp = realigned_ref[point_mutations_find + j]
			if char_temp.isalpha():
				original_codon = original_codon + char_temp
				j = j + 1
		# (Do the same for the mutated codon)
		# (Work backwards to reconstitute the codon)
		mutated_codon = realigned_test[point_mutations_find]
		j = 1
		for i in range(0, offset):
			char_temp = realigned_test[point_mutations_find - j]
			while not char_temp.isalpha():
				j = j + 1
				char_temp = realigned_test[point_mutations_find - j]
			if char_temp.isalpha():
				mutated_codon = char_temp + mutated_codon
				j = j + 1
		# (Work forwards to fill out the codon)
		j = 1
		for i in range(0, 2 - offset):
			char_temp = realigned_test[point_mutations_find + j]
			while not char_temp.isalpha():
				j = j + 1
				char_temp = realigned_test[point_mutations_find + j]
			if char_temp.isalpha():
				mutated_codon = mutated_codon + char_temp
				j = j + 1
		a = Seq.translate(original_codon)
		b = Seq.translate(mutated_codon)
		if a == b:
			print "Silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new),
			print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b)
		else:
			print "Non-silent point mutation found: %s%d%s." % (old, index_adjusted + 1, new),
			print "Amino acid result: %s%d%s." % (a, (index_adjusted - index_adjusted % 3) / 3 + 1, b)			
		# Increment
		point_mutations_find = consensus_temp.find('X', point_mutations_find + 1)

	print "*** Number of point mutations:", point_mutations_num, "***"
	print '\n',
	
	# Find dashes in the reference sequence: those are the insertions
	insertions_num = 0
	insertions_find = realigned_ref.find('-', 0)
	while not insertions_find == -1:
		contig_count = 1
		while realigned_ref[insertions_find + contig_count] == '-':
			contig_count = contig_count + 1
		# (Take into account any earlier insertions)
		index_adjusted = insertions_find - realigned_ref.count('-', 0, insertions_find)
		offset = index_adjusted % 3
		countback = insertions_find - offset - 6
		translation_snippet_a = realigned_ref[countback:insertions_find + contig_count + 6]
		a = Seq.translate(translation_snippet_a.replace('-',''))
		translation_snippet_b = realigned_test[countback:insertions_find + contig_count + 6]
		b = Seq.translate(translation_snippet_b.replace('-',''))
		if contig_count % 3 == 0:
			print "Non-frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1)
			print "DNA Reference:  %s" % (translation_snippet_a)
			print "Prot Reference: %s" % (a),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
			print "DNA Result:     %s" % (translation_snippet_b)
			print "Prot Result:    %s" % (b),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
		else:
			print "Frameshift insertion found: %d base insertion at position %d" % (contig_count, index_adjusted + 1)
			print "DNA Reference:  %s" % (translation_snippet_a)
			print "Prot Reference: %s" % (a),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
			print "DNA Result:     %s" % (translation_snippet_b)
			print "Prot Result:    %s" % (b),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
		insertions_find = realigned_ref.find('-', insertions_find + contig_count)
		print '\n',

	# Find dashes in the test sequence: those are the deletions
	deletions_num = 0
	deletions_find = realigned_test.find('-', 0)
	while not deletions_find == -1:
		contig_count = 1
		while realigned_test[deletions_find + contig_count] == '-':
			contig_count = contig_count + 1
		# (Take into account any earlier insertions)
		index_adjusted = deletions_find - realigned_ref.count('-', 0, deletions_find)
		offset = index_adjusted % 3
		countback = deletions_find - offset - 6
		translation_snippet_a = realigned_ref[countback:deletions_find + contig_count + 6]
		a = Seq.translate(translation_snippet_a.replace('-',''))
		translation_snippet_b = realigned_test[countback:deletions_find + contig_count + 6]
		b = Seq.translate(translation_snippet_b.replace('-',''))
		if contig_count % 3 == 0:
			print "Non-frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1)
			print "DNA Reference:  %s" % (translation_snippet_a)
			print "Prot Reference: %s" % (a),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
			print "DNA Result:     %s" % (translation_snippet_b)
			print "Prot Result:    %s" % (b),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
		else:
			print "Frameshift deletion found: %d base deletion, position %d to %d" % (contig_count, index_adjusted + 1, index_adjusted + contig_count + 1)
			print "DNA Reference:  %s" % (translation_snippet_a)
			print "Prot Reference: %s" % (a),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
			print "DNA Result:     %s" % (translation_snippet_b)
			print "Prot Result:    %s" % (b),
			if orf_frame <= 3: print "(5'->3' frame %d)" % (orf_frame)
			else: print "(3'->5' frame %d)" % (orf_frame - 3)
		deletions_find = realigned_test.find('-', deletions_find + contig_count)
		print '\n',
	
	print '-' * 40
	# Increment and clean up
	counter = counter + 1
	os.remove(working_file)
	os.remove(os.path.join(os.curdir, '.test_working.aln'))

print "Xiaodi Wu, Biophysics 101, 2007-02-27\n"

...and the output...

----------------------------------------
ORF DETERMINATION
----------------------------------------
5'->3' frame 1:
start codon position: 61. stop codon position: 1012. transcript length: 951. (** using this one **) 
5'->3' frame 2:
start codon position: 105. stop codon position: 246. transcript length: 141. 
5'->3' frame 3:
start codon position: 566. stop codon position: 809. transcript length: 243. 
3'->5' frame 1:
start codon position: 94. stop codon position: 91. transcript length: 3. 
3'->5' frame 2:
start codon position: 929. stop codon position: 335. transcript length: 594. 
3'->5' frame 3:
start codon position: 0. stop codon position: 0. transcript length: 0. 
----------------------------------------
COMPARING AGAINST gi|178850|gb|K00396.1|HUMAPOE3
[all nucleotide positions given relative to start codon]
----------------------------------------
lcl|HUMAPOE3_with_deletion
----------------------------------------

*** Number of point mutations: 0 ***

Frameshift deletion found: 5 base deletion, position 743 to 748
DNA Reference:  CGCCTGGACGAGGTGAAG
Prot Reference: RLDEVK (5'->3' frame 1)
DNA Result:     CGCCTGG-----GTGAAG
Prot Result:    RLGE (5'->3' frame 1)

----------------------------------------
lcl|HUMAPOE3_with_mutation
----------------------------------------

Non-silent point mutation found: G599A. Amino acid result: G200E.
Non-silent point mutation found: A743C. Amino acid result: D248A.
*** Number of point mutations: 2 ***

----------------------------------------
Xiaodi Wu, Biophysics 101, 2007-02-27