Harvard:Biophysics 101/2007/Notebook:HRH/2007-2-20
From OpenWetWare
Jump to navigationJump to search
Script in progress:
#!/usr/bin/env python
import os
from Bio import GenBank, Seq, Clustalw, SeqRecord, SubsMat
from Bio.Align import AlignInfo
from Bio.Align.FormatConvert import FormatConverter
from Bio.Alphabet import IUPAC
from Bio.Clustalw import MultipleAlignCL
from Bio.Seq import Seq, translate
#from Bio.Tools import Transcribe
from sets import Set
cline = Clustalw.MultipleAlignCL(os.path.join(os.curdir, 'apoe.fasta'))
cline.set_output('test.aln')
alignment = Clustalw.do_alignment(cline)
summary_align = AlignInfo.SummaryInfo(alignment)
# This segment determines if the error is a mutation or deletion
seq = []
for i in range(alignment.get_alignment_length()):
col = alignment.get_column(i)
s = Set() # create a new set
for c in range(len(col)):
s.add(col[c]) # add each column element to the set
if c == 0:
seq.append(col[c])
if '-' in s: # determine if there is a deletion
print 'Deletion at %d: %s' %(i, col)
else:
if len(s) > 1: # multiple elements in s indicate a mutation
print 'Point mutation at %d: %s' %(i, col)
f53h1 = "5'->3' Frame 1: Start codon position:" # frame 5 - 3 header
f53h2 = "5'->3' Frame 2: Start codon position:" # frame 5 - 3 header
f53h3 = "5'->3' Frame 3: Start codon position:" # frame 5 - 3 header
f35h1 = "3'->5' Frame 1: Start codon position:" # frame 5 - 3 header
f35h2 = "3'->5' Frame 2: Start codon position:" # frame 5 - 3 header
f35h3 = "3'->5' Frame 3: Start codon position:" # frame 5 - 3 header
scph = 'Stop codon position:'
tlh = 'Transcript length:'
length = 1
# find start and stop codons
count = 0
for i in range(len(seq)):
if seq[i] == 'A' and seq[i+1] == 'T' and seq[i+2] == 'G' and i%3 == 0:
count = 1
for j in range(i+2, len(seq)):
if seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'A' and j%3 == 0:
break
elif seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'G' and j%3 == 0:
break
elif seq[j] == 'T' and seq[j+1] == 'G' and seq[j+2] == 'A' and j%3 == 0:
break
if length < j - i:
length = j - i
print f53h1, i, scph, j, tlh, j - i, '(**using this one**)'
elif length > j - i:
print f53h1, i, scph, j, tlh, j - i
if count == 0:
print f53h1, 'NA', scph, 'NA', tlh, 'NA'
count = 0
for i in range(len(seq)):
if seq[i] == 'A' and seq[i+1] == 'T' and seq[i+2] == 'G' and i%3 == 1:
count = 1
for j in range(i+2, len(seq)):
if seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'A' and j%3 == 1:
break
elif seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'G' and j%3 == 1:
break
elif seq[j] == 'T' and seq[j+1] == 'G' and seq[j+2] == 'A' and j%3 == 1:
break
if length < j - i:
length = j - i
print f53h2, i, scph, j, tlh, j - i, '(**using this one, disregard previous**)'
elif length > j - i:
print f53h2, i, scph, j, tlh, j - i
if count == 0:
print f53h2, 'NA', scph, 'NA', tlh, 'NA'
count = 0
for i in range(len(seq)):
if seq[i] == 'A' and seq[i+1] == 'T' and seq[i+2] == 'G' and i%3 == 2:
count = 1
for j in range(i+2, len(seq)):
if seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'A' and j%3 == 2:
break
elif seq[j] == 'T' and seq[j+1] == 'A' and seq[j+2] == 'G' and j%3 == 2:
break
elif seq[j] == 'T' and seq[j+1] == 'G' and seq[j+2] == 'A' and j%3 == 2:
break
if length < j - i:
length = j - i
print f53h3, i, scph, j, tlh, j - i, '(**using this one, disregard previous**)'
elif length > j - i:
print f53h3, i, scph, j, tlh, j - i
if count == 0:
print f53h3, 'NA', scph, 'NA', tlh, 'NA'
#3-5
count = 0
for i in range(len(seq)):
if seq[i] == 'G' and seq[i+1] == 'T' and seq[i+2] == 'A' and i%3 == 0:
count = 1
for j in range(0, i-2):
if seq[j] == 'A' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if seq[j] == 'A' and seq[j+1] == 'G' and seq[j+2] == 'T':
break
if seq[j] == 'G' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if length < i - j:
length = i - j
print f35h1, i, scph, j, tlh, i - j, '(**using this one, disregard previous**)'
elif length > j - i:
print f35h1, i, scph, j, tlh, i - j
if count == 0:
print f35h1, 'NA', scph, 'NA', tlh, 'NA'
count = 0
for i in range(len(seq)):
if seq[i] == 'G' and seq[i+1] == 'T' and seq[i+2] == 'A' and i%3 == 1:
count = 1
for j in range(0, i-2):
if seq[j] == 'A' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if seq[j] == 'A' and seq[j+1] == 'G' and seq[j+2] == 'T':
break
if seq[j] == 'G' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if length < i - j:
length = i - j
print f35h2, i, scph, j, tlh, i - j, '(**using this one, disregard previous**)'
elif length > j - i:
print f35h2, i, scph, j, tlh, i - j
if count == 0:
print f35h2, 'NA', scph, 'NA', tlh, 'NA'
count = 0
for i in range(len(seq)):
if seq[i] == 'G' and seq[i+1] == 'T' and seq[i+2] == 'A' and i%3 == 2:
count = 1
for j in range(0, i-2):
if seq[j] == 'A' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if seq[j] == 'A' and seq[j+1] == 'G' and seq[j+2] == 'T':
break
if seq[j] == 'G' and seq[j+1] == 'A' and seq[j+2] == 'T':
break
if length < i - j:
length = i - j
print f35h3, i, scph, j, tlh, i - j, '(**using this one, disregard previous**)'
elif length > j - i:
print f35h3, i, scph, j, tlh, i - j
if count == 0:
print f35h3, 'NA', scph, 'NA', tlh, 'NA'