Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-6


 * 1) !/usr/bin/env python

from Bio import GenBank, Seq

print "thinking...please wait one moment..."
 * 1) Wait message

record_parser = GenBank.FeatureParser ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser) parsed_record = ncbi_dict['119392085'] # IHH gene
 * 1) Create GenBank object to parse a raw record
 * 1) Download and parse record

print "GenBank ID:", parsed_record.id

s = parsed_record.seq.tostring print "total sequence length:", len(s)
 * 1) Extract the sequence from the parsed_record

p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another
 * 1) Translate!
 * 1) pc = Seq.translate(Seq.reverse_complement(s)).split('*') # And yet another
 * 2) qc = Seq.translate('C' + Seq.reverse_complement(s)).split('*') # And another
 * 3) rc = Seq.translate('CC' + Seq.reverse_complement(s)).split('*') # And another

pqr = p pqr.extend(q) pqr.extend(r)
 * 1) Group them all
 * 1) pqr.extend(pc)
 * 2) pqr.extend(qc)
 * 3) pqr.extend(rc)

def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr)
 * 1) Find start codon for each

def max(x, y): if len(x) < len(y): return y else: return x p_max = reduce(max, p_trimmed) print "total translated sequence length:", len(p_max)
 * 1) Find longest of them

max_repeat = 9

print "\npoly-A count -- method 1" for i in range(max_repeat): substr = ''.join(['A' for n in range(i+1)]) print substr, s.count(substr)

print "\npoly-A count -- method 2" for i in range(max_repeat): substr = ''.join(['A' for n in range(i+1)]) count = 0 pos = s.find(substr,0) while not pos == -1: count = count + 1 pos = s.find(substr,pos+1) print substr, count

print "\npoly-T count -- method 1" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) print substr, s.count(substr)

print "\npoly-T count -- method 2" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) count = 0 pos = s.find(substr,0) while not pos == -1: count = count + 1 pos = s.find(substr,pos+1) print substr, count print "\ntranslated sequence" print p_max

print "\nraw record (please wait a moment...)" ncbi_dict_raw = GenBank.NCBIDictionary('nucleotide', 'genbank') raw_record = ncbi_dict_raw['119392085'] print raw_record

print '-' * 40 print "Xiaodi Wu, Biophysics 101, 2007-02-06\n"