Harvard:Biophysics 101/2007/Notebook:Michael Wang/2007-2-1

February 1
#!/usr/bin/env python
"""Retrieve a GenBank entry matching "Xenopus AND notch" and analyse it.

The index of the search hit to retrieve is taken from the last
command-line argument. The script prints the raw and parsed record,
the sequence length, its translation, and how often runs of 1 to
``max_repeat`` consecutive T's occur, counted in two different ways.
"""

import sys

from Bio import GenBank, Seq
from Bio.Seq import translate


def _count_overlapping(sequence, motif):
    """Return the number of occurrences of motif in sequence, overlaps included."""
    count = 0
    pos = sequence.find(motif)
    while pos != -1:
        count += 1
        # Resume one character past the previous hit so that overlapping
        # matches are found too.
        pos = sequence.find(motif, pos + 1)
    return count


# User defines which array element to pick
try:
    array_position = int(sys.argv[-1])
except ValueError:
    sys.exit("usage: give the index of the search hit to retrieve "
             "as the last argument")
print("Retrieving entry number  %d" % array_position)

# Creates a non-parsed library of genes with search terms Xenopus and notch
search_terms = "Xenopus AND notch"
print("Searching for %s entries" % search_terms)
gi_list = GenBank.search_for(search_terms)
print(gi_list)
print("Done searching")
print("Entry  %d" % array_position)
ncbi_dict_alpha = GenBank.NCBIDictionary('nucleotide', 'genbank')

# Pick the entry the user asked for (previously always the first hit).
try:
    gi_id = gi_list[array_position]
except IndexError:
    sys.exit("entry %d is out of range for the search results"
             % array_position)

# Prints the raw entry
print(ncbi_dict_alpha[gi_id])

# We can create a GenBank object that will parse a raw record
# This facilitates extracting specific information from the sequences
record_parser = GenBank.FeatureParser()

# NCBIDictionary is an interface to Genbank
ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank',
                                   parser=record_parser)

# If you pass NCBIDictionary a GenBank id, it will download that record
parsed_record = ncbi_dict[gi_id]
print(parsed_record)
print("GenBank id: %s" % parsed_record.id)

# Extract the sequence from the parsed_record
s = parsed_record.seq.tostring()
print("total sequence length: %d" % len(s))

# Translating the raw sequence
my_protein = translate(s)
print("translated sequence: %s" % my_protein)

max_repeat = 9

print("multiple T analysis")

# Method 1: str.count reports non-overlapping occurrences only.
print("method 1")
for run_length in range(1, max_repeat + 1):
    substr = 'T' * run_length
    print("%s %d" % (substr, s.count(substr)))

# Method 2: advance one position at a time so overlapping runs are counted.
print("\nmethod 2")
for run_length in range(1, max_repeat + 1):
    substr = 'T' * run_length
    print("%s %d" % (substr, _count_overlapping(s, substr)))