TChan/Notebook/2007-2-6


 * Full disclosure: This was posted 2.19.07


#!/usr/bin/env python

from Bio import GenBank, Seq
from Bio.Seq import Seq, translate
# Tiff: Seq and translate are imported in order to translate the DNA sequence into a protein sequence

# Create a GenBank parser object that will parse a raw record; this
# facilitates extracting specific information from the sequences.
# The parser has to be instantiated: the parser= argument below is expected
# to be a parser object (as in the Biopython tutorial), not the class itself.
record_parser = GenBank.FeatureParser()

# NCBIDictionary is a dictionary-style interface to GenBank.
ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank',
                                   parser=record_parser)

# If you pass NCBIDictionary a GenBank id, it will download that record.
# Tiff: changed it from 116496646 to 116496647
# NOTE(review): the id actually used below is '178850', which matches
# neither number in the note above -- confirm which record is intended.
parsed_record = ncbi_dict['178850']

print("GenBank id: %s" % parsed_record.id)

# Extract the sequence from the parsed_record as a plain string.
# tostring must be *called*: without the parentheses s would be the bound
# method itself, and len(s) / s.count(...) / s.find(...) would fail.
s = parsed_record.seq.tostring()
print("total sequence length: %d" % len(s))

# Tiff: my_seq is a Seq version of the GenBank DNA sequence, and my_protein
# is the protein sequence translated from it.
my_seq = Seq(s)
my_protein = translate(my_seq)

# Call tostring() once and reuse the result. Referencing the method without
# parentheses would print the method object instead of the sequence and make
# len() raise a TypeError.
protein_str = my_protein.tostring()
print('protein translation is %s' % protein_str)
print('protein amino acid length is %i' % len(protein_str))

# Longest poly-T run (in bases) to look for; shared by both counting methods.
max_repeat = 9

# Method 1: str.count, which counts non-overlapping occurrences only.
print("method 1")
for i in range(max_repeat):
    # Tiff: changed from 'A' to 'T' to get count of poly-Ts
    substr = 'T' * (i + 1)
    print("%s %d" % (substr, s.count(substr)))

# Method 2: repeated str.find, restarting one position past each hit so that
# overlapping occurrences are counted as well.
print("\nmethod 2")
for i in range(max_repeat):
    # Tiff: changed from 'A' to 'T' to get count of poly-Ts
    substr = 'T' * (i + 1)
    count = 0
    pos = s.find(substr)
    while pos != -1:
        count += 1
        pos = s.find(substr, pos + 1)
    print("%s %d" % (substr, count))

# Search GenBank and show the ids that came back.
gi_list = GenBank.search_for("Yersinia enterocolitica AND YopH")
print(gi_list)

# Rebind ncbi_dict without a parser argument.
# NOTE(review): presumably this makes lookups return the unparsed record
# text -- confirm against the Biopython GenBank documentation.
ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank')

# Download and print every record found. Iterate over the ids themselves:
# the dictionary is keyed by a single id, so it cannot be indexed with the
# whole list, and the ids are not positions into gi_list.
for gi in gi_list:
    gb_record = ncbi_dict[gi]
    print(gb_record)