Harvard:Biophysics 101/2007/Notebook:Xiaodi Wu/2007-2-6
From OpenWetWare
Jump to navigationJump to search
#!/usr/bin/env python from Bio import GenBank, Seq # Wait message print "thinking...please wait one moment..." # Create GenBank object to parse a raw record record_parser = GenBank.FeatureParser() ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser) # Download and parse record parsed_record = ncbi_dict['119392085'] # IHH gene print "GenBank ID:", parsed_record.id # Extract the sequence from the parsed_record s = parsed_record.seq.tostring() print "total sequence length:", len(s) # Translate! p = Seq.translate(s).split('*') # Split a stop codon q = Seq.translate('C' + s).split('*') # Try a different frame r = Seq.translate('CC' + s).split('*') # Try another # pc = Seq.translate(Seq.reverse_complement(s)).split('*') # And yet another # qc = Seq.translate('C' + Seq.reverse_complement(s)).split('*') # And another # rc = Seq.translate('CC' + Seq.reverse_complement(s)).split('*') # And another # Group them all pqr = p pqr.extend(q) pqr.extend(r) # pqr.extend(pc) # pqr.extend(qc) # pqr.extend(rc) # Find start codon for each def trim(x): return x[x.find('M'):] p_trimmed = map(trim, pqr) # Find longest of them def max(x, y): if len(x) < len(y): return y else: return x p_max = reduce(max, p_trimmed) print "total translated sequence length:", len(p_max) max_repeat = 9 print "\npoly-A count -- method 1" for i in range(max_repeat): substr = ''.join(['A' for n in range(i+1)]) print substr, s.count(substr) print "\npoly-A count -- method 2" for i in range(max_repeat): substr = ''.join(['A' for n in range(i+1)]) count = 0 pos = s.find(substr,0) while not pos == -1: count = count + 1 pos = s.find(substr,pos+1) print substr, count print "\npoly-T count -- method 1" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) print substr, s.count(substr) print "\npoly-T count -- method 2" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) count = 0 pos = s.find(substr,0) while not pos == -1: count = count + 1 pos = s.find(substr,pos+1) print substr, count print "\ntranslated sequence" print p_max print "\nraw record (please wait a moment...)" ncbi_dict_raw = GenBank.NCBIDictionary('nucleotide', 'genbank') raw_record = ncbi_dict_raw['119392085'] print raw_record print '-' * 40 print "Xiaodi Wu, Biophysics 101, 2007-02-06\n"