Harvard:Biophysics 101/2007/Notebook:HRH/2007-2-6

Assigned on Feb 1, 2007

Script:


#!/usr/bin/env python


# Hetmann Hsieh
# Assignment 1
# 2/6/07

from Bio import GenBank, Seq


 * 1) We can create a GenBank object that will parse a raw record
 * 2) This facilitates extracting specific information from the sequences

record_parser = GenBank.FeatureParser


 * 1) NCBIDictionary is an interface to Genbank

ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)


 * 1) If you pass NCBIDictionary a GenBank id, it will download that record

parsed_record = ncbi_dict['116496644']

print "GenBank id:", parsed_record.id


 * 1) Extract the sequence from the parsed_record

s = parsed_record.seq.tostring print "total sequence length:", len(s)

max_repeat = 9

print "method 1" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) print substr, s.count(substr)

print "\nmethod 2" for i in range(max_repeat): substr = ''.join(['T' for n in range(i+1)]) count = 0 pos = s.find(substr,0) while not pos == -1: count = count + 1 pos = s.find(substr,pos+1) print substr, count


 * 1) Other parts of the assignment are as follows
 * 2) Print the translated protein sequence and length

from Bio.Seq import translate print "protein translation:" protein = translate(s) print protein print "protein length:", len(protein)
 * 1) New NCBIDictionary without a parser, printing of raw record

ncbi_dict2 = GenBank.NCBIDictionary('nucleotide', 'genbank')

raw_record = ncbi_dict2['116496644'] print raw_record

Output:

GenBank id: BC126205.1
total sequence length: 3773
method 1
T 805
TT 115
TTT 28
TTTT 3
TTTTT 0
TTTTTT 0
TTTTTTT 0
TTTTTTTT 0
TTTTTTTTT 0

method 2 T 805 TT 143 TTT 31 TTTT 3 TTTTT 0 TTTTTT 0 TTTTTTT 0 TTTTTTTT 0 TTTTTTTTT 0 protein translation: EGERLKEAMRSPRTRGRSGRPLSLLLALLCALRAKVCGASGQFELEILSMQNVNGELQNGNCCGGARNPGDRKCTRDECDTYFKVCLKEYQSRVTAGGPCSFGSGSTPVIGGNTFNLKASRGNDRNRIVLPFSFAWPRSYTLLVEAWDSSNDTVQPDSIIEKASHSGMINPSRQWQTLKQNTGVAHFEYQIRVTCDDYYYGFGCNKFCRPRDDFFGHYACDQNGNKTCMEGWMGPECNRAICRQGCSPKHGSCKLPGDCRCQYGWQGLYCDKCIPHPGCVHGICNEPWQCLCETNWGGQLCDKDLNYCGTHQPCLNGGTCSNTGPDKYQCSCPEGYSGPNCEIAEHACLSDPCHNRGSCKETSLGFECECSPGWTGPTCSTNIDDCSPNNCSHGGTCQDLVNGFKCVCPPQWTGKTCQLDANECEAKPCVNAKSCKNLIASYYCDCLPGWMGQNCDININDCLGQCQNDASCRDLVNGYRCICPPGYAGDHCERDIDECASNPCLNGGHCQNEINRFQCLCPTGFSGNLCQLDIDYCEPNPCQNGAQCYNRASDYFCKCPEDYEGKNCSHLKDHCRTTPCEVIDSCTVAMASNDTPEGVRYISSNVCGPHGKCKSQSGGKFTCDCNKGFTGTYCHENINDCESNPCRNGGTCIDGVNSYKCICSDGWEGAYCETNINDCSQNPCHNGGTCRDLVNDFYCDCKNGWKGKTCHSRDSQCDEATCNNGGTCYDEGDAFKCMCPGGWEGTTCNIARNSSCLPNPCHNGGTCVVNGESFTCVCKEGWEGPICAQNTNDCSPHPCYNSGTCVDGDNWYRCECAPGFAGPDCRININECQSSPCAFGATCVDEINGYRCVCPPGHSGAKCQEVSGRPCITMGSVIPDGAKWDDDCNTCQCLNGRIACSKVWCGPRPCLLHKGHSECPSGQSCIPILDDQCFVHPCTGVGECRSSSLQPVKTKCTSDSYYQDNCANITFTFNKEMMSPGLTTEHICSELRNLNILKNVSAEYSIYIACEPSPSANNEIHVAISAEDIRDDGNPIKEITDKIIDLVSKRDGNSSLIAAVAEVRVQRRPLKNRTDFLVPLLSSVLTVAWICCLVTAFYWCLRKRRKPGSHTHSASEDNTTNNVREQLNQIKNPIEKHGANTVPIKDYENKNSKMSKIRTHNSEVEEDDMDKHQQKARFAKQPAYTLVDREEKPPNGTPTKHPNWTNKQDNRDLESAQSLNRMEYIV*QTAGTAAAR*SLRACSSLNCRVILESEAVA protein length: 1257 LOCUS      BC126205                3773 bp    mRNA    linear   PRI 23-OCT-2006 DEFINITION Homo sapiens jagged 1 (Alagille syndrome), mRNA (cDNA clone            MGC:161483 IMAGE:8991921), complete cds. ACCESSION  BC126205 VERSION    BC126205.1  GI:116496644 KEYWORDS   MGC. SOURCE     Homo sapiens (human) ORGANISM Homo sapiens Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini; Hominidae; Homo. 
REFERENCE  1  (bases 1 to 3773) AUTHORS  Strausberg,R.L., Feingold,E.A., Grouse,L.H., Derge,J.G., Klausner,R.D., Collins,F.S., Wagner,L., Shenmen,C.M., Schuler,G.D., Altschul,S.F., Zeeberg,B., Buetow,K.H., Schaefer,C.F., Bhat,N.K., Hopkins,R.F., Jordan,H., Moore,T., Max,S.I., Wang,J., Hsieh,F., Diatchenko,L., Marusina,K., Farmer,A.A., Rubin,G.M., Hong,L., Stapleton,M., Soares,M.B., Bonaldo,M.F., Casavant,T.L., Scheetz,T.E., Brownstein,M.J., Usdin,T.B., Toshiyuki,S., Carninci,P., Prange,C., Raha,S.S., Loquellano,N.A., Peters,G.J., Abramson,R.D., Mullahy,S.J., Bosak,S.A., McEwan,P.J., McKernan,K.J., Malek,J.A., Gunaratne,P.H., Richards,S., Worley,K.C., Hale,S., Garcia,A.M., Gay,L.J., Hulyk,S.W., Villalon,D.K., Muzny,D.M., Sodergren,E.J., Lu,X., Gibbs,R.A., Fahey,J., Helton,E., Ketteman,M., Madan,A., Rodrigues,S., Sanchez,A., Whiting,M., Madan,A., Young,A.C., Shevchenko,Y., Bouffard,G.G., Blakesley,R.W., Touchman,J.W., Green,E.D., Dickson,M.C., Rodriguez,A.C., Grimwood,J., Schmutz,J., Myers,R.M., Butterfield,Y.S., Krzywinski,M.I., Skalska,U., Smailus,D.E., Schnerch,A., Schein,J.E., Jones,S.J. and Marra,M.A. CONSRTM   Mammalian Gene Collection Program Team TITLE    Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences JOURNAL  Proc. Natl. Acad. Sci. U.S.A. 99 (26), 16899-16903 (2002) PUBMED  12477932 REFERENCE  2  (bases 1 to 3773) CONSRTM  NIH MGC Project TITLE    Direct Submission JOURNAL  Submitted (22-OCT-2006) National Institutes of Health, Mammalian Gene Collection (MGC), Bethesda, MD 20892-2590, USA REMARK   NIH-MGC Project URL: http://mgc.nci.nih.gov COMMENT    Contact: MGC help desk Email: cgapbs-r@mail.nih.gov Tissue Procurement: Mike Brownstein, NIMH cDNA Library Preparation: British Columbia Cancer Research Center cDNA Library Arrayed by: The I.M.A.G.E. 
Consortium (LLNL) DNA Sequencing by: Genome Sequence Centre, BC Cancer Agency, Vancouver, BC, Canada info@bcgsc.bc.ca           Martin Hirst, Thomas Zeng, Ryan Morin, Michelle Moksa, Johnson Pang, Diana Mah, Jing Wang, Kieth Fichter, Eric Chuah, Allen Delaney, Rob Kirkpatrick, Agnes Baross, Sarah Barber, Mabel Brown-John, Steve S. Chand, William Chow, Ryan Babakaiff, Dave Wong, Corey Matsuo, Jaclyn Beland, Susan Gibson, Luis delRio, Ruth Featherstone, Malachi Griffith, Obi Griffith, Ran Guin, Nancy Liao, Kim MacDonald, Mike R. Mayo, Josh Moran, Diana Palmquist, JR            Santos, Duane Smailus, Jeff Stott, Miranda Tsai, George Yang, Jacquie Schein, Asim Siddiqui,Steven Jones, Rob Holt, Marco Marra. Clone distribution: MGC clone distribution information can be found through the I.M.A.G.E. Consortium/LLNL at: http://image.llnl.gov Series: IRCB Plate: 7 Row: E Column: 7. Differences found between this sequence and the human reference genome (build 36) are described in misc_difference features below and these differences were also compared to chimpanzee genome (build 2). 
FEATURES            Location/Qualifiers source         1..3773 /organism="Homo sapiens" /mol_type="mRNA" /db_xref="taxon:9606" /clone="MGC:161483 IMAGE:8991921" /tissue_type="Brain, cerebellum, PCR rescued clones" /clone_lib="NIH_MGC_313" /note="Vector: pCR-XL-TOPO with reversed insert; Clone                    identification sequence tag: GACACATT" gene           1..3773 /gene="JAG1" /note="synonyms: AWS, HJ1, AHD, CD339" /db_xref="GeneID:182" /db_xref="HGNC:6188" /db_xref="MIM:601920" CDS            25..3681 /gene="JAG1" /codon_start=1 /product="jagged 1 (Alagille syndrome)" /protein_id="AAI26206.1" /db_xref="GI:116496645" /db_xref="GeneID:182" /db_xref="HGNC:6188" /db_xref="MIM:601920" /translation="MRSPRTRGRSGRPLSLLLALLCALRAKVCGASGQFELEILSMQN                    VNGELQNGNCCGGARNPGDRKCTRDECDTYFKVCLKEYQSRVTAGGPCSFGSGSTPVI                     GGNTFNLKASRGNDRNRIVLPFSFAWPRSYTLLVEAWDSSNDTVQPDSIIEKASHSGM                     INPSRQWQTLKQNTGVAHFEYQIRVTCDDYYYGFGCNKFCRPRDDFFGHYACDQNGNK                     TCMEGWMGPECNRAICRQGCSPKHGSCKLPGDCRCQYGWQGLYCDKCIPHPGCVHGIC                     NEPWQCLCETNWGGQLCDKDLNYCGTHQPCLNGGTCSNTGPDKYQCSCPEGYSGPNCE                     IAEHACLSDPCHNRGSCKETSLGFECECSPGWTGPTCSTNIDDCSPNNCSHGGTCQDL                     VNGFKCVCPPQWTGKTCQLDANECEAKPCVNAKSCKNLIASYYCDCLPGWMGQNCDIN                     INDCLGQCQNDASCRDLVNGYRCICPPGYAGDHCERDIDECASNPCLNGGHCQNEINR                     FQCLCPTGFSGNLCQLDIDYCEPNPCQNGAQCYNRASDYFCKCPEDYEGKNCSHLKDH                     CRTTPCEVIDSCTVAMASNDTPEGVRYISSNVCGPHGKCKSQSGGKFTCDCNKGFTGT                     YCHENINDCESNPCRNGGTCIDGVNSYKCICSDGWEGAYCETNINDCSQNPCHNGGTC RDLVNDFYCDCKNGWKGKTCHSRDSQCDEATCNNGGTCYDEGDAFKCMCPGGWEGTTC NIARNSSCLPNPCHNGGTCVVNGESFTCVCKEGWEGPICAQNTNDCSPHPCYNSGTCV DGDNWYRCECAPGFAGPDCRININECQSSPCAFGATCVDEINGYRCVCPPGHSGAKCQ EVSGRPCITMGSVIPDGAKWDDDCNTCQCLNGRIACSKVWCGPRPCLLHKGHSECPSG QSCIPILDDQCFVHPCTGVGECRSSSLQPVKTKCTSDSYYQDNCANITFTFNKEMMSP GLTTEHICSELRNLNILKNVSAEYSIYIACEPSPSANNEIHVAISAEDIRDDGNPIKE 
ITDKIIDLVSKRDGNSSLIAAVAEVRVQRRPLKNRTDFLVPLLSSVLTVAWICCLVTA FYWCLRKRRKPGSHTHSASEDNTTNNVREQLNQIKNPIEKHGANTVPIKDYENKNSKM SKIRTHNSEVEEDDMDKHQQKARFAKQPAYTLVDREEKPPNGTPTKHPNWTNKQDNRD LESAQSLNRMEYIV"    misc_difference 789                     /gene="JAG1"                     /note="'T' in cDNA is 'C' in the human genome; no amino acid change. The chimpanzee genome agrees with the human genomic sequence and not the cDNA."    misc_difference 3441                     /gene="JAG1"                     /note="'C' in cDNA is 'T' in the human genome; no amino acid change." ORIGIN             1 gagggggagc gtctcaaaga agcgatgcgt tccccacgga cgcgcggccg gtccgggcgc       61 cccctaagcc tcctgctcgc cctgctctgt gccctgcgag ccaaggtgtg tggggcctcg      121 ggtcagttcg agttggagat cctgtccatg cagaacgtga acggggagct gcagaacggg      181 aactgctgcg gcggcgcccg gaacccggga gaccgcaagt gcacccgcga cgagtgtgac      241 acatacttca aagtgtgcct caaggagtat cagtcccgcg tcacggccgg ggggccctgc      301 agcttcggct cagggtccac gcctgtcatc gggggcaaca ccttcaacct caaggccagc      361 cgcggcaacg accgcaaccg catcgtgctg cctttcagtt tcgcctggcc gaggtcctat      421 acgttgcttg tggaggcgtg ggattccagt aatgacaccg ttcaacctga cagtattatt      481 gaaaaggctt ctcactcggg catgatcaac cccagccggc agtggcagac gctgaagcag      541 aacacgggcg ttgcccactt tgagtatcag atccgcgtga cctgtgatga ctactactat      601 ggctttggct gcaataagtt ctgccgcccc agagatgact tctttggaca ctatgcctgt      661 gaccagaatg gcaacaaaac ttgcatggaa ggctggatgg gccccgaatg taacagagct 721 atttgccgac aaggctgcag tcctaagcat gggtcttgca aactcccagg tgactgcagg 781 tgccagtatg gctggcaagg cctgtactgt gataagtgca tcccacaccc gggatgcgtc 841 cacggcatct gtaatgagcc ctggcagtgc ctctgtgaga ccaactgggg cggccagctc 901 tgtgacaaag atctcaatta ctgtgggact catcagccgt gtctcaacgg gggaacttgt 961 agcaacacag gccctgacaa atatcagtgt tcctgccctg aggggtattc aggacccaac 1021 tgtgaaattg ctgagcacgc ctgcctctct gatccctgtc acaacagagg cagctgtaag 1081 gagacctccc tgggctttga gtgtgagtgt tccccaggct ggaccggccc cacatgctct 1141 acaaacattg atgactgttc 
tcctaataac tgttcccacg ggggcacctg ccaggacctg 1201 gttaacggat ttaagtgtgt gtgcccccca cagtggactg ggaaaacgtg ccagttagat 1261 gcaaatgaat gtgaggccaa accttgtgta aacgccaaat cctgtaagaa tctcattgcc 1321 agctactact gcgactgtct tcccggctgg atgggtcaga attgtgacat aaatattaat 1381 gactgccttg gccagtgtca gaatgacgcc tcctgtcggg atttggttaa tggttatcgc 1441 tgtatctgtc cacctggcta tgcaggcgat cactgtgaga gagacatcga tgaatgtgcc 1501 agcaacccct gtttgaatgg gggtcactgt cagaatgaaa tcaacagatt ccagtgtctg 1561 tgtcccactg gtttctctgg aaacctctgt cagctggaca tcgattattg tgagcctaat 1621 ccctgccaga acggtgccca gtgctacaac cgtgccagtg actatttctg caagtgcccc 1681 gaggactatg agggcaagaa ctgctcacac ctgaaagacc actgccgcac gaccccctgt 1741 gaagtgattg acagctgcac agtggccatg gcttccaacg acacacctga aggggtgcgg 1801 tatatttcct ccaacgtctg tggtcctcac gggaagtgca agagtcagtc gggaggcaaa 1861 ttcacctgtg actgtaacaa aggcttcacg ggaacatact gccatgaaaa tattaatgac 1921 tgtgagagca acccttgtag aaacggtggc acttgcatcg atggtgtcaa ctcctacaag 1981 tgcatctgta gtgacggctg ggagggggcc tactgtgaaa ccaatattaa tgactgcagc 2041 cagaacccct gccacaatgg gggcacgtgt cgcgacctgg tcaatgactt ctactgtgac 2101 tgtaaaaatg ggtggaaagg aaagacctgc cactcacgtg acagtcagtg tgatgaggcc 2161 acgtgcaaca acggtggcac ctgctatgat gagggggatg cttttaagtg catgtgtcct 2221 ggcggctggg aaggaacaac ctgtaacata gcccgaaaca gtagctgcct gcccaacccc 2281 tgccataatg ggggcacatg tgtggtcaac ggcgagtcct ttacgtgcgt ctgcaaggaa 2341 ggctgggagg ggcccatctg tgctcagaat accaatgact gcagccctca tccctgttac 2401 aacagcggca cctgtgtgga tggagacaac tggtaccggt gcgaatgtgc cccgggtttt 2461 gctgggcccg actgcagaat aaacatcaat gaatgccagt cttcaccttg tgcctttgga 2521 gcgacctgtg tggatgagat caatggctac cggtgtgtct gccctccagg gcacagtggt 2581 gccaagtgcc aggaagtttc agggagacct tgcatcacca tggggagtgt gataccagat 2641 ggggccaaat gggatgatga ctgtaatacc tgccagtgcc tgaatggacg gatcgcctgc 2701 tcaaaggtct ggtgtggccc tcgaccttgc ctgctccaca aagggcacag cgagtgcccc 2761 agcgggcaga gctgcatccc catcctggac gaccagtgct tcgtccaccc ctgcactggt 2821 gtgggcgagt gtcggtcttc cagtctccag 
ccggtgaaga caaagtgcac ctctgactcc 2881 tattaccagg ataactgtgc gaacatcaca tttaccttta acaaggagat gatgtcacca 2941 ggtcttacta cggagcacat ttgcagtgaa ttgaggaatt tgaatatttt gaagaatgtt 3001 tccgctgaat attcaatcta catcgcttgc gagccttccc cttcagcgaa caatgaaata 3061 catgtggcca tttctgctga agatatacgg gatgatggga acccgatcaa ggaaatcact 3121 gacaaaataa tcgatcttgt tagtaaacgt gatggaaaca gctcgctgat tgctgccgtt 3181 gcagaagtaa gagttcagag gcggcctctg aagaacagaa cagatttcct tgttcccttg 3241 ctgagctctg tcttaactgt ggcttggatc tgttgcttgg tgacggcctt ctactggtgc 3301 ctgcggaagc ggcggaagcc gggcagccac acacactcag cctctgagga caacaccacc 3361 aacaacgtgc gggagcagct gaaccagatc aaaaacccca ttgagaaaca tggggccaac 3421 acggtcccca tcaaggatta cgagaacaag aactccaaaa tgtctaaaat aaggacacac 3481 aattctgaag tagaagagga cgacatggac aaacaccagc agaaagcccg gtttgccaag 3541 cagccggcgt acacgctggt agacagagaa gagaagcccc ccaacggcac gccgacaaaa 3601 cacccaaact ggacaaacaa acaggacaac agagacttgg aaagtgccca gagcttaaac 3661 cgaatggagt acatcgtata gcagaccgcg ggcactgccg ccgctaggta gagtctgagg 3721 gcttgtagtt ctttaaactg tcgtgtcata ctcgagtctg aggccgttgc tga