Harvard:Biophysics 101/Notebook:ZS/2007-5-2

From OpenWetWare
Jump to navigationJump to search
#Zachary Sun, Biophysics 101
#5.03.07 code
#Tackling the problem of finding surrounding CDS's and information about a reference sequence from blast,
#this program parses the HTML output of blastn as of 4.26.07 for information which can then be fed
#into other NCBI programs to be programmed,
#Saves a local file; to stop queuing the web, just comment the code with pound symbols at end.

#update from v1: fixed some bug issues w/output, added the code in CDS option and additional parsing for information
#to be fed into polyphen as currently there is no working local database option.
#Note: this code has 2 cases:
#1) The seqence is in a CDS
#   *It outputs the CDS it is in, the gene asc. #, and the CDS sequence itself
#2) The sequence is not in a CDS, but surrounded by CDSes
#   *It outputs the CDS location of the surrounding sequence and the gene name of surrounding sequence, and mutation site
#
#Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import xml.sax.handler
from xml.dom import minidom

#class seq (macular degen.) on first line, other CDS seq on second
#seq = "CACCCTCGCCAGTTACGAGCTGCCGAGCCGCTTCCTAGGCTCTCTGCGAATACGGACACGCATGCCACCCACAACAACTTTTTAAAAGAATCAGACGTGTGAAGGATTCTATTCGAATTACTTCTGCTCTCTGCTTTTATCACTTCACTGTGGGTCTGGGCGCGGGCTTTCTGCCAGCTCCGCGGACGCTGCCTTCGTCCRGCCGCAGAGGCCCCGCGGTCAGGGTCCCGCGTGCGGGGTACCGGGGGCAGAACCAGCGCGTGACCGGGGTCCGCGGTGCCGCAACGCCCCGGGTCTGCGCAGAGGCCCCTGCAGTCCCTGCCCGGCCCAGTCCGAGCTTCCCGGGCGGGCCCCCAGTCCGGCGATTTGCAGGAACTTTCCCCGGCGCTCCCACGCGAAGC"
seq = "ATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTC"
# the above is a user-defined variable; put in your sequence to be Blast against!

class RecInfo():
    def __init__(self, a, b, c, d, e, f, g):
        self.surroundStart = a
        self.surroundEnd = b
        self.refStart = c
        self.refEnd = d
        self.hits = e
        self.name = f
        self.title = g
        self.inCDS = "?"

    def printInfo(self):
        print "INFORMATION:"
        print "-----------"
        print "Gene name: ", self.name
        print "Location: " , self.refStart, "->", self.refEnd
        print "Hit qualification: ", self.hits
        print "Surrounding gene information:"
        print "----------------------------"
        for (sStart, sEnd, n) in zip(self.surroundStart, self.surroundEnd, self.title):
            print n, ": " , sStart, "->", sEnd
            

result_handle = NCBIWWW.qblast("blastn","Test/gpipe/9606/allcontig_and_rna", seq, format_type="HTML") #the db is from the blast homepage (other db's dont give surrounding info) #
blast_results = result_handle.read()#

save_file = open("my_blast2.html","w")#
save_file.write(blast_results)#
save_file.close()#

load_file = open("my_blast2.html","r")
line = load_file.readline()

surroundStart = []
surroundEnd = []
title = []

CDS = "false"
CDS_ascID = "NA"
temp = ""
temp1 = ""

while(line):
    findLine = line.find("spanning the HSP") #to find the sequence I
    if findLine != -1:
        name = line[findLine+23:-1]
    findLine = line.find("PREDICTED")
    if findLine != -1: #if part of a CDS
        end = line.find("Length")
        temp = line[findLine-30:findLine]
        temp = temp.split("value=")
        #print temp
        temp1 = temp[len(temp)-1]
        temp1 = temp1[1:-1]
        #print temp1 
        CDS_ascID = temp1 # this is the gene ascension number
        CDS = "true"
        break
    findLine = line.find("Features flanking this part of subject sequence:")
    if findLine != -1: #found features flanking
        CDS = "false"
        line = load_file.readline()
        while line.find("&from=") != -1: #while there is still a feature
            w = line.find("&from=") #index of start site
            x = line.find("&to=") #index of mid
            y = line.find("&view=gbwithparts") #index of end
            surroundStart.append(int(line[w+6:x]))
            surroundEnd.append(int(line[x+4:y]))
            w = line.find("gbwithparts") #index of gene title
            x = line.find("</a>") #index of end of title
            title.append(line[w+13:x])
            line = load_file.readline()

        line = load_file.readline() #manual parse
        line = load_file.readline() #manual parse
        hits = line

        seqFirstFound = "false" #if first part found

        while line.find("input type") == -1:#while not done with sequence
            if line.find("Sbjct") != -1: #if begin
                if(seqFirstFound == "false"): #if first sequence
                    temp = line.split(" ") #split by space
                    refStart = temp[2]
                    seqFirstFound = "true"
                else: #find end sequence
                    temp = line[:-1]#delete /n
                    temp = temp.split(" ")
                    refEnd = temp[6]
            line = load_file.readline()

        break
    line = load_file.readline()


if CDS == "false":
    record = RecInfo(surroundStart, surroundEnd, refStart, refEnd, hits, name, title) 
    record.printInfo()
else:
    print "The snippit is a CDS:"
    print "Name: ", name
    print "Asc. Number: ", CDS_ascID
        
    # We can create a GenBank object that will parse a raw record
    # This facilitates extracting specific information from the sequences
    record_parser = GenBank.FeatureParser()

    # NCBIDictionary is an interface to Genbank
    ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)

    # If you pass NCBIDictionary a GenBank id, it will download that record
    parsed_record = ncbi_dict[CDS_ascID]

    # Extract the sequence from the parsed_record
    s = parsed_record.seq.tostring()
    print "Sequence: ", s
  

OUTPUT:

CASE 1: The sequence is surrounded by a CDS:

INFORMATION:
-----------
Gene name:  Homo sapiens chromosome 10 genomic contig, reference assembly
Location:  42968870 -> 42969270
Hit qualification:   Identities = 400/401 (99%), Gaps = 0/401 (0%)

Surrounding gene information:
----------------------------
3895 bp at 5' side: hypothetical protein :  42962770 -> 42964975
425 bp at 3' side: HtrA serine peptidase 1 :  42969695 -> 43022401

CASE 2: The sequence is in a CDS:

The snippit is a CDS:
Name:  Homo sapiens chromosome 10 genomic contig, reference assembly
Asc. Number:  XM_001131263
Sequence:  GAGATGGCAGCTGGCTTGGCAAGGGGACAGCACCTTTGTCACCACATTATGTCCCTGTACCCTACATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTCAGCACCACCTGACACTGTCTATCATCCACACTGCAGCAAGGTGATTCTGCCAAAACATATCTCCTTAAAAGCCAACTGGAGCTTCTCATCAGCATCAATGTGAAGCCAAAAATCCTTAGGAGGACAGAGGGAGTCCCTCACAACCTAGACTGGTCCCCTTCCCTCCAGCTGCCTCAACTGTCCACAGGACTCTCTTCCCACCTGCGGCCACACTGTGCAACCTGGAATTTCCCCACCTGGGCGGACTCATCACGTCATCACCAATTGGATGCATCTTCTGCTCTGTGCAGCTGGTGAAATCTTTCTCAACCCTTGAGATGCAGCCCAATCTTCTCCTAACATCTGGATTCCTCTCTGTCACTGCATTCCCTCCTGTCATCCTGCCTTTGTTTTCTTGCCCTCCTTTCTCTCCCGGGTGATAGGCATTAACTAAAATTAAATAAAAATTCAGATCATCCTTGCA