Harvard:Biophysics 101/Notebook:ZS/2007-5-2
From OpenWetWare
Jump to navigationJump to search
#Zachary Sun, Biophysics 101
#5.03.07 code
#Tackling the problem of finding surrounding CDS's and information about a reference sequence from blast,
#this program parses the HTML output of blastn as of 4.26.07 for information which can then be fed
#into other NCBI programs to be programmed,
#Saves a local file; to stop queuing the web, just comment the code with pound symbols at end.
#update from v1: fixed some bug issues w/output, added the code in CDS option and additional parsing for information
#to be fed into polyphen as currently there is no working local database option.
#Note: this code has 2 cases:
#1) The seqence is in a CDS
# *It outputs the CDS it is in, the gene asc. #, and the CDS sequence itself
#2) The sequence is not in a CDS, but surrounded by CDSes
# *It outputs the CDS location of the surrounding sequence and the gene name of surrounding sequence, and mutation site
#
#Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import xml.sax.handler
from xml.dom import minidom
#class seq (macular degen.) on first line, other CDS seq on second
#seq = "CACCCTCGCCAGTTACGAGCTGCCGAGCCGCTTCCTAGGCTCTCTGCGAATACGGACACGCATGCCACCCACAACAACTTTTTAAAAGAATCAGACGTGTGAAGGATTCTATTCGAATTACTTCTGCTCTCTGCTTTTATCACTTCACTGTGGGTCTGGGCGCGGGCTTTCTGCCAGCTCCGCGGACGCTGCCTTCGTCCRGCCGCAGAGGCCCCGCGGTCAGGGTCCCGCGTGCGGGGTACCGGGGGCAGAACCAGCGCGTGACCGGGGTCCGCGGTGCCGCAACGCCCCGGGTCTGCGCAGAGGCCCCTGCAGTCCCTGCCCGGCCCAGTCCGAGCTTCCCGGGCGGGCCCCCAGTCCGGCGATTTGCAGGAACTTTCCCCGGCGCTCCCACGCGAAGC"
seq = "ATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTC"
# the above is a user-defined variable; put in your sequence to be Blast against!
class RecInfo():
def __init__(self, a, b, c, d, e, f, g):
self.surroundStart = a
self.surroundEnd = b
self.refStart = c
self.refEnd = d
self.hits = e
self.name = f
self.title = g
self.inCDS = "?"
def printInfo(self):
print "INFORMATION:"
print "-----------"
print "Gene name: ", self.name
print "Location: " , self.refStart, "->", self.refEnd
print "Hit qualification: ", self.hits
print "Surrounding gene information:"
print "----------------------------"
for (sStart, sEnd, n) in zip(self.surroundStart, self.surroundEnd, self.title):
print n, ": " , sStart, "->", sEnd
result_handle = NCBIWWW.qblast("blastn","Test/gpipe/9606/allcontig_and_rna", seq, format_type="HTML") #the db is from the blast homepage (other db's dont give surrounding info) #
blast_results = result_handle.read()#
save_file = open("my_blast2.html","w")#
save_file.write(blast_results)#
save_file.close()#
load_file = open("my_blast2.html","r")
line = load_file.readline()
surroundStart = []
surroundEnd = []
title = []
CDS = "false"
CDS_ascID = "NA"
temp = ""
temp1 = ""
while(line):
findLine = line.find("spanning the HSP") #to find the sequence I
if findLine != -1:
name = line[findLine+23:-1]
findLine = line.find("PREDICTED")
if findLine != -1: #if part of a CDS
end = line.find("Length")
temp = line[findLine-30:findLine]
temp = temp.split("value=")
#print temp
temp1 = temp[len(temp)-1]
temp1 = temp1[1:-1]
#print temp1
CDS_ascID = temp1 # this is the gene ascension number
CDS = "true"
break
findLine = line.find("Features flanking this part of subject sequence:")
if findLine != -1: #found features flanking
CDS = "false"
line = load_file.readline()
while line.find("&from=") != -1: #while there is still a feature
w = line.find("&from=") #index of start site
x = line.find("&to=") #index of mid
y = line.find("&view=gbwithparts") #index of end
surroundStart.append(int(line[w+6:x]))
surroundEnd.append(int(line[x+4:y]))
w = line.find("gbwithparts") #index of gene title
x = line.find("</a>") #index of end of title
title.append(line[w+13:x])
line = load_file.readline()
line = load_file.readline() #manual parse
line = load_file.readline() #manual parse
hits = line
seqFirstFound = "false" #if first part found
while line.find("input type") == -1:#while not done with sequence
if line.find("Sbjct") != -1: #if begin
if(seqFirstFound == "false"): #if first sequence
temp = line.split(" ") #split by space
refStart = temp[2]
seqFirstFound = "true"
else: #find end sequence
temp = line[:-1]#delete /n
temp = temp.split(" ")
refEnd = temp[6]
line = load_file.readline()
break
line = load_file.readline()
if CDS == "false":
record = RecInfo(surroundStart, surroundEnd, refStart, refEnd, hits, name, title)
record.printInfo()
else:
print "The snippit is a CDS:"
print "Name: ", name
print "Asc. Number: ", CDS_ascID
# We can create a GenBank object that will parse a raw record
# This facilitates extracting specific information from the sequences
record_parser = GenBank.FeatureParser()
# NCBIDictionary is an interface to Genbank
ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)
# If you pass NCBIDictionary a GenBank id, it will download that record
parsed_record = ncbi_dict[CDS_ascID]
# Extract the sequence from the parsed_record
s = parsed_record.seq.tostring()
print "Sequence: ", s
OUTPUT:
CASE 1: The sequence is surrounded by a CDS:
INFORMATION: ----------- Gene name: Homo sapiens chromosome 10 genomic contig, reference assembly Location: 42968870 -> 42969270 Hit qualification: Identities = 400/401 (99%), Gaps = 0/401 (0%) Surrounding gene information: ---------------------------- 3895 bp at 5' side: hypothetical protein : 42962770 -> 42964975 425 bp at 3' side: HtrA serine peptidase 1 : 42969695 -> 43022401
CASE 2: The sequence is in a CDS:
The snippit is a CDS: Name: Homo sapiens chromosome 10 genomic contig, reference assembly Asc. Number: XM_001131263 Sequence: GAGATGGCAGCTGGCTTGGCAAGGGGACAGCACCTTTGTCACCACATTATGTCCCTGTACCCTACATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTCAGCACCACCTGACACTGTCTATCATCCACACTGCAGCAAGGTGATTCTGCCAAAACATATCTCCTTAAAAGCCAACTGGAGCTTCTCATCAGCATCAATGTGAAGCCAAAAATCCTTAGGAGGACAGAGGGAGTCCCTCACAACCTAGACTGGTCCCCTTCCCTCCAGCTGCCTCAACTGTCCACAGGACTCTCTTCCCACCTGCGGCCACACTGTGCAACCTGGAATTTCCCCACCTGGGCGGACTCATCACGTCATCACCAATTGGATGCATCTTCTGCTCTGTGCAGCTGGTGAAATCTTTCTCAACCCTTGAGATGCAGCCCAATCTTCTCCTAACATCTGGATTCCTCTCTGTCACTGCATTCCCTCCTGTCATCCTGCCTTTGTTTTCTTGCCCTCCTTTCTCTCCCGGGTGATAGGCATTAACTAAAATTAAATAAAAATTCAGATCATCCTTGCA