Harvard:Biophysics 101/Notebook:ZS/2007-5-2


 * 1) Zachary Sun, Biophysics 101
 * 2) 5.03.07 code
 * 3) Tackling the problem of finding surrounding CDS's and information about a reference sequence from blast,
 * 4) this program parses the HTML output of blastn as of 4.26.07 for information which can then be fed
 * 5) into other NCBI programs to be programmed,
 * 6) Saves a local file; to stop queuing the web, just comment the code with pound symbols at end.


 * 1) update from v1: fixed some bug issues w/output, added the code in CDS option and additional parsing for information
 * 2) to be fed into polyphen as currently there is no working local database option.
 * 3) Note: this code has 2 cases:
 * 4) 1) The seqence is in a CDS
 * 5)   *It outputs the CDS it is in, the gene asc. #, and the CDS sequence itself
 * 6) 2) The sequence is not in a CDS, but surrounded by CDSes
 * 7)   *It outputs the CDS location of the surrounding sequence and the gene name of surrounding sequence, and mutation site
 * 8) Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen
 * 1) Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen

from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.Blast import NCBIXML import xml.sax.handler from xml.dom import minidom

seq = "ATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTC"
 * 1) class seq (macular degen.) on first line, other CDS seq on second
 * 2) seq = "CACCCTCGCCAGTTACGAGCTGCCGAGCCGCTTCCTAGGCTCTCTGCGAATACGGACACGCATGCCACCCACAACAACTTTTTAAAAGAATCAGACGTGTGAAGGATTCTATTCGAATTACTTCTGCTCTCTGCTTTTATCACTTCACTGTGGGTCTGGGCGCGGGCTTTCTGCCAGCTCCGCGGACGCTGCCTTCGTCCRGCCGCAGAGGCCCCGCGGTCAGGGTCCCGCGTGCGGGGTACCGGGGGCAGAACCAGCGCGTGACCGGGGTCCGCGGTGCCGCAACGCCCCGGGTCTGCGCAGAGGCCCCTGCAGTCCCTGCCCGGCCCAGTCCGAGCTTCCCGGGCGGGCCCCCAGTCCGGCGATTTGCAGGAACTTTCCCCGGCGCTCCCACGCGAAGC"
 * 1) the above is a user-defined variable; put in your sequence to be Blast against!

class RecInfo: def __init__(self, a, b, c, d, e, f, g): self.surroundStart = a       self.surroundEnd = b        self.refStart = c        self.refEnd = d        self.hits = e        self.name = f        self.title = g        self.inCDS = "?"

def printInfo(self): print "INFORMATION:" print "---" print "Gene name: ", self.name print "Location: ", self.refStart, "->", self.refEnd print "Hit qualification: ", self.hits print "Surrounding gene information:" print "" for (sStart, sEnd, n) in zip(self.surroundStart, self.surroundEnd, self.title): print n, ": ", sStart, "->", sEnd

result_handle = NCBIWWW.qblast("blastn","Test/gpipe/9606/allcontig_and_rna", seq, format_type="HTML") #the db is from the blast homepage (other db's dont give surrounding info) # blast_results = result_handle.read#

save_file = open("my_blast2.html","w")# save_file.write(blast_results)# save_file.close#

load_file = open("my_blast2.html","r") line = load_file.readline

surroundStart = [] surroundEnd = [] title = []

CDS = "false" CDS_ascID = "NA" temp = "" temp1 = ""

while(line): findLine = line.find("spanning the HSP") #to find the sequence I   if findLine != -1: name = line[findLine+23:-1] findLine = line.find("PREDICTED") if findLine != -1: #if part of a CDS end = line.find("Length") temp = line[findLine-30:findLine] temp = temp.split("value=") #print temp temp1 = temp[len(temp)-1] temp1 = temp1[1:-1] #print temp1 CDS_ascID = temp1 # this is the gene ascension number CDS = "true" break findLine = line.find("Features flanking this part of subject sequence:") if findLine != -1: #found features flanking CDS = "false" line = load_file.readline while line.find("&from=") != -1: #while there is still a feature w = line.find("&from=") #index of start site x = line.find("&to=") #index of mid y = line.find("&view=gbwithparts") #index of end surroundStart.append(int(line[w+6:x])) surroundEnd.append(int(line[x+4:y])) w = line.find("gbwithparts") #index of gene title x = line.find("") #index of end of title title.append(line[w+13:x]) line = load_file.readline

line = load_file.readline #manual parse line = load_file.readline #manual parse hits = line

seqFirstFound = "false" #if first part found

while line.find("input type") == -1:#while not done with sequence if line.find("Sbjct") != -1: #if begin if(seqFirstFound == "false"): #if first sequence temp = line.split(" ") #split by space refStart = temp[2] seqFirstFound = "true" else: #find end sequence temp = line[:-1]#delete /n temp = temp.split(" ") refEnd = temp[6] line = load_file.readline

break line = load_file.readline

if CDS == "false": record = RecInfo(surroundStart, surroundEnd, refStart, refEnd, hits, name, title) record.printInfo else: print "The snippit is a CDS:" print "Name: ", name print "Asc. Number: ", CDS_ascID # We can create a GenBank object that will parse a raw record # This facilitates extracting specific information from the sequences record_parser = GenBank.FeatureParser

# NCBIDictionary is an interface to Genbank ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser)

# If you pass NCBIDictionary a GenBank id, it will download that record parsed_record = ncbi_dict[CDS_ascID]

# Extract the sequence from the parsed_record s = parsed_record.seq.tostring print "Sequence: ", s

OUTPUT:

CASE 1: The sequence is surrounded by a CDS:

INFORMATION: --- Gene name: Homo sapiens chromosome 10 genomic contig, reference assembly Location: 42968870 -> 42969270 Hit qualification:  Identities = 400/401 (99%), Gaps = 0/401 (0%)

Surrounding gene information:

3895 bp at 5' side: hypothetical protein : 42962770 -> 42964975 425 bp at 3' side: HtrA serine peptidase 1 : 42969695 -> 43022401

CASE 2: The sequence is in a CDS:

The snippit is a CDS: Name: Homo sapiens chromosome 10 genomic contig, reference assembly Asc. Number: XM_001131263 Sequence: GAGATGGCAGCTGGCTTGGCAAGGGGACAGCACCTTTGTCACCACATTATGTCCCTGTACCCTACATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTCAGCACCACCTGACACTGTCTATCATCCACACTGCAGCAAGGTGATTCTGCCAAAACATATCTCCTTAAAAGCCAACTGGAGCTTCTCATCAGCATCAATGTGAAGCCAAAAATCCTTAGGAGGACAGAGGGAGTCCCTCACAACCTAGACTGGTCCCCTTCCCTCCAGCTGCCTCAACTGTCCACAGGACTCTCTTCCCACCTGCGGCCACACTGTGCAACCTGGAATTTCCCCACCTGGGCGGACTCATCACGTCATCACCAATTGGATGCATCTTCTGCTCTGTGCAGCTGGTGAAATCTTTCTCAACCCTTGAGATGCAGCCCAATCTTCTCCTAACATCTGGATTCCTCTCTGTCACTGCATTCCCTCCTGTCATCCTGCCTTTGTTTTCTTGCCCTCCTTTCTCTCCCGGGTGATAGGCATTAACTAAAATTAAATAAAAATTCAGATCATCCTTGCA