Harvard:Biophysics 101/Notebook:ZS/2007-5-2
From OpenWetWare
Jump to navigationJump to search
#Zachary Sun, Biophysics 101 #5.03.07 code #Tackling the problem of finding surrounding CDS's and information about a reference sequence from blast, #this program parses the HTML output of blastn as of 4.26.07 for information which can then be fed #into other NCBI programs to be programmed, #Saves a local file; to stop queuing the web, just comment the code with pound symbols at end. #update from v1: fixed some bug issues w/output, added the code in CDS option and additional parsing for information #to be fed into polyphen as currently there is no working local database option. #Note: this code has 2 cases: #1) The seqence is in a CDS # *It outputs the CDS it is in, the gene asc. #, and the CDS sequence itself #2) The sequence is not in a CDS, but surrounded by CDSes # *It outputs the CDS location of the surrounding sequence and the gene name of surrounding sequence, and mutation site # #Note: Any other data can be found for the gene asc. #, eg. protein sequences which go into polyphen from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.Blast import NCBIXML import xml.sax.handler from xml.dom import minidom #class seq (macular degen.) on first line, other CDS seq on second #seq = "CACCCTCGCCAGTTACGAGCTGCCGAGCCGCTTCCTAGGCTCTCTGCGAATACGGACACGCATGCCACCCACAACAACTTTTTAAAAGAATCAGACGTGTGAAGGATTCTATTCGAATTACTTCTGCTCTCTGCTTTTATCACTTCACTGTGGGTCTGGGCGCGGGCTTTCTGCCAGCTCCGCGGACGCTGCCTTCGTCCRGCCGCAGAGGCCCCGCGGTCAGGGTCCCGCGTGCGGGGTACCGGGGGCAGAACCAGCGCGTGACCGGGGTCCGCGGTGCCGCAACGCCCCGGGTCTGCGCAGAGGCCCCTGCAGTCCCTGCCCGGCCCAGTCCGAGCTTCCCGGGCGGGCCCCCAGTCCGGCGATTTGCAGGAACTTTCCCCGGCGCTCCCACGCGAAGC" seq = "ATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTC" # the above is a user-defined variable; put in your sequence to be Blast against! class RecInfo(): def __init__(self, a, b, c, d, e, f, g): self.surroundStart = a self.surroundEnd = b self.refStart = c self.refEnd = d self.hits = e self.name = f self.title = g self.inCDS = "?" def printInfo(self): print "INFORMATION:" print "-----------" print "Gene name: ", self.name print "Location: " , self.refStart, "->", self.refEnd print "Hit qualification: ", self.hits print "Surrounding gene information:" print "----------------------------" for (sStart, sEnd, n) in zip(self.surroundStart, self.surroundEnd, self.title): print n, ": " , sStart, "->", sEnd result_handle = NCBIWWW.qblast("blastn","Test/gpipe/9606/allcontig_and_rna", seq, format_type="HTML") #the db is from the blast homepage (other db's dont give surrounding info) # blast_results = result_handle.read()# save_file = open("my_blast2.html","w")# save_file.write(blast_results)# save_file.close()# load_file = open("my_blast2.html","r") line = load_file.readline() surroundStart = [] surroundEnd = [] title = [] CDS = "false" CDS_ascID = "NA" temp = "" temp1 = "" while(line): findLine = line.find("spanning the HSP") #to find the sequence I if findLine != -1: name = line[findLine+23:-1] findLine = line.find("PREDICTED") if findLine != -1: #if part of a CDS end = line.find("Length") temp = line[findLine-30:findLine] temp = temp.split("value=") #print temp temp1 = temp[len(temp)-1] temp1 = temp1[1:-1] #print temp1 CDS_ascID = temp1 # this is the gene ascension number CDS = "true" break findLine = line.find("Features flanking this part of subject sequence:") if findLine != -1: #found features flanking CDS = "false" line = load_file.readline() while line.find("&from=") != -1: #while there is still a feature w = line.find("&from=") #index of start site x = line.find("&to=") #index of mid y = line.find("&view=gbwithparts") #index of end surroundStart.append(int(line[w+6:x])) surroundEnd.append(int(line[x+4:y])) w = line.find("gbwithparts") #index of gene title x = line.find("</a>") #index of end of title title.append(line[w+13:x]) line = load_file.readline() line = load_file.readline() #manual parse line = load_file.readline() #manual parse hits = line seqFirstFound = "false" #if first part found while line.find("input type") == -1:#while not done with sequence if line.find("Sbjct") != -1: #if begin if(seqFirstFound == "false"): #if first sequence temp = line.split(" ") #split by space refStart = temp[2] seqFirstFound = "true" else: #find end sequence temp = line[:-1]#delete /n temp = temp.split(" ") refEnd = temp[6] line = load_file.readline() break line = load_file.readline() if CDS == "false": record = RecInfo(surroundStart, surroundEnd, refStart, refEnd, hits, name, title) record.printInfo() else: print "The snippit is a CDS:" print "Name: ", name print "Asc. Number: ", CDS_ascID # We can create a GenBank object that will parse a raw record # This facilitates extracting specific information from the sequences record_parser = GenBank.FeatureParser() # NCBIDictionary is an interface to Genbank ncbi_dict = GenBank.NCBIDictionary('nucleotide', 'genbank', parser = record_parser) # If you pass NCBIDictionary a GenBank id, it will download that record parsed_record = ncbi_dict[CDS_ascID] # Extract the sequence from the parsed_record s = parsed_record.seq.tostring() print "Sequence: ", s
OUTPUT:
CASE 1: The sequence is surrounded by a CDS:
INFORMATION: ----------- Gene name: Homo sapiens chromosome 10 genomic contig, reference assembly Location: 42968870 -> 42969270 Hit qualification: Identities = 400/401 (99%), Gaps = 0/401 (0%) Surrounding gene information: ---------------------------- 3895 bp at 5' side: hypothetical protein : 42962770 -> 42964975 425 bp at 3' side: HtrA serine peptidase 1 : 42969695 -> 43022401
CASE 2: The sequence is in a CDS:
The snippit is a CDS: Name: Homo sapiens chromosome 10 genomic contig, reference assembly Asc. Number: XM_001131263 Sequence: GAGATGGCAGCTGGCTTGGCAAGGGGACAGCACCTTTGTCACCACATTATGTCCCTGTACCCTACATGCTGCGCCTATACCCAGGACCGATGGTAACTGAGGCGGAGGGGAAAGGAGGGCCTGAGATGGCAAGTCTGTCCTCCTCGGTGGTTCCTGTGTCCTTCATTTCCACTCTGCGAGAGTCTGTGCTGGACCCTGGAGTTGGTGGAGAAGGAGCCAGTGACAAGCAGAGGAGCAAACTGTCTTTATCACACTCCATGATCCCAGCTGCTAAAATCCACACTGAGCTCTGCTTACCAGCCTTCTTCTCTCCTGCTGGAACCCAGAGGAGGTTCCAGCAGCCTCAGCACCACCTGACACTGTCTATCATCCACACTGCAGCAAGGTGATTCTGCCAAAACATATCTCCTTAAAAGCCAACTGGAGCTTCTCATCAGCATCAATGTGAAGCCAAAAATCCTTAGGAGGACAGAGGGAGTCCCTCACAACCTAGACTGGTCCCCTTCCCTCCAGCTGCCTCAACTGTCCACAGGACTCTCTTCCCACCTGCGGCCACACTGTGCAACCTGGAATTTCCCCACCTGGGCGGACTCATCACGTCATCACCAATTGGATGCATCTTCTGCTCTGTGCAGCTGGTGAAATCTTTCTCAACCCTTGAGATGCAGCCCAATCTTCTCCTAACATCTGGATTCCTCTCTGTCACTGCATTCCCTCCTGTCATCCTGCCTTTGTTTTCTTGCCCTCCTTTCTCTCCCGGGTGATAGGCATTAACTAAAATTAAATAAAAATTCAGATCATCCTTGCA