Difference between revisions of "Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-15"
(New page: This version of the full script pickles the output to a file in addition to printing. Below is code to read in and unpickle what was written (and then print that. A modified version of th...)
Latest revision as of 20:55, 15 April 2007
This version of the full script pickles the output to a file in addition to printing. Below is code to read in and unpickle what was written (and then print that. A modified version of this will be used with the webapp because each query will be searched against all previous queries to see if that specific query has been done before. if it's been done before, its pickled output will be in a file called outIDNUM.txt where IDNUM is the query number for the original query. this way we reduce query time we hope. we may add some database field to keep track of how many times a given sequence is queried.
from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle id = "1" outputlist =  # All the relevant output will be to a file. output_file = file("out" + id + ".txt", 'w') # C-style struct to pass parameters class AllelicVariant: pass # basic function to open fasta file and get sequence def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data # lookup blast snp data def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read() # basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc # extracts snp data def extract_snp_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed =  for v in variants: # now populate the struct id = get_text(v.getElementsByTagName("Hit_accession").childNodes) score = get_text(v.getElementsByTagName("Hsp_score").childNodes) # only consider it a genuine snp if the hit score is above 100 if int(score) > 100: parsed.append(id) return parsed # queries the database and returns all info in an XML format def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records # extracts allelic variant data, as the name implies, using the struct above def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed =  for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name").childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation").getElementsByTagName("Mim-text_text").childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description").getElementsByTagName("Mim-text_text").childNodes) parsed.append(a) return parsed # BEGIN ACTUAL PROGRAM # read sequence from file sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait." # look up data b = blast_snp_lookup(sequence) # extract data e = extract_snp_data(b) outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e: outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") sys.exit() # nothing more to be done if no snp found #print '-' * 40 #print "\n" print "got a snp... still thinking..." # do an omim search on each snp for snp_id in e: o = omim_snp_search(snp_id) if len(o) == 0: outputlist.append("No information found for " + snp_id + "\n") continue # nothing more to be done if no records can be found # otherwise, find the allelic variant data outputlist.append(snp_id + " details:" + "\n") for i in o: v = extract_allelic_variant_data(i.read()) if v != None: for a in v: outputlist.append(a.name + "\n") outputlist.append(a.mutation + "\n") outputlist.append(a.description + "\n") #print '-' * 40 #print "\n" print "yay! we're done!" for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file" output_file.close()
Read Pickled Input
import pickle import sys id = "1" fin = open("out" + id + ".txt", "r") text = pickle.load(fin) for item in text: print item