Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-15

This version of the full script pickles the output to a file in addition to printing. Below is code to read in and unpickle what was written (and then print that. A modified version of this will be used with the webapp because each query will be searched against all previous queries to see if that specific query has been done before. if it's been done before, its pickled output will be in a file called outIDNUM.txt where IDNUM is the query number for the original query.  this way we reduce query time we hope.  we may add some database field to keep track of how many times a given sequence is queried.

from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle

id = "1" outputlist = []

output_file = file("out" + id + ".txt", 'w')
 * 1) All the relevant output will be to a file.

class AllelicVariant: pass def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next return record.seq.data def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc def extract_snp_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) # only consider it a genuine snp if the hit score is above 100 if int(score) > 100: parsed.append(id) return parsed def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait." b = blast_snp_lookup(sequence) e = extract_snp_data(b) outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e:		outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") sys.exit # nothing more to be done if no snp found print "got a snp... still thinking..."
 * 1) C-style struct to pass parameters
 * 1) basic function to open fasta file and get sequence
 * 1) lookup blast snp data
 * 1) basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
 * 1) extracts snp data
 * 1) queries the database and returns all info in an XML format
 * 1) extracts allelic variant data, as the name implies, using the struct above
 * 1) BEGIN ACTUAL PROGRAM
 * 2) read sequence from file
 * 1) look up data
 * 1) extract data
 * 1) print '-' * 40
 * 2) print "\n"

for snp_id in e:	o = omim_snp_search(snp_id) if len(o) == 0: outputlist.append("No information found for " + snp_id + "\n") continue # nothing more to be done if no records can be found # otherwise, find the allelic variant data outputlist.append(snp_id + " details:" + "\n") for i in o:		v = extract_allelic_variant_data(i.read) if v != None: for a in v:				outputlist.append(a.name + "\n") outputlist.append(a.mutation + "\n") outputlist.append(a.description + "\n") #print '-' * 40 #print "\n" print "yay! we're done!" for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file"
 * 1) do an omim search on each snp

output_file.close

Read Pickled Input import pickle import sys

id = "1" fin = open("out" + id + ".txt", "r") text = pickle.load(fin)

for item in text: print item