Difference between revisions of "Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-15"

From OpenWetWare
Jump to: navigation, search
(New page: This version of the full script pickles the output to a file in addition to printing. Below is code to read in and unpickle what was written (and then print that. A modified version of th...)
(No difference)

Latest revision as of 20:55, 15 April 2007

This version of the full script pickles the output to a file in addition to printing. Below is code to read in and unpickle what was written (and then print that. A modified version of this will be used with the webapp because each query will be searched against all previous queries to see if that specific query has been done before. if it's been done before, its pickled output will be in a file called outIDNUM.txt where IDNUM is the query number for the original query. this way we reduce query time we hope. we may add some database field to keep track of how many times a given sequence is queried.

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
from threading import Thread
import time, sys
import pickle

id = "1"
outputlist = []

# All the relevant output will be to a file.
output_file = file("out" + id + ".txt", 'w')

# C-style struct to pass parameters
class AllelicVariant:
# basic function to open fasta file and get sequence
def get_sequence_from_fasta(file):
	file_handle = open(file)
	records = SeqIO.parse(file_handle, format="fasta")
	record = records.next()
	return record.seq.data
# lookup blast snp data
def blast_snp_lookup(seq):
	result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq)
	return result_handle.read()
# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    rc = ""
    for node in node_list:
        if node.nodeType == node.TEXT_NODE:
            rc = rc + node.data
    return rc
# extracts snp data
def extract_snp_data(str):
	dom = parseString(str)
	variants = dom.getElementsByTagName("Hit")
	if len(variants) == 0:
	parsed = []
	for v in variants:
		# now populate the struct
		id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes)
		score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes)
		# only consider it a genuine snp if the hit score is above 100
		if int(score) > 100:
	return parsed
# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
	client = DBIdsClient.DBIdsClient()
	query = client.search(dnsnp_id, "omim")
	records = [i.efetch(rettype="xml") for i in query]
	return records
# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
	dom = parseString(str)
	variants = dom.getElementsByTagName("Mim-allelic-variant")
	if len(variants) == 0:
	parsed = []
	for v in variants:
		a = AllelicVariant() # create empty instance of struct
		# now populate the struct
		a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes)
		a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
		a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
	return parsed
# read sequence from file
sequence = get_sequence_from_fasta("example.fasta")
print "Sequence received; please wait."
# look up data
b = blast_snp_lookup(sequence)
# extract data
e = extract_snp_data(b)
outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n")
if len(e) > 0:
	for snp_id in e:
		outputlist.append(snp_id + "\n")
	sys.exit() # nothing more to be done if no snp found
#print '-' * 40
#print "\n"
print "got a snp... still thinking..."

# do an omim search on each snp
for snp_id in e:
	o = omim_snp_search(snp_id)
	if len(o) == 0:
		outputlist.append("No information found for " + snp_id + "\n")
		continue # nothing more to be done if no records can be found
	# otherwise, find the allelic variant data
	outputlist.append(snp_id + " details:" + "\n")
	for i in o:
		v = extract_allelic_variant_data(i.read())
		if v != None:
			for a in v:
				outputlist.append(a.name + "\n")
				outputlist.append(a.mutation + "\n")
				outputlist.append(a.description + "\n")
	#print '-' * 40
	#print "\n"
print "yay! we're done!"
for item in outputlist:
	print item
pickle.dump(outputlist, output_file)
print "wrote pickled version to file"


Read Pickled Input

import pickle
import sys

id = "1" 
fin = open("out" + id + ".txt", "r")
text = pickle.load(fin)

for item in text:
    print item