Harvard:Biophysics 101/2007/Notebook:Kaull/2007-5-1

From OpenWetWare
Jump to: navigation, search

Tuesday Finish BLAST SNP -> OMIM workaround via locus ID

-> This is done - see code below.

Thursday Be able to verify SNP is the correct one.

Here are my thoughts on the matter:

  1. Search OMIM directly for the SNP. This should give the one-hit wonders, where the SNP is the only known cause of the trait - at least that's how I understand it, after poking about on OMIM. Return the info paragraph they give for the disease. (This much we have.)
  2. If no luck, use locus ID. Search for the rs# in the "Allelic Variants" section of the results. If there's a direct hit, return the relevant paragraph and the basic info about the disease.
  3. If still nothing, but there are results for the locus ID - maybe just return those results. Perhaps with a ranking of how relevant they are to the SNP - synonymous mutant or no, frameshift or no, location, etc.

Anyway. Code.

from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
from xml.dom import minidom
from threading import Thread
import time, sys
import pickle
import urllib

outputlist = []

# C-style struct to pass parameters
class AllelicVariant:

# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    rc = ""
    for node in node_list:
        if node.nodeType == node.TEXT_NODE:
            rc = rc + node.data
    return rc

# queries the database and returns all info in an XML format
def omim_tag_search(tag_id):
	client = DBIdsClient.DBIdsClient()
	query = client.search(tag_id, "omim")
	records = [i.efetch(rettype="text") for i in query]
	return records
# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
	dom = parseString(str)
	variants = dom.getElementsByTagName("Mim-allelic-variant")
	if len(variants) == 0:
	parsed = []
	for v in variants:
		a = AllelicVariant() # create empty instance of struct
		# now populate the struct
		a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes)
		a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
		a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes)
	return parsed

def parseSNP(snp_id):
    SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp'
    url = SNP_URL + '&id=' + snp_id + '&mode=xml'
    dom = minidom.parse(urllib.urlopen(url))
    symbol = dom.getElementsByTagName("FxnSet_symbol")
    return symbol[0].toxml().split('>')[1].split('<')[0]

snp_id = '11200638'

tag_id = parseSNP(snp_id)
print "Tag is: ", tag_id

o = omim_tag_search(tag_id)

if len(o) == 0:
	outputlist.append("No information found for " + snp_id + "\n")
# nothing more to be done if no records can be found
# otherwise, find the allelic variant data
        outputlist.append(snp_id + " details:" + "\n")
        for i in o:
                v = extract_allelic_variant_data(i.read())
        	if v != None:
                        for a in v:
                                outputlist.append(a.name + "\n")
                                outputlist.append(a.mutation + "\n")
                                outputlist.append(a.description + "\n")
#print '-' * 40
#print "\n"
print "yay! we're done!"
for item in outputlist:
	print item

#print 'second act'
#output_file = file("outfile.txt", "w")
#print o