Harvard:Biophysics 101/2007/Notebook:Kaull/2007-5-3

OMIM workaround - in its final, nicely documented form.

Previous version has been redirected to previous day's page, in case it's still needed.

Specs are the same:
 * Accepts rs ID#
 * Returns AllelicVariant struct with name, mutation, and description parameters

from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient

from xml.dom import minidom from xml.dom.minidom import parse, parseString from threading import Thread

import pickle, sys, time, urllib


 * 1) Code taken from elsewhere, unedited
 * 1) Code taken from elsewhere, unedited

outputlist = []

class AllelicVariant: pass
 * 1) C-style struct to pass parameters

def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc
 * 1) basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def get_any_data(node): return node.toxml.split('>')[1].split('<')[0]

def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records
 * 1) queries the database and returns all info in an XML format


 * 1) Code taken from elsewhere, edited
 * 1) Code taken from elsewhere, edited

def extract_allelic_variant_data(records, snp_id, record_type): if record_type == 'old': return extract_omim_data_old(records) if record_type == 'new': return extract_omim_data_new(records, snp_id)
 * 1) Note: This is different from earlier versions!
 * 2)  shell / helper function, replaces earlier function with one parameter
 * 3)  uses XML record from OMIM, and corresponding SNP, to create AllelicVariant struct

return "Incorrect type in function extract_allelic_variant_data"


 * 1) New code - from Kay
 * 1) New code - from Kay


 * 1) Note: This code is a temporary solution to a dbSNP formatting issue.
 * 2) Older entries are best searched directly by ID #.
 * 3)  - > this case is the first covered
 * 4) Newer entries are not indexed in this fashion, although SNP ID data is
 * 5)  available on the individual entry.  These contain Allelic Variant data
 * 6)  which is located and extracted by the script
 * 7)  - > this case is the second covered


 * 1) **** TEST DATA **** (enable for testing! then delete me!)
 * 2) snp_id = 'rs11200638'
 * 3) snp_id = 'rs28937310'


 * 1) **** SUPPORT FOR NEW-STYLE PARSER ****

def parse_geneID_tag(snp_id): SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp' snp_id_number = snp_id.strip('rs') url = SNP_URL + '&id=' + snp_id_number + '&mode=xml' dom = minidom.parse(urllib.urlopen(url)) symbol_data = dom.getElementsByTagName('FxnSet_symbol') symbol = get_any_data(symbol_data[0]) return symbol
 * 1) queries the SNP database and returns the annotation tag for that region
 * 2) of the genome - a short, unique alphanumeric string
 * 3) Currently, DBIdsClient does not support snp parsing.  So it's not used.
 * 4) A future update should correct this when possible, for ease of reading.

def omim_tag_search(tag_id): client = DBIdsClient.DBIdsClient query = client.search(tag_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records
 * 1) queries the database and returns all info in an XML format


 * 1) **** OMIM DATA EXTRACTION ****
 * 2) Records are first sent to extract_allelic_variant_data, which takes
 * 3) a list of XML entries and the parse type ('old' or 'new').  This is a
 * 4) shell function, designed for later ease of replacement - its sole function
 * 5) is to funnel the records to the appropriate extract_omim_data function.


 * 1) The extract_omim_data functions return an AllelicVariant struct.
 * 2) Currently, this struct has name, mutation, and description entries.

def extract_omim_data_old(records): dom = parseString(records) variants = dom.getElementsByTagName("Mim-allelic-variant")
 * 1) sends data in OMIM entry list to AllelicVariant struct
 * 2) configured for XML entries, in old format

parsed = [] for v in variants: a = AllelicVariant # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a)

return parsed def extract_omim_data_new(records, snp_id): dom = parseString(records) variants = dom.getElementsByTagName("Mim-allelic-variant")
 * 1) sends data in OMIM entry list to AllelicVariant struct
 * 2) configured for XML entries, in new format

# convert SNP ID into number format snp_id = snp_id.strip('rs') parsed = [] for v in variants: a = AllelicVariant # create empty instance of struct # now populate the struct

variant_snp_tag = v.getElementsByTagName("Mim-allelic-variant_snpLinks")

# additional code checks for SNPs linked to allelic variant # if SNPs are present, it searches for the correct SNP - if found, # the variant is added to results list if variant_snp_tag != list: isLinked = False for i in variant_snp_tag: variant_snp = get_text(i.getElementsByTagName("Mim-link_uids")[0].childNodes) variant_snp = str(variant_snp) if variant_snp == snp_id: isLinked = True

if isLinked: a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed


 * 1) **** MAIN FUNCTIONS ****

def snp_to_omim(snp_id): records = omim_snp_search(snp_id)
 * 1) takes SNP ID and gets search results from OMIM in XML format

# if records are found, then search is complete # use old parser - SNP was directly in search text if records != list: return records, 'old'

# otherwise, find geneID tag for the region, and search that tag_id = parse_geneID_tag(snp_id) records = omim_tag_search(tag_id)

# use new parser - SNP was hidden in XML markup return records, 'new'

def omim_to_output(records, snp_id, record_type): output = list if records == list: output.append("No information found for " + snp_id + "\n") return output
 * 1) takes list of XML records from OMIM, and the corresponding SNP ID
 * 2) generates desired output as a list of text entries

for i in records: variants = extract_allelic_variant_data(i.read, snp_id, record_type) if variants != None: for a in variants: output.append(a.name + "\n") output.append(a.mutation + "\n") output.append(a.description + "\n")

return output

omim_records, record_type = snp_to_omim(snp_id) output_list = omim_to_output(omim_records, snp_id, record_type)

for item in output_list: print item