Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-4-26
From OpenWetWare
Jump to navigationJump to search
Comments
- I think I now have all of the pieces of code needed for a program that parses the XML of any returned OMIM entry to retrieve its PMIDs, looks each PMID up in PubMed, and then parses the PubMed entries for MeSH headings whose DescriptorName has its MajorTopicYN attribute set to 'Y'. However, there are still a few errors in the program, which I am trying to understand and fix.
Code Thus Far
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
# C-style struct to pass parameters
class AllelicVariant:
    """Bare record for one OMIM allelic variant.

    The class defines no fields itself; extract_allelic_variant_data attaches
    ``name``, ``mutation`` and ``description`` to each instance it creates.
    """
class PubmedID:
    """Bare record for one PubMed reference.

    The class defines no fields itself; extract_allelic_variant_pmid attaches
    a ``pmid`` text attribute to each instance it creates.
    """
class MeshTerms:
    """Bare record type for MeSH term data.

    The class defines no fields of its own; any attributes are attached by
    the code that instantiates it.
    """
# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
    """Search the "omim" database for *dnsnp_id* and fetch every hit as XML.

    Returns a list holding one ``efetch(rettype="xml")`` result per search
    hit, in the order the search yields them. The caller in this file calls
    ``.read()`` on each result to obtain the XML text.
    """
    hits = DBIdsClient.DBIdsClient().search(dnsnp_id, "omim")
    records = []
    for hit in hits:
        records.append(hit.efetch(rettype="xml"))
    return records
# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    """Return the concatenated data of the text nodes in *node_list*.

    Only nodes whose type is TEXT_NODE contribute; other nodes are skipped
    and their children are not visited. Yields "" when nothing matches.
    """
    pieces = [node.data for node in node_list
              if node.nodeType == node.TEXT_NODE]
    return "".join(pieces)
# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
    """Parse OMIM XML text and return its allelic variants.

    Args:
        str: XML text of one OMIM record. (The name shadows the builtin; it
            is kept so the signature stays unchanged for callers.)

    Returns:
        A list of AllelicVariant objects, each carrying ``name``,
        ``mutation`` and ``description`` text attributes, or None when the
        document has no "Mim-allelic-variant" elements. A field whose
        element is missing from a variant is set to "" rather than raising
        IndexError.
    """

    def first_text(node, *tag_path):
        # Descend through the first element matching each tag in turn and
        # return its direct text; bail out with "" as soon as a tag is absent.
        for tag in tag_path:
            found = node.getElementsByTagName(tag)
            if not found:
                return ""
            node = found[0]
        return get_text(node.childNodes)

    variants = parseString(str).getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None

    parsed = []
    for v in variants:
        a = AllelicVariant()  # empty struct, populated below
        a.name = first_text(v, "Mim-allelic-variant_name")
        # mutation and description keep their text one level down, inside a
        # "Mim-text_text" element.
        a.mutation = first_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text")
        a.description = first_text(
            v, "Mim-allelic-variant_description", "Mim-text_text")
        parsed.append(a)
    return parsed
def extract_allelic_variant_pmid(str):
    """Parse OMIM XML text and return the PubMed IDs of its references.

    Args:
        str: XML text of one OMIM record. (The name shadows the builtin; it
            is kept so the signature stays unchanged for callers.)

    Returns:
        A list of PubmedID objects whose ``pmid`` attribute holds the text
        of the reference's "Mim-reference_pubmedUID" element, or None when
        the document has no "Mim-reference" elements. References that have
        no such element, or whose ID text is blank, are skipped instead of
        raising IndexError.
    """
    references = parseString(str).getElementsByTagName("Mim-reference")
    if not references:
        return None

    ids = []
    for ref in references:
        uid_nodes = ref.getElementsByTagName("Mim-reference_pubmedUID")
        if not uid_nodes:
            # This reference carries no PubMed ID; nothing to look up.
            continue
        pmid = get_text(uid_nodes[0].childNodes)
        if not pmid.strip():
            continue
        i = PubmedID()
        i.pmid = pmid
        ids.append(i)
    return ids
def extract_meshterms(str):
    """Parse PubMed XML text and return its major-topic MeSH descriptors.

    Args:
        str: XML text containing "MeshHeading" elements. (The name shadows
            the builtin; it is kept so the signature stays unchanged.)

    Returns:
        A list with the text of every heading's first "DescriptorName"
        element whose ``MajorTopicYN`` attribute equals "Y", or None when
        the document has no "MeshHeading" elements at all.
    """
    meshheadings = parseString(str).getElementsByTagName("MeshHeading")
    # The emptiness check must look at the headings just fetched; the result
    # list does not exist yet at this point.
    if not meshheadings:
        return None

    meshterms = []
    for h in meshheadings:
        descriptors = h.getElementsByTagName("DescriptorName")
        if not descriptors:
            continue
        descriptor = descriptors[0]
        # MajorTopicYN is an attribute of DescriptorName, not part of the tag
        # name, so it is read with getAttribute ("" when the attribute is
        # absent).
        if descriptor.getAttribute("MajorTopicYN") == "Y":
            meshterms.append(get_text(descriptor.childNodes))
    return meshterms
from Bio import PubMed
from Bio import Medline
import string
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser=rec_parser)

# Driver: look the SNP up in OMIM, print every allelic variant of each hit,
# then print the major MeSH terms of each PubMed article the hit references.
for i in omim_snp_search("rs11200638"):
    result = i.read()
    if not result:
        continue

    v = extract_allelic_variant_data(result)
    p = extract_allelic_variant_pmid(result)

    # Both extractors return None (not an empty list) when nothing is found.
    if v is not None:
        for a in v:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(a.name)
            print(a.mutation)
            print(a.description)

    if p is not None:
        for s in p:
            # The dictionary is keyed by the PubMed ID text, not by the
            # PubmedID struct that wraps it.
            cur_record = medline_dict[s.pmid]
            # NOTE(review): medline_dict is built with Medline.RecordParser,
            # so cur_record is presumably a parsed record object rather than
            # XML text, while extract_meshterms feeds its argument to
            # parseString -- confirm and fetch XML here if so.
            m = extract_meshterms(cur_record)
            if m is not None:
                for mh in m:
                    print(mh)
#if p != None:
# for i in p:
#print i.pmid
##disease = a.name
##
##search_term = "Review[ptyp] "+disease
###print search_term
##
##review_ids = PubMed.search_for(search_term)
##
##rec_parser = Medline.RecordParser()
##medline_dict = PubMed.Dictionary(parser = rec_parser)
##
##count = 1
##
##for did in review_ids[0:5]:
## cur_record = medline_dict[did]
## print '\n', count, ') ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)
## count=count+1
##
##for i in omim_snp_search("rs11200638"):
## result = i.read()
## if result:
## p = extract_allelic_variant_pmid(result)
## if p != None:
## key_source = PubMed.search_for(p[0])
## key_rec = medline_dict[0]
## print key_rec
##
## keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
## print keywords