Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-4-26

From OpenWetWare
Jump to navigation · Jump to search

Comments

  • I think I now have all of the necessary pieces of code to create a program that will parse the XML of any returned OMIM entry to retrieve the PMIDs, look these up in PubMed, and then parse the PubMed entries for MeSH terms whose DescriptorName elements have the MajorTopicYN attribute set to 'Y'. However, there are still a few errors in the program, which I am trying to understand and fix.

Code Thus Far

from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString

# C-style struct to pass parameters
class AllelicVariant:
    """Bare record for one allelic variant.

    The class defines no attributes itself; extract_allelic_variant_data
    attaches ``name``, ``mutation`` and ``description`` to each instance.
    """

class PubmedID:
    """Bare record for one PubMed reference.

    The class defines no attributes itself; extract_allelic_variant_pmid
    attaches a ``pmid`` attribute to each instance.
    """

class MeshTerms:
    """Bare placeholder record for MeSH term data.

    The class defines no attributes itself; callers may attach whatever
    fields they need to an instance.
    """

# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
    """Search the "omim" database for *dnsnp_id* and fetch each hit as XML.

    Returns a list holding one ``efetch(rettype="xml")`` result per search
    hit, in the order the search yielded them.
    """
    omim_hits = DBIdsClient.DBIdsClient().search(dnsnp_id, "omim")
    fetched = []
    for hit in omim_hits:
        fetched.append(hit.efetch(rettype="xml"))
    return fetched

# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    """Concatenate the data of every text node found directly in *node_list*.

    Nodes of any other type are ignored (their descendants are not
    visited). Returns an empty string when there are no text nodes.
    """
    return "".join(
        child.data
        for child in node_list
        if child.nodeType == child.TEXT_NODE
    )

# extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str):
    """Extract allelic variant records from an XML document string.

    Parameters:
        str: XML text to parse (the parameter name shadows the builtin but
            is kept so existing callers continue to work).

    Returns:
        None when the document contains no ``Mim-allelic-variant`` element;
        otherwise a list of AllelicVariant instances, each carrying
        ``name``, ``mutation`` and ``description`` text attributes. A field
        whose element is absent from a variant is set to the empty string
        instead of raising IndexError.
    """
    def _nested_text(node, *tag_path):
        # Walk down tag_path, taking the first matching descendant at each
        # step; give up with "" as soon as one level is missing.
        for tag in tag_path:
            matches = node.getElementsByTagName(tag)
            if not matches:
                return ""
            node = matches[0]
        return get_text(node.childNodes)

    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None
    parsed = []
    for v in variants:
        a = AllelicVariant()  # create empty instance of struct
        # now populate the struct
        a.name = _nested_text(v, "Mim-allelic-variant_name")
        a.mutation = _nested_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text")
        a.description = _nested_text(
            v, "Mim-allelic-variant_description", "Mim-text_text")
        parsed.append(a)
    return parsed

def extract_allelic_variant_pmid(str):
    """Extract the PubMed IDs of the references in an XML document string.

    Parameters:
        str: XML text to parse (the parameter name shadows the builtin but
            is kept so existing callers continue to work).

    Returns:
        None when the document contains no ``Mim-reference`` element;
        otherwise a list of PubmedID instances whose ``pmid`` attribute
        holds the text of the reference's ``Mim-reference_pubmedUID``
        element. References without that element, or with a blank one, are
        skipped rather than raising IndexError or yielding an unusable ID.
    """
    dom = parseString(str)
    references = dom.getElementsByTagName("Mim-reference")
    if not references:
        return None
    ids = []
    for ref in references:
        uid_nodes = ref.getElementsByTagName("Mim-reference_pubmedUID")
        if not uid_nodes:
            # This reference carries no PubMed UID element.
            continue
        pmid_text = get_text(uid_nodes[0].childNodes)
        if not pmid_text.strip():
            continue
        i = PubmedID()
        i.pmid = pmid_text
        ids.append(i)
    return ids
	
def extract_meshterms(str):
    """Extract major-topic MeSH descriptor names from an XML document string.

    A heading counts as a major topic when the first ``DescriptorName``
    element inside a ``MeshHeading`` has its ``MajorTopicYN`` attribute set
    to ``"Y"``.

    Parameters:
        str: XML text to parse (the parameter name shadows the builtin but
            is kept so existing callers continue to work).

    Returns:
        None when the document contains no ``MeshHeading`` element;
        otherwise a list of descriptor-name strings, one per major-topic
        heading (possibly empty when no heading is a major topic).
    """
    dom = parseString(str)
    meshheadings = dom.getElementsByTagName("MeshHeading")
    if not meshheadings:
        return None
    meshterms = []
    for h in meshheadings:
        descriptors = h.getElementsByTagName("DescriptorName")
        if not descriptors:
            continue
        descriptor = descriptors[0]
        # MajorTopicYN is an attribute of DescriptorName, not part of the
        # tag name; getAttribute returns "" when the attribute is absent.
        if descriptor.getAttribute("MajorTopicYN") == "Y":
            meshterms.append(get_text(descriptor.childNodes))
    return meshterms

from Bio import PubMed
from Bio import Medline
import string

# Driver: look up one dbSNP id in OMIM, print its allelic variants, then look
# up every referenced PubMed ID and print the MeSH terms extracted from it.
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser = rec_parser)

for i in omim_snp_search("rs11200638"):
    result = i.read()
    if not result:
        # Nothing was returned for this hit. Skip it so that v and p are
        # never used unbound or left over from a previous iteration.
        continue
    v = extract_allelic_variant_data(result)
    p = extract_allelic_variant_pmid(result)
    if v is not None:
        for a in v:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(a.name)
            print(a.mutation)
            print(a.description)
    if p is not None:
        for s in p:
            # s is a PubmedID struct; index with the PMID text it carries
            # rather than with the struct object itself.
            cur_record = medline_dict[s.pmid]
            # NOTE(review): extract_meshterms parses its argument with
            # parseString, i.e. it expects XML text, while cur_record is
            # produced via Medline.RecordParser. Confirm the record format
            # or fetch the XML form of the entry here instead.
            m = extract_meshterms(cur_record)
            if m is not None:
                for mh in m:
                    print(mh)
    #if p != None:
       # for i in p:
            #print i.pmid


##disease = a.name
##
##search_term = "Review[ptyp] "+disease
###print search_term
##
##review_ids = PubMed.search_for(search_term)
##
##rec_parser = Medline.RecordParser()
##medline_dict = PubMed.Dictionary(parser = rec_parser)
##
##count = 1
##
##for did in review_ids[0:5]:
##    cur_record = medline_dict[did]
##    print '\n', count, ')  ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)
##    count=count+1
##
##for i in omim_snp_search("rs11200638"):
##    result = i.read()
##    if result:
##        p = extract_allelic_variant_pmid(result)
##    if p != None:
##        key_source = PubMed.search_for(p[0])
##        key_rec = medline_dict[0]
##        print key_rec
##
##        keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
##        print keywords