Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-4-26
From OpenWetWare
Jump to navigationJump to search
Comments
- I think I now have all of the pieces of code needed for a program that will parse the XML of any returned OMIM entry to retrieve its PMIDs, look those PMIDs up in PubMed, and then parse the returned PubMed entries for MeSH headings whose DescriptorName element has its MajorTopicYN attribute set to 'Y'. However, there are still a few errors in the program, which I am trying to understand and fix.
Code Thus Far
"""Look up a dbSNP id in OMIM and report its allelic variants and MeSH terms.

The script fetches the OMIM entries matching a dbSNP id as XML, prints the
name, mutation and description of every allelic variant, collects the PubMed
ids cited by the entry, and prints the major-topic MeSH descriptors of each
cited PubMed record.
"""

import string
import xml.dom.minidom
from xml.dom.minidom import parse, parseString

from Bio import Medline
from Bio import PubMed
from Bio.EUtils import DBIdsClient


# C-style structs used to pass parsed values around.
class AllelicVariant:
    pass


class PubmedID:
    pass


class MeshTerms:
    pass


def omim_snp_search(dnsnp_id):
    """Query OMIM for `dnsnp_id` and return one XML fetch handle per hit."""
    client = DBIdsClient.DBIdsClient()
    query = client.search(dnsnp_id, "omim")
    return [hit.efetch(rettype="xml") for hit in query]


def get_text(node_list):
    """Return the concatenated data of the text nodes in `node_list`.

    Only direct TEXT_NODE members are used; element children are ignored.
    Based on http://docs.python.org/lib/dom-example.html
    """
    return "".join(
        node.data for node in node_list if node.nodeType == node.TEXT_NODE
    )


def _first_text(node, *tag_path):
    """Return the text of the first element reached by following `tag_path`.

    Each tag in `tag_path` is resolved to the first matching descendant of
    the previous element. Returns "" as soon as one tag has no match, so a
    missing optional element does not raise IndexError.
    """
    for tag in tag_path:
        matches = node.getElementsByTagName(tag)
        if not matches:
            return ""
        node = matches[0]
    return get_text(node.childNodes)


def extract_allelic_variant_data(str):
    """Parse OMIM XML and return its allelic variants.

    `str` is the XML text of one OMIM entry (the parameter name is kept for
    compatibility with existing callers even though it shadows the builtin).

    Returns a list of AllelicVariant structs with `name`, `mutation` and
    `description` attributes, or None when the entry has no
    Mim-allelic-variant elements. A field whose element is absent is "".
    """
    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None

    parsed = []
    for variant in variants:
        record = AllelicVariant()
        record.name = _first_text(variant, "Mim-allelic-variant_name")
        record.mutation = _first_text(
            variant, "Mim-allelic-variant_mutation", "Mim-text_text"
        )
        record.description = _first_text(
            variant, "Mim-allelic-variant_description", "Mim-text_text"
        )
        parsed.append(record)
    return parsed


def extract_allelic_variant_pmid(str):
    """Parse OMIM XML and return the PubMed ids of its references.

    `str` is the XML text of one OMIM entry (name kept for compatibility).

    Returns a list of PubmedID structs, each with a `pmid` string, or None
    when the entry has no Mim-reference elements. References that carry no
    Mim-reference_pubmedUID element are skipped, since there is nothing to
    look up for them.
    """
    dom = parseString(str)
    references = dom.getElementsByTagName("Mim-reference")
    if not references:
        return None

    ids = []
    for reference in references:
        uid_nodes = reference.getElementsByTagName("Mim-reference_pubmedUID")
        if not uid_nodes:
            continue
        entry = PubmedID()
        entry.pmid = get_text(uid_nodes[0].childNodes)
        ids.append(entry)
    return ids


def extract_meshterms(str):
    """Parse PubMed XML and return its major-topic MeSH descriptor names.

    `str` is the XML text of a PubMed record (name kept for compatibility).

    Returns the text of the first DescriptorName in every MeshHeading whose
    MajorTopicYN attribute is "Y", or None when the record contains no
    MeshHeading elements.
    """
    dom = parseString(str)
    meshheadings = dom.getElementsByTagName("MeshHeading")
    if not meshheadings:
        return None

    meshterms = []
    for heading in meshheadings:
        descriptors = heading.getElementsByTagName("DescriptorName")
        if not descriptors:
            continue
        descriptor = descriptors[0]
        # MajorTopicYN is an attribute of DescriptorName, not a tag name.
        if descriptor.getAttribute("MajorTopicYN") == "Y":
            meshterms.append(get_text(descriptor.childNodes))
    return meshterms


rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser=rec_parser)

for handle in omim_snp_search("rs11200638"):
    result = handle.read()
    if not result:
        continue

    v = extract_allelic_variant_data(result)
    p = extract_allelic_variant_pmid(result)

    if v is not None:
        for a in v:
            print(a.name)
            print(a.mutation)
            print(a.description)

    if p is not None:
        for s in p:
            # The dictionary is keyed by the PMID string, not by the struct.
            cur_record = medline_dict[s.pmid]
            # NOTE(review): extract_meshterms() parses PubMed XML text, but
            # medline_dict is built with Medline.RecordParser, which
            # presumably yields parsed records rather than XML - confirm,
            # and fetch the record as XML instead if so.
            m = extract_meshterms(cur_record)
            if m is not None:
                for mh in m:
                    print(mh)

# Earlier experiments, kept for reference.
#if p != None:
#    for i in p:
#print i.pmid
##disease = a.name
##
##search_term = "Review[ptyp] "+disease
###print search_term
##
##review_ids = PubMed.search_for(search_term)
##
##rec_parser = Medline.RecordParser()
##medline_dict = PubMed.Dictionary(parser = rec_parser)
##
##count = 1
##
##for did in review_ids[0:5]:
##    cur_record = medline_dict[did]
##    print '\n', count, ') ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)
##    count=count+1
##
##for i in omim_snp_search("rs11200638"):
##    result = i.read()
##    if result:
##        p = extract_allelic_variant_pmid(result)
##        if p != None:
##            key_source = PubMed.search_for(p[0])
##            key_rec = medline_dict[0]
##            print key_rec
##
##            keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
##            print keywords