Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-4-24

Tasks Accomplished

 * Gained a much better understanding of XML parsing.
 * Finished code to parse OMIM records and return all Pubmed ID numbers (without errors).
 * Wrote code to search Pubmed ID numbers in Pubmed and then parse through the result to look for keywords (or MeSH terms).
 * Came up with some ideas about how to deal with non-OMIM records (e.g. dbSNP, HapMap, or making an entry in one of these databases?).

Tasks to Complete for Thursday

 * Ask the class how they would like keywords to be determined (e.g. search the first Pubmed ID number and return its MeSH terms; repeat for all Pubmed IDs?).
 * Correct how the code to parse through Pubmed is implemented, since it currently stalls.
 * Think more about how to handle genotypes not found/annotated in OMIM and then implement this in code.

Code So Far
import xml.dom.minidom
from xml.dom.minidom import parse, parseString

from Bio.EUtils import DBIdsClient

class AllelicVariant:
    """Empty record type used as a plain attribute container.

    The parsing code in this file attaches ``name``, ``mutation`` and
    ``description`` attributes to it after creation.
    """
# C-style struct to pass parameters

class PubmedID:
    """Empty record type used as a plain attribute container.

    The parsing code in this file attaches a ``pmid`` attribute to it after
    creation.
    """

def omim_snp_search(dnsnp_id):
    """Search the "omim" database for an identifier and fetch each hit as XML.

    Args:
        dnsnp_id: Search term passed to the client, e.g. ``"rs11200638"``.

    Returns:
        A list with one ``efetch(rettype="xml")`` result per search hit.
        Callers in this file treat each item as a handle and read it.
    """
    # The client class has to be instantiated; calling search() on the bare
    # class object does not give it an instance to work with.
    client = DBIdsClient.DBIdsClient()
    query = client.search(dnsnp_id, "omim")
    return [hit.efetch(rettype="xml") for hit in query]
# queries the database and returns all info in an XML format

def get_text(node_list):
    """Concatenate the character data of the text nodes in *node_list*.

    Only nodes whose ``nodeType`` equals ``TEXT_NODE`` contribute; other
    nodes are skipped and their children are not visited.

    Args:
        node_list: Iterable of DOM nodes, e.g. an element's ``childNodes``.

    Returns:
        The ``data`` of every text node joined in order, or ``""`` when there
        are none.
    """
    # join() assembles the result in one pass instead of re-concatenating a
    # growing string for every node.
    return "".join(
        [node.data for node in node_list if node.nodeType == node.TEXT_NODE]
    )
# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def extract_allelic_variant_data(str):
    """Parse an XML record into a list of ``AllelicVariant`` objects.

    Args:
        str: XML text of one record.  (The name shadows the builtin; it is
            kept so existing callers are unaffected.)

    Returns:
        A list with one ``AllelicVariant`` per ``Mim-allelic-variant``
        element, each carrying ``name``, ``mutation`` and ``description``
        strings, or ``None`` when the record contains no such element.
    """

    def first_text(node, *tag_path):
        # Descend through the first element matching each tag in turn and
        # return its text; a missing element yields "" rather than raising
        # IndexError on the [0] lookup.
        for tag in tag_path:
            matches = node.getElementsByTagName(tag)
            if not matches:
                return ""
            node = matches[0]
        return get_text(node.childNodes)

    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None

    parsed = []
    for v in variants:
        # Create a fresh instance per variant.  Binding the class itself
        # (no parentheses) would make every list entry the same object, so
        # each variant would overwrite the fields of the previous one.
        a = AllelicVariant()
        a.name = first_text(v, "Mim-allelic-variant_name")
        a.mutation = first_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text"
        )
        a.description = first_text(
            v, "Mim-allelic-variant_description", "Mim-text_text"
        )
        parsed.append(a)
    return parsed
# extracts allelic variant data, as the name implies, using the struct above
#    print "variant:", variants

def extract_allelic_variant_pmid(str):
    """Collect the PubMed IDs referenced by an XML record.

    Args:
        str: XML text of one record.  (The name shadows the builtin; it is
            kept so existing callers are unaffected.)

    Returns:
        A list of ``PubmedID`` objects, one per ``Mim-reference`` element
        that has a ``Mim-reference_pubmedUID`` child, each with its ``pmid``
        attribute set to that child's text.  ``None`` when no PubMed ID is
        found, so callers can keep testing the result against ``None``
        before indexing it.
    """
    dom = parseString(str)
    references = dom.getElementsByTagName("Mim-reference")
    if not references:
        return None

    ids = []
    for ref in references:
        uid_nodes = ref.getElementsByTagName("Mim-reference_pubmedUID")
        if not uid_nodes:
            # Reference without a PubMed UID element: skip it instead of
            # failing on the [0] lookup.
            continue
        # Create a fresh instance per reference.  Binding the class itself
        # (no parentheses) would make every list entry the same object
        # holding only the last ID.
        entry = PubmedID()
        entry.pmid = get_text(uid_nodes[0].childNodes)
        ids.append(entry)
    return ids or None


# Driver: fetch the records for one SNP and print every allelic variant.
for record in omim_snp_search("rs11200638"):
    # read must be *called*; the bare attribute is a method object, which is
    # always truthy and cannot be parsed as XML.
    result = record.read()
    if result:
        v = extract_allelic_variant_data(result)
        p = extract_allelic_variant_pmid(result)
        if v is not None:
            # The loop variable stays named `a`: code further down the page
            # reads `a.name` after this loop finishes.
            for a in v:
                # Single-argument print(...) behaves the same as the print
                # statement on Python 2 and is also valid on Python 3.
                print(a.name)
                print(a.mutation)
                print(a.description)
        # if p is not None:
        #     for i in p:
        #         print(i.pmid)

from Bio import PubMed
from Bio import Medline
import string

# Look up review articles for the last allelic variant printed by the driver
# loop above (`a` is that loop's variable).
disease = a.name

search_term = "Review[ptyp] " + disease
# print(search_term)

review_ids = PubMed.search_for(search_term)

# The parser has to be an instance, not the RecordParser class itself.
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser=rec_parser)

count = 1

# Print a numbered listing of the first five review hits.
for did in review_ids[0:5]:
    cur_record = medline_dict[did]
    # One formatted string reproduces the spacing of the original
    # comma-separated print statement and works with print as a function.
    print("\n%s )  %s %s %s" % (
        count,
        cur_record.title.rstrip(),
        cur_record.authors,
        cur_record.source.strip(),
    ))
    count = count + 1

# Fetch the Medline record for the first PubMed ID cited by each OMIM record.
key_rec = None
for record in omim_snp_search("rs11200638"):
    # read must be *called*; the bare attribute is a method object.
    result = record.read()
    if result:
        p = extract_allelic_variant_pmid(result)
        if p:
            # Index the dictionary with the PubMed ID string itself.  The
            # earlier code searched PubMed with the PubmedID object, ignored
            # the result, and then looked up key 0, which is not a PubMed ID.
            # NOTE(review): possibly the cause of the stalling noted in the
            # task list above - confirm.
            key_rec = medline_dict[p[0].pmid]
            print(key_rec)

if key_rec is not None:
    # NOTE(review): key_rec comes from Medline.RecordParser, so it is a
    # Medline record rather than an XML DOM node and has no
    # getElementsByTagName.  Its MeSH terms are expected in the
    # `mesh_headings` list - confirm against the installed Biopython version.
    keywords = key_rec.mesh_headings
    print(keywords)