Harvard:Biophysics 101/Notebook:ZS/2007-5-3


 * 1) ICD9_prevalance.py
 * 2) Zachary Sun
 * 3) 5.1.07, Biophysics 101


 * 1) Ultimately: This code is able to output a formal disease name from informal info.
 * 2) INPUT: RS NUMBER
 * 3) Output1 : Mesh terms relevant to RS NUMBER
 * 4) INPUT2 : Mesh terms relevant to RS NUMBER
 * 5) Output2 : The single major mesh term (eg. disease name)
 * 6) INPUT3: The single major mesh term (eg. disease name)
 * 7) Output3: CDC# to standardize mesh term, formal disease name, and incidence


 * 1) This code does a couple of things:
 * 2) 1) **Cynthia**: Takes in a rs# and outputs a list of mesh terms and important mesh terms
 * 3) 2) **Zach**: Takes Cynthia's list of important mesh terms,
 * 4)    *removes short words from mesh terms, and hits against paper titles from which it was taken
 * 5)    *for the mesh terms with the most paper title hits, it says that mesh term is the MAIN TERM
 * 6)    *feeds the main term to http://icd9cm.chrisendres.com
 * 7) 3) **Zach**: Enables lookup of ICD9 ID numbers and formal name when given a MAIN TERM
 * 8)    *This is particularly useful as all data (WHO, CDC) is based
 * 9)    *on the ICD9 ID, which is a method of classifying all known
 * 10)    *diseases. This code is able to queue a website, http://icd9cm.chrisendres.com
 * 11)    *which has the database of diseases - it returns the best hits
 * 12)    *in dirty HTML, which can then be parsed to obtain ICD9 #'s and description.
 * 13) 4) **Zach** Enables lookup of prevalance data in database
 * 14)    *Currently, I only have it hooked up to the State of CA prevalance data from
 * 15)    *http://www.oshpd.ca.gov/hqad/PatientLevel/ICD9_Codes/index.htm, it does a
 * 16)    *lookup based on #1 and returns prevalance data. Havent looked up but can theoretically
 * 17)    *link to main CDC db at http://wonder.cdc.gov/.

import os import string import urllib import sys from Bio import PubMed from Bio import Medline

rsNumber = "rs11200638" # MODIFY THIS! print "INPUT: ", rsNumber


 * 1) cynthias section###

def parse_term(str, bool): parsed_term = str if(bool): parsed_term = parsed_term.replace('*', '') if str.find('/') != -1: parsed_term = parsed_term.replace('/', ' ') return parsed_term
 * 1) parses a mesh term to remove * and /

def parse_mesh(list): all_mesh_terms = [] major_mesh_terms = [] mesh_term = '' for i in range(len(list)): major = False if list[i].find('*') == -1: mesh_term = parse_term(list[i], major) all_mesh_terms.append(mesh_term) else: major = True mesh_term = parse_term(list[i], major) major_mesh_terms.append(mesh_term) all_mesh_terms.append(mesh_term) all_mesh = [all_mesh_terms, major_mesh_terms] return all_mesh article_ids = PubMed.search_for("rs11200638") rec_parser = Medline.RecordParser medline_dict = PubMed.Dictionary(parser = rec_parser)
 * 1) parses list of mesh terms
 * 2) returns embedded list, one with all terms and one major  terms

all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1])


 * 1) print '\n', all_mesh_terms, '\n', major_mesh_terms
 * 2) cynthia's section end#####


 * 1) Problem to solve: Although Cynthia's output parses out the
 * 2) major mesh terms from the paper, each mesh term can give a hit
 * ie: it does not figure out which is really the "major" major mesh term.
 * 1) I intend to solve it by hitting against the title to see if there are matches
 * 2) and by removing connecting words from the mesh terms.
 * 1) and by removing connecting words from the mesh terms.


 * 1) modifying cynthia's output

new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term #print temp new_term.append(temp)

titles = [] for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title)
 * 1) create list of article id's

count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term
 * 1) titles = a list of all titles returned by PubMed


 * 1) print bestterm[1] #bestterm[1] is the main disease name

if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0)

fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0

queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name code_lookup = urllib.urlopen(queue_name).read #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close
 * 1) queueing http://icd9cm.chrisendres.com for code lookup
 * 1) queueing http://icd9cm.chrisendres.com for code lookup

tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline print "***ICD9 and hits, arranged by importance***\n" while lookup_line: w= lookup_line.find(" ") #the unique marker before the disease if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number #print tempCode ICD9code.append(tempCode[0]) w = lookup_line.find(" ") if w != -1: #if there is a marker (junk html) tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline

print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n"
 * 1) searching incidence data in CA data file
 * 2) note: returns class of hits
 * 1) note: returns class of hits

fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file

ICD9_prev_output= ],[],[
 * 1) new data structure: [0] = code, [1] = name, [2] = incidence

for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline fh.close


 * 1) returns the most relevant ICD9 information

mainCode = 0 mainName = 0 mainIncidence = 0
 * 1) THIS IS THE DATA YOU PROBABLY WANT

counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c       mainName = n        mainIncidence = incid

print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: ", mainIncidence

OUTPUT:

INPUT: rs11200638
 * ICD9 and hits, arranged by importance***

Note: returns incidence data/yr, including subclasses
 * Prevalance data per IDC9 above***

ID: 362.52 name:  Exudative senile macular degeneration incidence:  23 ID: 362.51 name:  Nonexudative senile macular degeneration incidence:  22 ID: 362.53 name:  Cystoid macular degeneration incidence:  6 ID: 362.50 name:  Macular degeneration (senile), unspecified incidence:  11338 ID: 362.5 name:  Degeneration of macula and posterior pole incidence:  11560 ID: 362.63 name:  Lattice degeneration incidence:  21 ID: 429.1 name:  Myocardial degeneration incidence:  49 ID: 363.32 name:  Other macular scars incidence:  6 ID: 743.55 name:  Congenital macular changes incidence:  2 ID: 371.55 name:  Macular corneal dystrophy incidence:  10 THE MOST RELEVANT HIT:

CDC CODE: 362.5 Disease Name: Degeneration of macula and posterior pole Incidence/yr in CA: 11560