Harvard:Biophysics 101/Notebook:ZS/2007-5-3
From OpenWetWare
Jump to navigationJump to search
#ICD9_prevalance.py #Zachary Sun #5.1.07, Biophysics 101 #Ultimately: This code is able to output a formal disease name from informal info. #INPUT: RS NUMBER #Output1 : Mesh terms relevant to RS NUMBER #INPUT2 : Mesh terms relevant to RS NUMBER #Output2 : The single major mesh term (eg. disease name) #INPUT3: The single major mesh term (eg. disease name) #Output3: CDC# to standardize mesh term, formal disease name, and incidence #This code does a couple of things: #1) **Cynthia**: Takes in a rs# and outputs a list of mesh terms and important mesh terms #2) **Zach**: Takes Cynthia's list of important mesh terms, # *removes short words from mesh terms, and hits against paper titles from which it was taken # *for the mesh terms with the most paper title hits, it says that mesh term is the MAIN TERM # *feeds the main term to http://icd9cm.chrisendres.com #3) **Zach**: Enables lookup of ICD9 ID numbers and formal name when given a MAIN TERM # *This is particularly useful as all data (WHO, CDC) is based # *on the ICD9 ID, which is a method of classifying all known # *diseases. This code is able to queue a website, http://icd9cm.chrisendres.com # *which has the database of diseases - it returns the best hits # *in dirty HTML, which can then be parsed to obtain ICD9 #'s and description. #4) **Zach** Enables lookup of prevalance data in database # *Currently, I only have it hooked up to the State of CA prevalance data from # *http://www.oshpd.ca.gov/hqad/PatientLevel/ICD9_Codes/index.htm, it does a # *lookup based on #1 and returns prevalance data. Havent looked up but can theoretically # *link to main CDC db at http://wonder.cdc.gov/. import os import string import urllib import sys from Bio import PubMed from Bio import Medline rsNumber = "rs11200638" # MODIFY THIS! print "INPUT: ", rsNumber ###cynthias section### # parses a mesh term to remove * and / def parse_term(str, bool): parsed_term = str if(bool): parsed_term = parsed_term.replace('*', '') if str.find('/') != -1: parsed_term = parsed_term.replace('/', ' ') return parsed_term # parses list of mesh terms # returns embedded list, one with all terms and one major terms def parse_mesh(list): all_mesh_terms = [] major_mesh_terms = [] mesh_term = '' for i in range(len(list)): major = False if list[i].find('*') == -1: mesh_term = parse_term(list[i], major) all_mesh_terms.append(mesh_term) else: major = True mesh_term = parse_term(list[i], major) major_mesh_terms.append(mesh_term) all_mesh_terms.append(mesh_term) all_mesh = [all_mesh_terms, major_mesh_terms] return all_mesh article_ids = PubMed.search_for("rs11200638") rec_parser = Medline.RecordParser() medline_dict = PubMed.Dictionary(parser = rec_parser) all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1]) #print '\n', all_mesh_terms, '\n', major_mesh_terms ######cynthia's section end##### ##### #Problem to solve: Although Cynthia's output parses out the #major mesh terms from the paper, each mesh term can give a hit #ie: it does not figure out which is really the "major" major mesh term. #I intend to solve it by hitting against the title to see if there are matches #and by removing connecting words from the mesh terms. ##### ##modifying cynthia's output new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term #print temp new_term.append(temp) titles = [] #create list of article id's for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title) #titles = a list of all titles returned by PubMed count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term #print bestterm[1] #bestterm[1] is the main disease name if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0) fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline() disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0 #### #queueing http://icd9cm.chrisendres.com for code lookup #### queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name code_lookup = urllib.urlopen(queue_name).read() #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close() tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline() print "***ICD9 and hits, arranged by importance***\n" while lookup_line: w= lookup_line.find("<div class=dlvl>") #the unique marker before the disease if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number #print tempCode ICD9code.append(tempCode[0]) w = lookup_line.find("</div>") if w != -1: #if there is a </div> marker (junk html) tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline() #### #searching incidence data in CA data file #note: returns class of hits #### print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n" fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file #new data structure: [0] = code, [1] = name, [2] = incidence ICD9_prev_output= [[],[],[]] for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline() totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline() fh.close() #returns the most relevant ICD9 information #THIS IS THE DATA YOU PROBABLY WANT mainCode = 0 mainName = 0 mainIncidence = 0 counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c mainName = n mainIncidence = incid print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: " , mainIncidence
OUTPUT:
INPUT: rs11200638 ***ICD9 and hits, arranged by importance*** ***Prevalance data per IDC9 above*** Note: returns incidence data/yr, including subclasses ID: 362.52 name: Exudative senile macular degeneration incidence: 23 ID: 362.51 name: Nonexudative senile macular degeneration incidence: 22 ID: 362.53 name: Cystoid macular degeneration incidence: 6 ID: 362.50 name: Macular degeneration (senile), unspecified incidence: 11338 ID: 362.5 name: Degeneration of macula and posterior pole incidence: 11560 ID: 362.63 name: Lattice degeneration incidence: 21 ID: 429.1 name: Myocardial degeneration incidence: 49 ID: 363.32 name: Other macular scars incidence: 6 ID: 743.55 name: Congenital macular changes incidence: 2 ID: 371.55 name: Macular corneal dystrophy incidence: 10 THE MOST RELEVANT HIT: CDC CODE: 362.5 Disease Name: Degeneration of macula and posterior pole Incidence/yr in CA: 11560