User:Lindenb/Notebook/UMR915/20101012


 * Polyphen2 has been updated for all transcripts: http://genetics.bwh.harvard.edu/pph2/bgi.shtml
 * Strikes. Working at home.

Belgium
generating input for PPH2

# Generate the PolyPhen-2 batch input, then count what was produced.
jrunscript -f jeter.js > jeter.polyphen.txt
wc jeter.polyphen.txt
# 62548

Submitted both batches for hg18 at 09H45 (http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi). Waiting... Ended at ~15H00. Result of "diff HumDiv.txt HumVar.txt | wc": 0

Results for HumDiv
## Totals: ##  lines input           62548 ##  lines skipped             0 ##  alleles annotated     63107 ##    missense              551 ##    nonsense                3 ##    coding-synon          369 ##    intron              60567 ##    utr-3                1473 ##    utr-5                 144

Results for HumVar
## Totals: ##  lines input           62548 ##  lines skipped             0 ##  alleles annotated     63107 ##    missense              551 ##    nonsense                3 ##    coding-synon          369 ##    intron              60567 ##    utr-3                1473 ##    utr-5                 144

Huuu??? Using the files named "SNPs": no diff between HumDiv and HumVar??? But the XML files say OK with params



and



But using the files named "Short", "diff HumVarChrXXX.txt HumDivChrXXX.txt | wc" gives 840

ok, everything is OK

digesting data
(updated some minor errors in the following script. see latest version at the bottom )

/*
 * First version of the digest script (run with jrunscript).
 *
 * Reads the PolyPhen-2 outputs (HumDivChrXXX.txt, HumVarChrXXX.txt), the four
 * SIFT outputs (sift_aa.tsv .. sift_ad.tsv) and jeter.indi1.txt, and prints on
 * stdout a MySQL script that loads them into temporary tables and joins them
 * per (chrom,pos,variation).
 *
 * Fixes applied to the text originally pasted here:
 *  - restored the empty call parentheses that were lost (function(),
 *    readLine(), trim(), close()); without them the script does not parse;
 *  - every reader is now closed;
 *  - T1 is filled with the 'variation' of the PolyPhen-2 rows, otherwise those
 *    rows have a NULL variation and never satisfy the joins on variation;
 *  - the 'acc' columns are no longer labelled '...SNP' (duplicate alias).
 */
importPackage(java.io);
importPackage(java.util.zip);

String.prototype.trim = function() {
    return this.replace(/^\s*/, "").replace(/\s*$/, "");
};

var line;

/* ---- PolyPhen-2: one temporary table per classifier model (HumDiv / HumVar) ---- */
for (var div = 0; div < 2; ++div) {
    var prefix = (div == 0 ? "Div" : "Var");
    var f = new File("Hum" + prefix + "ChrXXX.txt");
    println("create temporary table pph2" + prefix + "(" +
        "chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
    );
    var input = new BufferedReader(new FileReader(f));
    var nLine = 0;
    while ((line = input.readLine()) != null) {
        ++nLine;
        if (nLine == 1) continue; /* first line is skipped (presumably the column header) */
        var tokens = line.split("[\t]");
        /* first column is parsed as <chrom>:<pos>.<two alleles>, e.g. chr1:12345.AG -- TODO confirm format */
        var colon = tokens[0].indexOf(':');
        var chrom = tokens[0].substring(0, colon);
        var s2 = tokens[0].substring(colon + 1).split("[\\.]");
        print("insert into pph2" + prefix + "(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
        print("\"" + chrom + "\"," + s2[0] + ",'" + s2[1].substring(0, 1) + "/" + s2[1].substring(1) + "'," +
            (tokens[1].trim().startsWith("rs") ? "'" + tokens[1].trim() + "'" : "NULL") + "," +
            "'" + tokens[2].trim() + "'," +
            "" + tokens[3].trim() + "," +
            "'" + tokens[4].trim() + "'," +
            "'" + tokens[5].trim() + "'," +
            "'" + tokens[6].trim() + "'," +
            /* "?" marks a missing numeric value and becomes SQL NULL */
            (tokens[7].trim() == "?" ? "NULL" : tokens[7].trim()) + "," +
            (tokens[8].trim() == "?" ? "NULL" : tokens[8].trim()) + "," +
            (tokens[9].trim() == "?" ? "NULL" : tokens[9].trim())
        );
        println(");");
    }
    input.close();
}

/* ---- SIFT: the four split output files go into one table ---- */
println("create temporary table sift(" +
    "chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
);
var siftFiles = ["sift_aa.tsv", "sift_ab.tsv", "sift_ac.tsv", "sift_ad.tsv"];
for (var sift = 0; sift < siftFiles.length; ++sift) {
    var nLine = 0;
    var f = new File(siftFiles[sift]);
    var input = new BufferedReader(new FileReader(f));
    while ((line = input.readLine()) != null) {
        ++nLine;
        if (nLine == 1) continue; /* first line of each file is skipped */
        var tokens = line.split("[\t]");
        var ss = tokens[0].split("[,]"); /* comma-separated: chrom at [0], pos at [1], variation at [3] */
        if (tokens[5] == "NON-GENIC") continue;
        print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
        print("\"chr" + ss[0] + "\"," + ss[1] + ",'" + ss[3] + "'," +
            (tokens[1] == "-" ? "NULL" : "'" + tokens[1] + "'") + "," +
            (tokens[2] == "" ? "NULL" : "'" + tokens[2] + "'") + "," +
            (tokens[3] == "" ? "NULL" : "'" + tokens[3] + "'") + "," +
            (tokens[4] == "" || tokens[4] == "NA" ? "NULL" : "'" + tokens[4] + "'") + "," +
            "\"" + tokens[5] + "\"" + "," +
            (tokens[6] == "NA" || tokens[6] == "N/A" ? "NULL" : "\"" + tokens[6] + "\"") + "," +
            (tokens[7] == "NA" || tokens[7] == "N/A" ? "NULL" : "'" + tokens[7] + "'") + "," +
            (tokens[8] == "NA" || tokens[8] == "N/A" ? "NULL" : "'" + tokens[8] + "'") + "," +
            (tokens[9] == "NA" || tokens[9] == "N/A" ? "NULL" : tokens[9]) + "," +
            (tokens[10] == "NA" || tokens[10] == "N/A" ? "NULL" : tokens[10])
        );
        println(");");
    }
    input.close();
}

/* ---- intervals of jeter.indi1.txt ---- */
println("create temporary table indi1(chrom varchar(10),start int, end int, index(chrom),index(start));");
{
    var f = new File("jeter.indi1.txt");
    var input = new BufferedReader(new FileReader(f));
    while ((line = input.readLine()) != null) {
        var tokens = line.split("[\t]");
        println("insert into indi1(chrom,start,end) values(" +
            "'" + tokens[0].substring(1) + "'" + "," + /* first character of the chrom column is dropped */
            "" + tokens[1] + "," +
            "" + tokens[2] + ");");
    }
    input.close();
}

/* ---- union of all (chrom,pos,variation), then the final report ---- */
println(
    "create temporary table T1(chrom varchar(10),pos int,variation varchar(10));" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Div;" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Var;" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;" +
    "create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));" +
    "insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;" +
    "select T2.chrom as 'chrom',T2.pos as 'position', " +
    "IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1'," +
    "S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo'," +
    "D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.acc',D.loc as 'PPH2Div.loc',D.aa1  as 'PPH2Div.aa1',D.aa2  as 'PPH2Div.aa2',D.prediction  as 'PPH2Div.prediction',D.prob  as 'PPH2Div.prob',D.FPR  as 'PPH2Div.fpr',D.TPR  as 'PPH2Div.tpr'," +
    "V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.acc',V.loc as 'PPH2Var.loc',V.aa1  as 'PPH2Var.aa1',V.aa2  as 'PPH2Var.aa2',V.prediction  as 'PPH2Var.prediction',V.prob  as 'PPH2Var.prob',V.FPR  as 'PPH2Var.fpr',V.TPR  as 'PPH2Var.tpr'" +
    " from T2 " +
    " left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)" +
    " left join pph2Div as D on (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)" +
    " left join pph2Var as V on (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)" +
    " left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)"
);

retrieve dbSNP130:

# Build one UCSC query per distinct (chrom,pos) that has a prediction,
# then fetch the matching snp130 rows from the public UCSC MySQL server.
# cut splits on TAB by default, so no -d option is needed.
# The query uses pos-1 for chromStart; the digest script adds 1 back when loading.
egrep -i '(damaging|TOLERATED|benign)' jeter.txt |
  cut -f 1,2 |
  sort | uniq |
  awk '{printf("select * from snp130 where chrom=\"%s\" and chromStart=%s-1;\n",$1,$2);}' > jeter2.sql
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 < jeter2.sql > jeter3.tsv

latest script:

/*
 * Latest version of the digest script (run with jrunscript).
 *
 * Reads the PolyPhen-2 outputs (HumDivChrXXX.txt, HumVarChrXXX.txt), the four
 * SIFT outputs (sift_aa.tsv .. sift_ad.tsv), jeter.indi1.txt and the dbSNP130
 * rows saved in jeter3.tsv, and prints on stdout a MySQL script that loads
 * them into temporary tables and reports one row per (chrom,pos,variation).
 *
 * Fixes applied to the text originally pasted here:
 *  - restored the empty call parentheses that were lost (function(),
 *    readLine(), trim(), close()); without them the script does not parse;
 *  - the SIFT readers are now closed too;
 *  - the 'acc' columns are no longer labelled '...SNP' (duplicate alias).
 */
importPackage(java.io);
importPackage(java.util.zip);

String.prototype.trim = function() {
    return this.replace(/^\s*/, "").replace(/\s*$/, "");
};

var line;

/* ---- PolyPhen-2: one temporary table per classifier model (HumDiv / HumVar) ---- */
for (var div = 0; div < 2; ++div) {
    var prefix = (div == 0 ? "Div" : "Var");
    var f = new File("Hum" + prefix + "ChrXXX.txt");
    println("create temporary table pph2" + prefix + "(" +
        "chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
    );
    var input = new BufferedReader(new FileReader(f));
    var nLine = 0;
    while ((line = input.readLine()) != null) {
        ++nLine;
        if (nLine == 1) continue; /* first line is skipped (presumably the column header) */
        var tokens = line.split("[\t]");
        /* first column is parsed as <chrom>:<pos>.<two alleles>, e.g. chr1:12345.AG -- TODO confirm format */
        var colon = tokens[0].indexOf(':');
        var chrom = tokens[0].substring(0, colon);
        var s2 = tokens[0].substring(colon + 1).split("[\\.]");
        print("insert into pph2" + prefix + "(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
        print("\"" + chrom + "\"," + s2[0] + ",'" + s2[1].substring(0, 1) + "/" + s2[1].substring(1) + "'," +
            (tokens[1].trim().startsWith("rs") ? "'" + tokens[1].trim() + "'" : "NULL") + "," +
            "'" + tokens[2].trim() + "'," +
            "" + tokens[3].trim() + "," +
            "'" + tokens[4].trim() + "'," +
            "'" + tokens[5].trim() + "'," +
            "'" + tokens[6].trim() + "'," +
            /* "?" marks a missing numeric value and becomes SQL NULL */
            (tokens[7].trim() == "?" ? "NULL" : tokens[7].trim()) + "," +
            (tokens[8].trim() == "?" ? "NULL" : tokens[8].trim()) + "," +
            (tokens[9].trim() == "?" ? "NULL" : tokens[9].trim())
        );
        println(");");
    }
    input.close();
}

/* ---- SIFT: the four split output files go into one table ---- */
println("create temporary table sift(" +
    "chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
);
var siftFiles = ["sift_aa.tsv", "sift_ab.tsv", "sift_ac.tsv", "sift_ad.tsv"];
for (var sift = 0; sift < siftFiles.length; ++sift) {
    var nLine = 0;
    var f = new File(siftFiles[sift]);
    var input = new BufferedReader(new FileReader(f));
    while ((line = input.readLine()) != null) {
        ++nLine;
        if (nLine == 1) continue; /* first line of each file is skipped */
        var tokens = line.split("[\t]");
        var ss = tokens[0].split("[,]"); /* comma-separated: chrom at [0], pos at [1], variation at [3] */
        if (tokens[5] == "NON-GENIC") continue;
        print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
        print("\"chr" + ss[0] + "\"," + ss[1] + ",'" + ss[3] + "'," +
            (tokens[1] == "-" ? "NULL" : "'" + tokens[1] + "'") + "," +
            (tokens[2] == "" ? "NULL" : "'" + tokens[2] + "'") + "," +
            (tokens[3] == "" ? "NULL" : "'" + tokens[3] + "'") + "," +
            (tokens[4] == "" || tokens[4] == "NA" ? "NULL" : "'" + tokens[4] + "'") + "," +
            "\"" + tokens[5] + "\"" + "," +
            (tokens[6] == "NA" || tokens[6] == "N/A" ? "NULL" : "\"" + tokens[6] + "\"") + "," +
            (tokens[7] == "NA" || tokens[7] == "N/A" ? "NULL" : "'" + tokens[7] + "'") + "," +
            (tokens[8] == "NA" || tokens[8] == "N/A" ? "NULL" : "'" + tokens[8] + "'") + "," +
            (tokens[9] == "NA" || tokens[9] == "N/A" ? "NULL" : tokens[9]) + "," +
            (tokens[10] == "NA" || tokens[10] == "N/A" ? "NULL" : tokens[10])
        );
        println(");");
    }
    input.close();
}

/* ---- intervals of jeter.indi1.txt ---- */
println("create temporary table indi1(chrom varchar(10),start int, end int, index(chrom),index(start));");
{
    var f = new File("jeter.indi1.txt");
    var input = new BufferedReader(new FileReader(f));
    while ((line = input.readLine()) != null) {
        var tokens = line.split("[\t]");
        println("insert into indi1(chrom,start,end) values(" +
            "'" + tokens[0].substring(1) + "'" + "," + /* first character of the chrom column is dropped */
            "" + tokens[1] + "," +
            "" + tokens[2] + ");");
    }
    input.close();
}

/* ---- dbSNP130 rows saved in jeter3.tsv ---- */
println("create temporary table snp130(chrom varchar(10),pos int,name varchar(50),avHet float);");
{
    var f = new File("jeter3.tsv");
    var input = new BufferedReader(new FileReader(f));
    while ((line = input.readLine()) != null) {
        var tokens = line.split("[\t]");
        /* columns used: [1] -> chrom, [2] -> pos, [4] -> name, [13] -> avHet */
        println("insert into snp130(chrom,pos,name,avHet) values(" +
            "'" + tokens[1] + "'," +
            "" + tokens[2] + "," +
            "\'" + tokens[4] + "\'," +
            tokens[13] +
            ");");
    }
    input.close();
    /* shift every loaded position by one so it lines up with T2.pos
       (presumably because the loaded start coordinate is 0-based -- confirm) */
    println("update snp130 set pos=pos+1;\n");
}

/* ---- union of all (chrom,pos,variation), then the final report ---- */
println(
    "create temporary table T1(chrom varchar(10),pos int,variation varchar(10));" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Div;" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Var;" +
    "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;" +
    "create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));" +
    "insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;" +
    "select T2.chrom as 'chrom',T2.pos as 'position', T2.variation as 'variation'," +
    "X.name as 'dbsnp130.name',X.avHet as 'dbsnp130.avHet', " +
    "IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1'," +
    "S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo'," +
    "D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.acc',D.loc as 'PPH2Div.loc',D.aa1  as 'PPH2Div.aa1',D.aa2  as 'PPH2Div.aa2',D.prediction  as 'PPH2Div.prediction',D.prob  as 'PPH2Div.prob',D.FPR  as 'PPH2Div.fpr',D.TPR  as 'PPH2Div.tpr'," +
    "V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.acc',V.loc as 'PPH2Var.loc',V.aa1  as 'PPH2Var.aa1',V.aa2  as 'PPH2Var.aa2',V.prediction  as 'PPH2Var.prediction',V.prob  as 'PPH2Var.prob',V.FPR  as 'PPH2Var.fpr',V.TPR  as 'PPH2Var.tpr'" +
    " from T2 " +
    " left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)" +
    " left join snp130 as X on (X.chrom=T2.chrom and X.pos=T2.pos)" +
    " left join pph2Div as D on (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)" +
    " left join pph2Var as V on (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)" +
    " left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)" +
    " group by 1,2,3 ;"
);

sent to RR:

# Run the generated SQL, then keep the header line (it contains "SIFT")
# and every row carrying a prediction.
mysql -u root -D test < jeter.sql > jeter.txt
egrep -i '(damaging|TOLERATED|benign|SIFT)' jeter.txt > result.txt