User:Lindenb/Notebook/UMR915/20101012
From OpenWetWare

- Polyphen2 has been updated for all transcripts: http://genetics.bwh.harvard.edu/pph2/bgi.shtml
- Strikes. Working at home.
Belgium
generating input for PPH2
jrunscript -f jeter.js > jeter.polyphen.txt wc jeter.polyphen.txt 62548
submitted both batch for hg18 at 09H45. Waiting... http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi end at ~15H00
diff HumDiv.txt HumVar.txt | wc 0
Results for HumDiv
## Totals: ## lines input 62548 ## lines skipped 0 ## alleles annotated 63107 ## missense 551 ## nonsense 3 ## coding-synon 369 ## intron 60567 ## utr-3 1473 ## utr-5 144
Results for HumVar
## Totals: ## lines input 62548 ## lines skipped 0 ## alleles annotated 63107 ## missense 551 ## nonsense 3 ## coding-synon 369 ## intron 60567 ## utr-3 1473 ## utr-5 144
Huuu??? Using file named "SNPs" : No diff beteen HumDiv and HumVar ???
but XML files say ok with params
<WEKA Class="deleterious"
FDR="0.182"
FPR="0.262"
Modelfile="HumVar.UniRef100.NBd.f11.model"
Prediction="possibly damaging"
Score="0.513"
TPR="0.806" />
and
<WEKA Class="deleterious"
FDR="0.211"
FPR="0.111"
Modelfile="HumDiv.UniRef100.NBd.f11.model"
Prediction="possibly damaging"
Score="0.654"
TPR="0.828" />
But using filesn "Short"
diff HumVarChrXXX.txt HumDivChrXXX.txt | wc 840
ok, everything is OK
digesting data
(updated some minor errors in the following script. see latest version at the bottom )
importPackage(java.io);
importPackage(java.util.zip);
String.prototype.trim = function ()
{
return this.replace(/^\s*/, "").replace(/\s*$/, "");
}
for(var div=0;div<2;++div)
{
var prefix=(div==0?"Div":"Var");
var f=new File("Hum"+prefix+"ChrXXX.txt");
println("create temporary table pph2"+prefix+"("+
"chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
);
var input=new BufferedReader(new FileReader(f));
var line;
var nLine=0;
while((line=input.readLine())!=null)
{
++nLine;
if(nLine==1) continue;
var tokens=line.split("[\t]");
var colon= tokens[0].indexOf(':');
var chrom=tokens[0].substring(0,colon);
var s2=tokens[0].substring(colon+1).split("[\\.]");
print("insert into pph2"+prefix+"(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
print("\""+chrom+"\","+s2[0]+",'"+s2[1].substring(0,1)+"/"+s2[1].substring(1)+"',"+
(tokens[1].trim().startsWith("rs")?"'"+tokens[1].trim()+"'":"NULL")+","+
"'"+tokens[2].trim()+"',"+
""+tokens[3].trim()+","+
"'"+tokens[4].trim()+"',"+
"'"+tokens[5].trim()+"',"+
"'"+tokens[6].trim()+"',"+
(tokens[7].trim()=="?"?"NULL":tokens[7].trim())+","+
(tokens[8].trim()=="?"?"NULL":tokens[8].trim())+","+
(tokens[9].trim()=="?"?"NULL":tokens[9].trim())
);
println(");");
}
input.close();
}
println("create temporary table sift("+
"chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
);
for(var sift=0;sift<4;++sift)
{
var nLine=0;
var f=null;
switch(sift)
{
case 0: f=new File("sift_aa.tsv");break;
case 1: f=new File("sift_ab.tsv");break;
case 2: f=new File("sift_ac.tsv");break;
case 3: f=new File("sift_ad.tsv");break;
}
var input=new BufferedReader(new FileReader(f));
while((line=input.readLine())!=null)
{
++nLine;
if(nLine==1) continue;
var tokens=line.split("[\t]");
var ss=tokens[0].split("[,]");
if(tokens[5]=="NON-GENIC") continue;
print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
print("\"chr"+ss[0]+"\","+ss[1]+",'"+ss[3]+"',"+
(tokens[1]=="-"?"NULL":"'"+tokens[1]+"'")+","+
(tokens[2]==""?"NULL":"'"+tokens[2]+"'")+","+
(tokens[3]==""?"NULL":"'"+tokens[3]+"'")+","+
(tokens[4]=="" || tokens[4]=="NA"?"NULL":"'"+tokens[4]+"'")+","+
"\""+tokens[5]+"\""+","+
(tokens[6]=="NA" || tokens[6]=="N/A"?"NULL":"\""+tokens[6]+"\"")+","+
(tokens[7]=="NA" || tokens[7]=="N/A"?"NULL":"'"+tokens[7]+"'")+","+
(tokens[8]=="NA" || tokens[8]=="N/A" ? "NULL":"'"+tokens[8]+"'")+","+
(tokens[9]=="NA" || tokens[9]=="N/A"? "NULL":tokens[9])+","+
(tokens[10]=="NA"|| tokens[10]=="N/A" ?"NULL":tokens[10])
);
println(");");
}
}
println("create temporary table indi1(chrom varchar(10),start int , end int, index(chrom),index(start));");
{
var f=new File("jeter.indi1.txt");
var input=new BufferedReader(new FileReader(f));
while((line=input.readLine())!=null)
{
var tokens=line.split("[\t]");
println("insert into indi1(chrom,start,end) values("+
"'"+tokens[0].substring(1)+"'"+","+
""+tokens[1]+","+
""+tokens[2]+");");
}
}
println(
"create temporary table T1(chrom varchar(10),pos int,variation varchar(10));"+
"insert into T1(chrom,pos) select distinct chrom,pos from pph2Div;"+
"insert into T1(chrom,pos) select distinct chrom,pos from pph2Var;"+
"insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;"+
"create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));"+
"insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;"+
"select T2.chrom as 'chrom',T2.pos as 'position', "+
"IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1',"+
"S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo',"+
"D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.SNP',D.loc as 'PPH2Div.loc',D.aa1 as 'PPH2Div.aa1',D.aa2 as 'PPH2Div.aa2',D.prediction as 'PPH2Div.prediction',D.prob as 'PPH2Div.prob',D.FPR as 'PPH2Div.fpr',D.TPR as 'PPH2Div.tpr',"+
"V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.SNP',V.loc as 'PPH2Var.loc',V.aa1 as 'PPH2Var.aa1',V.aa2 as 'PPH2Var.aa2',V.prediction as 'PPH2Var.prediction',V.prob as 'PPH2Var.prob',V.FPR as 'PPH2Var.fpr',V.TPR as 'PPH2Var.tpr'"+
" from T2 "+
" left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)"+
" left join pph2Div as D on (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)"+
" left join pph2Var as V on (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)"+
" left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)"
);
retrieve dbSNP130:
egrep -i '(damaging|TOLERATED|benign)' jeter.txt | cut -d ' ' -f 1,2 | sort | uniq |\
awk '{printf("select * from snp130 where chrom=\"%s\" and chromStart=%s-1;\n",$1,$2);}' > jeter2.sql
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 < jeter2.sql > jeter3.tsv
latest script:
importPackage(java.io);
importPackage(java.util.zip);
String.prototype.trim = function ()
{
return this.replace(/^\s*/, "").replace(/\s*$/, "");
}
for(var div=0;div<2;++div)
{
var prefix=(div==0?"Div":"Var");
var f=new File("Hum"+prefix+"ChrXXX.txt");
println("create temporary table pph2"+prefix+"("+
"chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
);
var input=new BufferedReader(new FileReader(f));
var line;
var nLine=0;
while((line=input.readLine())!=null)
{
++nLine;
if(nLine==1) continue;
var tokens=line.split("[\t]");
var colon= tokens[0].indexOf(':');
var chrom=tokens[0].substring(0,colon);
var s2=tokens[0].substring(colon+1).split("[\\.]");
print("insert into pph2"+prefix+"(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
print("\""+chrom+"\","+s2[0]+",'"+s2[1].substring(0,1)+"/"+s2[1].substring(1)+"',"+
(tokens[1].trim().startsWith("rs")?"'"+tokens[1].trim()+"'":"NULL")+","+
"'"+tokens[2].trim()+"',"+
""+tokens[3].trim()+","+
"'"+tokens[4].trim()+"',"+
"'"+tokens[5].trim()+"',"+
"'"+tokens[6].trim()+"',"+
(tokens[7].trim()=="?"?"NULL":tokens[7].trim())+","+
(tokens[8].trim()=="?"?"NULL":tokens[8].trim())+","+
(tokens[9].trim()=="?"?"NULL":tokens[9].trim())
);
println(");");
}
input.close();
}
println("create temporary table sift("+
"chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
);
for(var sift=0;sift<4;++sift)
{
var nLine=0;
var f=null;
switch(sift)
{
case 0: f=new File("sift_aa.tsv");break;
case 1: f=new File("sift_ab.tsv");break;
case 2: f=new File("sift_ac.tsv");break;
case 3: f=new File("sift_ad.tsv");break;
}
var input=new BufferedReader(new FileReader(f));
while((line=input.readLine())!=null)
{
++nLine;
if(nLine==1) continue;
var tokens=line.split("[\t]");
var ss=tokens[0].split("[,]");
if(tokens[5]=="NON-GENIC") continue;
print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
print("\"chr"+ss[0]+"\","+ss[1]+",'"+ss[3]+"',"+
(tokens[1]=="-"?"NULL":"'"+tokens[1]+"'")+","+
(tokens[2]==""?"NULL":"'"+tokens[2]+"'")+","+
(tokens[3]==""?"NULL":"'"+tokens[3]+"'")+","+
(tokens[4]=="" || tokens[4]=="NA"?"NULL":"'"+tokens[4]+"'")+","+
"\""+tokens[5]+"\""+","+
(tokens[6]=="NA" || tokens[6]=="N/A"?"NULL":"\""+tokens[6]+"\"")+","+
(tokens[7]=="NA" || tokens[7]=="N/A"?"NULL":"'"+tokens[7]+"'")+","+
(tokens[8]=="NA" || tokens[8]=="N/A" ? "NULL":"'"+tokens[8]+"'")+","+
(tokens[9]=="NA" || tokens[9]=="N/A"? "NULL":tokens[9])+","+
(tokens[10]=="NA"|| tokens[10]=="N/A" ?"NULL":tokens[10])
);
println(");");
}
}
println("create temporary table indi1(chrom varchar(10),start int , end int, index(chrom),index(start));");
{
var f=new File("jeter.indi1.txt");
var input=new BufferedReader(new FileReader(f));
while((line=input.readLine())!=null)
{
var tokens=line.split("[\t]");
println("insert into indi1(chrom,start,end) values("+
"'"+tokens[0].substring(1)+"'"+","+
""+tokens[1]+","+
""+tokens[2]+");");
}
input.close();
}
println("create temporary table snp130(chrom varchar(10),pos int,name varchar(50),avHet float);");
{
var f=new File("jeter3.tsv");
var input=new BufferedReader(new FileReader(f));
while((line=input.readLine())!=null)
{
var tokens=line.split("[\t]");
println("insert into snp130(chrom,pos,name,avHet) values("+
"'"+tokens[1]+"',"+
""+tokens[2]+","+
"\'"+tokens[4]+"\',"+
tokens[13]+
");");
}
input.close();
println("update snp130 set pos=pos+1;\n");
}
println(
"create temporary table T1(chrom varchar(10),pos int,variation varchar(10));"+
"insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Div;"+
"insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Var;"+
"insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;"+
"create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));"+
"insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;"+
"select T2.chrom as 'chrom',T2.pos as 'position', T2.variation as 'variation',"+
"X.name as 'dbsnp130.name',X.avHet as 'dbsnp130.avHet', "+
"IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1',"+
"S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo',"+
"D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.SNP',D.loc as 'PPH2Div.loc',D.aa1 as 'PPH2Div.aa1',D.aa2 as 'PPH2Div.aa2',D.prediction as 'PPH2Div.prediction',D.prob as 'PPH2Div.prob',D.FPR as 'PPH2Div.fpr',D.TPR as 'PPH2Div.tpr',"+
"V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.SNP',V.loc as 'PPH2Var.loc',V.aa1 as 'PPH2Var.aa1',V.aa2 as 'PPH2Var.aa2',V.prediction as 'PPH2Var.prediction',V.prob as 'PPH2Var.prob',V.FPR as 'PPH2Var.fpr',V.TPR as 'PPH2Var.tpr'"+
" from T2 "+
" left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)"+
" left join snp130 as X on (X.chrom=T2.chrom and X.pos=T2.pos)"+
" left join pph2Div as D on (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)"+
" left join pph2Var as V on (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)"+
" left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)"+
" group by 1,2,3 ;"
);
sent to RR:
mysql -u root -D test < jeter.sql > jeter.txt egrep -i '(damaging|TOLERATED|benign|SIFT)' jeter.txt > result.txt