User:Lindenb/Notebook/UMR915/20101012

From OpenWetWare
Jump to: navigation, search
Owwnotebook icon.png

20101011        Top        20101013       


Belgium

generating input for PPH2

  jrunscript -f jeter.js > jeter.polyphen.txt
  
  wc jeter.polyphen.txt
  62548

submitted both batch for hg18 at 09H45. Waiting... http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi end at ~15H00

 diff HumDiv.txt HumVar.txt | wc
 0


Results for HumDiv

 ## Totals:
 ##   lines input           62548
 ##   lines skipped             0
 ##   alleles annotated     63107
 ##     missense              551
 ##     nonsense                3
 ##     coding-synon          369
 ##     intron              60567
 ##     utr-3                1473
 ##     utr-5                 144

Results for HumVar

 ## Totals:
 ##   lines input           62548
 ##   lines skipped             0
 ##   alleles annotated     63107
 ##     missense              551
 ##     nonsense                3
 ##     coding-synon          369
 ##     intron              60567
 ##     utr-3                1473
 ##     utr-5                 144


Huuu??? Using file named "SNPs" : No diff beteen HumDiv and HumVar ??? but XML files say ok with params

 <WEKA Class="deleterious"
       FDR="0.182"
       FPR="0.262"
       Modelfile="HumVar.UniRef100.NBd.f11.model"
       Prediction="possibly damaging"
       Score="0.513"
       TPR="0.806" />

and

 <WEKA Class="deleterious"
       FDR="0.211"
       FPR="0.111"
       Modelfile="HumDiv.UniRef100.NBd.f11.model"
       Prediction="possibly damaging"
       Score="0.654"
       TPR="0.828" />

But using filesn "Short"

diff HumVarChrXXX.txt HumDivChrXXX.txt | wc
840

ok, everything is OK

digesting data

(updated some minor errors in the following script. see latest version at the bottom )

 importPackage(java.io);
 importPackage(java.util.zip);
 String.prototype.trim = function ()
       {
       return this.replace(/^\s*/, "").replace(/\s*$/, "");
       }
  
 for(var div=0;div<2;++div)
       {
       var prefix=(div==0?"Div":"Var");
       var f=new File("Hum"+prefix+"ChrXXX.txt");
       
       println("create temporary table pph2"+prefix+"("+
               "chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
               );
      
       var input=new BufferedReader(new FileReader(f));
       var line;
       var nLine=0;
       while((line=input.readLine())!=null)
               {
               ++nLine;
               if(nLine==1) continue;
               var tokens=line.split("[\t]");
               var colon= tokens[0].indexOf(':');
               var chrom=tokens[0].substring(0,colon);
               var s2=tokens[0].substring(colon+1).split("[\\.]");
               print("insert into pph2"+prefix+"(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
               print("\""+chrom+"\","+s2[0]+",'"+s2[1].substring(0,1)+"/"+s2[1].substring(1)+"',"+
                       (tokens[1].trim().startsWith("rs")?"'"+tokens[1].trim()+"'":"NULL")+","+
                       "'"+tokens[2].trim()+"',"+
                       ""+tokens[3].trim()+","+
                       "'"+tokens[4].trim()+"',"+
                       "'"+tokens[5].trim()+"',"+
                       "'"+tokens[6].trim()+"',"+
                       (tokens[7].trim()=="?"?"NULL":tokens[7].trim())+","+
                       (tokens[8].trim()=="?"?"NULL":tokens[8].trim())+","+
                       (tokens[9].trim()=="?"?"NULL":tokens[9].trim())
                       );
               println(");");
               }
       input.close();
       }
 
 println("create temporary table sift("+
               "chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
               );
 for(var sift=0;sift<4;++sift)
       {
       var nLine=0;
       var f=null;
       switch(sift)
               {
               case 0: f=new File("sift_aa.tsv");break;
               case 1: f=new File("sift_ab.tsv");break;
               case 2: f=new File("sift_ac.tsv");break;
               case 3: f=new File("sift_ad.tsv");break;
               }
       var input=new BufferedReader(new FileReader(f));
       while((line=input.readLine())!=null)
               {
               ++nLine;
               if(nLine==1) continue;
               var tokens=line.split("[\t]");
               var ss=tokens[0].split("[,]");
               if(tokens[5]=="NON-GENIC") continue;
               print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
               print("\"chr"+ss[0]+"\","+ss[1]+",'"+ss[3]+"',"+
                       (tokens[1]=="-"?"NULL":"'"+tokens[1]+"'")+","+
                       (tokens[2]==""?"NULL":"'"+tokens[2]+"'")+","+
                       (tokens[3]==""?"NULL":"'"+tokens[3]+"'")+","+
                       (tokens[4]=="" || tokens[4]=="NA"?"NULL":"'"+tokens[4]+"'")+","+
                       "\""+tokens[5]+"\""+","+
                       (tokens[6]=="NA" || tokens[6]=="N/A"?"NULL":"\""+tokens[6]+"\"")+","+
                       (tokens[7]=="NA" || tokens[7]=="N/A"?"NULL":"'"+tokens[7]+"'")+","+
                       (tokens[8]=="NA" || tokens[8]=="N/A" ? "NULL":"'"+tokens[8]+"'")+","+
                       (tokens[9]=="NA" || tokens[9]=="N/A"? "NULL":tokens[9])+","+
                       (tokens[10]=="NA"|| tokens[10]=="N/A" ?"NULL":tokens[10])
                       );
               println(");");
               }
       }
 
 println("create temporary table indi1(chrom varchar(10),start int , end int, index(chrom),index(start));");
       {
       var f=new File("jeter.indi1.txt");
       var input=new BufferedReader(new FileReader(f));
       while((line=input.readLine())!=null)
               {
               var tokens=line.split("[\t]");
               println("insert into indi1(chrom,start,end) values("+
                       "'"+tokens[0].substring(1)+"'"+","+
                       ""+tokens[1]+","+
                       ""+tokens[2]+");");
               }
       }
 println(
 "create temporary table T1(chrom varchar(10),pos int,variation varchar(10));"+
 "insert into T1(chrom,pos) select distinct chrom,pos from pph2Div;"+
 "insert into T1(chrom,pos) select distinct chrom,pos from pph2Var;"+
 "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;"+
 "create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));"+
 "insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;"+
 "select T2.chrom as 'chrom',T2.pos as 'position', "+
 "IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1',"+
 "S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo',"+
 "D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.SNP',D.loc  as 'PPH2Div.loc',D.aa1  as 'PPH2Div.aa1',D.aa2  as 'PPH2Div.aa2',D.prediction  as 'PPH2Div.prediction',D.prob  as 'PPH2Div.prob',D.FPR  as 'PPH2Div.fpr',D.TPR  as 'PPH2Div.tpr',"+
 "V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.SNP',V.loc  as 'PPH2Var.loc',V.aa1  as 'PPH2Var.aa1',V.aa2  as 'PPH2Var.aa2',V.prediction  as 'PPH2Var.prediction',V.prob  as 'PPH2Var.prob',V.FPR  as 'PPH2Var.fpr',V.TPR  as 'PPH2Var.tpr'"+
 " from T2 "+
 " left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)"+
 " left join pph2Div as D on  (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)"+
 " left join pph2Var as V on  (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)"+
 " left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)"
 );


retrieve dbSNP130:

  egrep -i '(damaging|TOLERATED|benign)' jeter.txt | cut -d '        ' -f 1,2 | sort | uniq |\
  awk '{printf("select * from snp130 where chrom=\"%s\" and chromStart=%s-1;\n",$1,$2);}' > jeter2.sql
  mysql -N -h  genome-mysql.cse.ucsc.edu -A -u genome -D hg18 < jeter2.sql  > jeter3.tsv

latest script:


  importPackage(java.io);
  importPackage(java.util.zip);
  String.prototype.trim = function ()
        {
        return this.replace(/^\s*/, "").replace(/\s*$/, "");
        }
  
  
  
  for(var div=0;div<2;++div)
        {
        var prefix=(div==0?"Div":"Var");
        var f=new File("Hum"+prefix+"ChrXXX.txt");
        
        println("create temporary table pph2"+prefix+"("+
                 "chrom varchar(10),pos int not null,variation varchar(50),snp varchar(50) null,acc varchar(50),loc int,aa1 varchar(5),aa2 varchar(2),prediction varchar(50),prob float null,FPR float null,TPR float null,index(chrom),index(pos));"
                 );
        
        var input=new BufferedReader(new FileReader(f));
        var line;
        var nLine=0;
        while((line=input.readLine())!=null)
                 {
                 ++nLine;
                 if(nLine==1) continue;
                 var tokens=line.split("[\t]");
                 var colon= tokens[0].indexOf(':');
                 var chrom=tokens[0].substring(0,colon);
                 var s2=tokens[0].substring(colon+1).split("[\\.]");
                 print("insert into pph2"+prefix+"(chrom,pos,variation,snp,acc,loc,aa1,aa2,prediction,prob,FPR,TPR) values(");
                 print("\""+chrom+"\","+s2[0]+",'"+s2[1].substring(0,1)+"/"+s2[1].substring(1)+"',"+
                          (tokens[1].trim().startsWith("rs")?"'"+tokens[1].trim()+"'":"NULL")+","+
                          "'"+tokens[2].trim()+"',"+
                          ""+tokens[3].trim()+","+
                          "'"+tokens[4].trim()+"',"+
                          "'"+tokens[5].trim()+"',"+
                          "'"+tokens[6].trim()+"',"+
                          (tokens[7].trim()=="?"?"NULL":tokens[7].trim())+","+
                          (tokens[8].trim()=="?"?"NULL":tokens[8].trim())+","+
                          (tokens[9].trim()=="?"?"NULL":tokens[9].trim())
                          );
                 println(");");
                 }
        input.close();
        }
  
  println("create temporary table sift("+
                 "chrom varchar(10),pos int ,variation varchar(10),codons varchar(50) null,transcript varchar(50) null,protein varchar(50) null,substitution varchar(50) null,region varchar(50) null,rsId varchar(50) null,snpType varchar(50) null,prediction varchar(50) null,score float null,medianInfo float null,index(chrom),index(pos));"
                 );
  for(var sift=0;sift<4;++sift)
        {
        var nLine=0;
        var f=null;
        switch(sift)
                 {
                 case 0: f=new File("sift_aa.tsv");break;
                 case 1: f=new File("sift_ab.tsv");break;
                 case 2: f=new File("sift_ac.tsv");break;
                 case 3: f=new File("sift_ad.tsv");break;
                 }
        var input=new BufferedReader(new FileReader(f));
        while((line=input.readLine())!=null)
                 {
                 ++nLine;
                 if(nLine==1) continue;
                 var tokens=line.split("[\t]");
                 var ss=tokens[0].split("[,]");
                 if(tokens[5]=="NON-GENIC") continue;
                 print("insert into sift(chrom,pos,variation,codons,transcript,protein,substitution,region,rsId,snpType,prediction,score,medianInfo) values(");
                 print("\"chr"+ss[0]+"\","+ss[1]+",'"+ss[3]+"',"+
                          (tokens[1]=="-"?"NULL":"'"+tokens[1]+"'")+","+
                          (tokens[2]==""?"NULL":"'"+tokens[2]+"'")+","+
                          (tokens[3]==""?"NULL":"'"+tokens[3]+"'")+","+
                          (tokens[4]=="" || tokens[4]=="NA"?"NULL":"'"+tokens[4]+"'")+","+
                          "\""+tokens[5]+"\""+","+
                          (tokens[6]=="NA" || tokens[6]=="N/A"?"NULL":"\""+tokens[6]+"\"")+","+
                          (tokens[7]=="NA" || tokens[7]=="N/A"?"NULL":"'"+tokens[7]+"'")+","+
                          (tokens[8]=="NA" || tokens[8]=="N/A" ? "NULL":"'"+tokens[8]+"'")+","+
                          (tokens[9]=="NA" || tokens[9]=="N/A"? "NULL":tokens[9])+","+
                          (tokens[10]=="NA"|| tokens[10]=="N/A" ?"NULL":tokens[10])
                          );
                 
                 println(");");
                 }
        }
  
  println("create temporary table indi1(chrom varchar(10),start int , end int, index(chrom),index(start));");
        {
        var f=new File("jeter.indi1.txt");
        var input=new BufferedReader(new FileReader(f));
        while((line=input.readLine())!=null)
                 {
                 var tokens=line.split("[\t]");
                 println("insert into indi1(chrom,start,end) values("+
                          "'"+tokens[0].substring(1)+"'"+","+
                          ""+tokens[1]+","+
                          ""+tokens[2]+");");
                          
                 }
        input.close();
        }
  println("create temporary table snp130(chrom varchar(10),pos int,name varchar(50),avHet float);");
        {
        var f=new File("jeter3.tsv");
        var input=new BufferedReader(new FileReader(f));
        while((line=input.readLine())!=null)
                 {
                 var tokens=line.split("[\t]");
                 println("insert into snp130(chrom,pos,name,avHet) values("+
                          "'"+tokens[1]+"',"+
                          ""+tokens[2]+","+
                          "\'"+tokens[4]+"\',"+
                          tokens[13]+
                          ");");
                          
                 }
        input.close();
        println("update snp130 set pos=pos+1;\n");
        }
  
  println(
  "create temporary table T1(chrom varchar(10),pos int,variation varchar(10));"+
  "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Div;"+
  "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from pph2Var;"+
  "insert into T1(chrom,pos,variation) select distinct chrom,pos,variation from sift;"+
  "create temporary table T2(chrom varchar(10),pos int,variation varchar(10),index(chrom),index(pos),index(chrom,pos));"+
  "insert into T2(chrom,pos,variation) select distinct chrom,pos,variation from T1 order by 1,2;"+
  "select T2.chrom as 'chrom',T2.pos as 'position', T2.variation as 'variation',"+
  "X.name as 'dbsnp130.name',X.avHet as 'dbsnp130.avHet', "+
  "IF(I1.chrom is NULL,'NO','IN_INDI1') as 'in_indi1',"+
  "S.variation as 'SIFT.variation',S.codons as 'SIFT.codons',S.transcript as 'SIFT.transcript',S.protein as 'S.protein',S.substitution as 'SIFT.substitution',S.region as 'SIFT.region',S.rsId as 'S.rsId',S.snpType as 'SIFT.snpType',S.prediction as 'SIFT.prediction',S.score as 'SIFT.score',S.medianInfo as 'SIFT.medianInfo',"+
  "D.snp as 'PPH2Div.SNP',D.acc as 'PPH2Div.SNP',D.loc  as 'PPH2Div.loc',D.aa1  as 'PPH2Div.aa1',D.aa2  as 'PPH2Div.aa2',D.prediction  as 'PPH2Div.prediction',D.prob  as 'PPH2Div.prob',D.FPR  as 'PPH2Div.fpr',D.TPR  as 'PPH2Div.tpr',"+
  "V.snp as 'PPH2Var.SNP',V.acc as 'PPH2Var.SNP',V.loc  as 'PPH2Var.loc',V.aa1  as 'PPH2Var.aa1',V.aa2  as 'PPH2Var.aa2',V.prediction  as 'PPH2Var.prediction',V.prob  as 'PPH2Var.prob',V.FPR  as 'PPH2Var.fpr',V.TPR  as 'PPH2Var.tpr'"+
  " from T2 "+
  " left join sift as S on (S.chrom=T2.chrom and S.pos=T2.pos and S.variation=T2.variation)"+
  " left join snp130 as X on (X.chrom=T2.chrom and X.pos=T2.pos)"+
  " left join pph2Div as D on  (D.chrom=T2.chrom and D.pos=T2.pos and D.variation=T2.variation)"+
  " left join pph2Var as V on  (V.chrom=T2.chrom and V.pos=T2.pos and V.variation=T2.variation)"+
  " left join indi1 as I1 on (I1.chrom=T2.chrom and I1.start<=T2.pos and I1.end>=T2.pos)"+
  " group by 1,2,3 ;"
  );


sent to RR:

  mysql -u root -D test < jeter.sql > jeter.txt
  egrep -i '(damaging|TOLERATED|benign|SIFT)' jeter.txt   > result.txt