User:Lindenb/Notebook/UMR915/20100919

From OpenWetWare
Jump to navigationJump to search

20100916        Top        20100920       


Houais, c'est dimanche.

Indexing chrom:position in mongodb, tests

source of data is:

mysql -N -h  genome-mysql.cse.ucsc.edu -A -u genome -D hg18 -e 'select chrom,chromStart,name from snp130  where chrom in("chr22","chrM")'


Test 1

indexing both fields: db.things.ensureIndex({chrom:1,position:1})

 BEGIN {
       printf("db.snps.drop();\n");
       }
  
       {
       printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
       }
 
 END   {
       printf("db.snps.ensureIndex({chrom:1,position:1});\n");
       printf("var tStart=new Date().getTime();\n");
       printf("for(i=0;i< 1000;++i)\n{\n");
       printf("var pos1 =Math.floor(Math.random()*50000000);\n");
       printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
       printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
       printf("while(c.hasNext()) c.next();\n");
       printf("}\n");
       printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
       }

result: seconds:0.228

Test 2

indexing each field db.things.ensureIndex({chrom:1}); and db.things.ensureIndex({position:1});

 BEGIN {
       printf("db.snps.drop();\n");
       }
 
       {
       printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
       }
 
 END   {
       printf("db.snps.ensureIndex({chrom:1});\n");
       printf("db.snps.ensureIndex({position:1});\n");
       printf("var tStart=new Date().getTime();\n");
       printf("for(i=0;i< 1000;++i)\n{\n");
       printf("var pos1 =Math.floor(Math.random()*50000000);\n");
       printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
       printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
       printf("while(c.hasNext()) c.next();\n");
       printf("}\n");
       printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
       }

Result:0.25

Test 3

No index

 BEGIN {
       printf("db.snps.drop();\n");
       }
 
       {
       printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
       }
 
 END   {
       printf("var tStart=new Date().getTime();\n");
       printf("for(i=0;i< 1000;++i)\n{\n");
       printf("var pos1 =Math.floor(Math.random()*50000000);\n");
       printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
       printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
       printf("while(c.hasNext()) c.next();\n");
       printf("}\n");
       printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
       }

Result:seconds:277.751

TEST4

string padding:


 BEGIN {
       printf("db.snps.drop();\n");
       printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
       printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
       }
 
       {
       gsub(/chr/,"",$1);
       printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
       }
 
 END   {
       printf("var tStart=new Date().getTime();\n");
       printf("for(i=0;i< 1000;++i)\n{\n");
       printf("var pos1 =Math.floor(Math.random()*50000000);\n");
       printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
       printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
       printf("while(c.hasNext()) c.next();\n");
       printf("}\n");
       printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
       }

result: seconds:169.028

TEST 5

string padding + index

 BEGIN {
       printf("db.snps.drop();\n");
       printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
       printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
       }
 
       {
       gsub(/chr/,"",$1);
       printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
       }
 
 END   {
       printf("db.snps.ensureIndex({position:1});\n");
       printf("var tStart=new Date().getTime();\n");
       printf("for(i=0;i< 1000;++i)\n{\n");
       printf("var pos1 =Math.floor(Math.random()*50000000);\n");
       printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
       printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
       printf("while(c.hasNext()) c.next();\n");
       printf("}\n");
       printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
       }


Result: seconds:0.292

Test 6

_id is a padded string

  BEGIN {
        printf("db.snps.drop();\n");
        printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
        printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
        }
 
        {
        gsub(/chr/,"",$1);
        printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
        }
 
  END   {
        printf("var tStart=new Date().getTime();\n");
        printf("for(i=0;i< 1000;++i)\n{\n");
        printf("var pos1 =Math.floor(Math.random()*50000000);\n");
        printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
        printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
        printf("while(c.hasNext()) c.next();\n");
        printf("}\n");
        printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
        }

result : seconds:1.252

Test 7

using padded string for _id and min:max for searching

  BEGIN {
        printf("db.snps.drop();\n");
        printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
        printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
        }
 
        {
        gsub(/chr/,"",$1);
        printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
        }
  
  END   {
        printf("var tStart=new Date().getTime();\n");
        printf("for(i=0;i< 1000;++i)\n{\n");
        printf("var pos1 =Math.floor(Math.random()*50000000);\n");
        printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
        printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n");
        printf("while(c.hasNext()) c.next();\n");
        printf("}\n");
        printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
        }

result 3.189

Test 8

I used a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"});

but i was not able to query this table with $gt/$lt:

 > db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count()
 0
 > db.snps.find({_id:{chrom:"chr22",position:14430966}}).count()
 1