User:Lindenb/Notebook/UMR915/20100919
From OpenWetWare

Houais, c'est dimanche.
Indexing chrom:position in mongodb, tests
source of data is:
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 -e 'select chrom,chromStart,name from snp130 where chrom in("chr22","chrM")'
Test 1
indexing both fields: db.things.ensureIndex({chrom:1,position:1})
BEGIN { printf("db.snps.drop();\n"); } { printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END { printf("db.snps.ensureIndex({chrom:1,position:1});\n"); printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
result: seconds:0.228
Test 2
indexing each field db.things.ensureIndex({chrom:1}); and db.things.ensureIndex({position:1});
BEGIN { printf("db.snps.drop();\n"); } { printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END { printf("db.snps.ensureIndex({chrom:1});\n"); printf("db.snps.ensureIndex({position:1});\n"); printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
Result:0.25
Test 3
No index
BEGIN { printf("db.snps.drop();\n"); } { printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END { printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
Result:seconds:277.751
TEST4
string padding:
BEGIN { printf("db.snps.drop();\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); } { gsub(/chr/,"",$1); printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END { printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
result: seconds:169.028
TEST 5
string padding + index
BEGIN { printf("db.snps.drop();\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); } { gsub(/chr/,"",$1); printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END { printf("db.snps.ensureIndex({position:1});\n"); printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
Result: seconds:0.292
Test 6
_id is a padded string
BEGIN { printf("db.snps.drop();\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); } { gsub(/chr/,"",$1); printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END { printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
result : seconds:1.252
Test 7
using padded string for _id and min:max for searching
BEGIN { printf("db.snps.drop();\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); } { gsub(/chr/,"",$1); printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END { printf("var tStart=new Date().getTime();\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random()*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n"); printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n"); printf("while(c.hasNext()) c.next();\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"); }
result 3.189
Test 8
I used a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"});
but i was not able to query this table with $gt/$lt:
> db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count() 0 > db.snps.find({_id:{chrom:"chr22",position:14430966}}).count() 1