User:Lindenb/Notebook/UMR915/20100919

Houais, c'est dimanche.

=Indexing chrom:position in mongodb, tests=

source of data is:

mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 -e 'select chrom,chromStart,name from snp130  where chrom in("chr22","chrM")'

Test 1
indexing both fields: db.things.ensureIndex({chrom:1,position:1})

BEGIN { printf("db.snps.drop;\n"); }       {        printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END   { printf("db.snps.ensureIndex({chrom:1,position:1});\n"); printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); }

result: seconds:0.228

Test 2
indexing each field db.things.ensureIndex({chrom:1}); and db.things.ensureIndex({position:1});

BEGIN { printf("db.snps.drop;\n"); }       {        printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END   { printf("db.snps.ensureIndex({chrom:1});\n"); printf("db.snps.ensureIndex({position:1});\n"); printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); } Result:0.25

Test 3
No index

BEGIN { printf("db.snps.drop;\n"); }       {        printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3); } END   { printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); } Result:seconds:277.751

TEST4
string padding:

BEGIN { printf("db.snps.drop;\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); }       {        gsub(/chr/,"",$1); printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END   { printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); }

result: seconds:169.028

TEST 5
string padding + index BEGIN { printf("db.snps.drop;\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); }       {        gsub(/chr/,"",$1); printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); } END   { printf("db.snps.ensureIndex({position:1});\n"); printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); }

Result: seconds:0.292

Test 6
_id is a padded string

BEGIN { printf("db.snps.drop;\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); }        {         gsub(/chr/,"",$1); printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); }  END   { printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); } result : seconds:1.252

Test 7
using padded string for _id and min:max for searching

BEGIN { printf("db.snps.drop;\n"); printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n"); printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n"); }        {         gsub(/chr/,"",$1); printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3); }  END   { printf("var tStart=new Date.getTime;\n"); printf("for(i=0;i< 1000;++i)\n{\n"); printf("var pos1 =Math.floor(Math.random*50000000);\n"); printf("var pos2 =pos1 + Math.floor(Math.random*10000);\n"); printf("var c=db.snps.find.min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n"); printf("while(c.hasNext) c.next;\n"); printf("}\n"); printf("print(\"seconds:\"+(new Date.getTime-tStart)/1000)"); }

result 3.189

Test 8
I used a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"});

but i was not able to query this table with $gt/$lt: > db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count 0 > db.snps.find({_id:{chrom:"chr22",position:14430966}}).count 1