User:Lindenb/Notebook/UMR915/20101117
From OpenWetWare

Integragen
creating custom tracks for Cedric
Test indexing the genome
test indexing the human genome with BDB JE: wordLength:9 maxHit:10000
du -h bdb
1.5G bdb
[lindenb@srv-clc-02 INDEXGENOME]$ java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome -d bdb -f /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes -p index
Indexing /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes/hg18.fa
Found >chr10 (overflows: 0 keys:0 time=0mins)
Found >chr10_random (overflows: 17885 keys:257706 time=4mins)
Found >chr11 (overflows: 17945 keys:257713 time=4mins)
Found >chr11_random (overflows: 52029 keys:260625 time=13mins)
Found >chr12 (overflows: 52052 keys:260627 time=13mins)
Found >chr13 (overflows: 93117 keys:261473 time=27mins)
Found >chr13_random (overflows: 129736 keys:261703 time=38mins)
Found >chr14 (overflows: 129810 keys:261704 time=38mins)
Found >chr15 (overflows: 171271 keys:261877 time=51mins)
Found >chr15_random (overflows: 240043 keys:261978 time=64mins)
Found >chr16 (overflows: 240621 keys:261979 time=65mins)
Found >chr16_random (overflows: 346684 keys:262049 time=80mins)
Found >chr17 (overflows: 347054 keys:262049 time=80mins)
Found >chr17_random (overflows: 556439 keys:262086 time=98mins)
Found >chr18 (overflows: 561164 keys:262088 time=99mins)
Found >chr18_random (overflows: 706759 keys:262104 time=116mins)
Found >chr19 (overflows: 706765 keys:262104 time=116mins)
Found >chr19_random (overflows: 1002241 keys:262114 time=130mins)
Found >chr1 (overflows: 1002603 keys:262114 time=130mins)
Found >chr1_random (overflows: 1758416 keys:262128 time=188mins)
Found >chr20 (overflows: 1763859 keys:262128 time=189mins)
Found >chr21 (overflows: 1996082 keys:262131 time=201mins)
Found >chr21_random (overflows: 2111511 keys:262132 time=212mins)
Found >chr22 (overflows: 2115003 keys:262132 time=213mins)
Found >chr22_random (overflows: 2299994 keys:262134 time=221mins)
Found >chr2 (overflows: 2300766 keys:262134 time=221mins)
Found >chr2_random (overflows: 3162953 keys:262136 time=287mins)
Found >chr3 (overflows: 3164380 keys:262136 time=287mins)
Found >chr3_random (overflows: 3941130 keys:262137 time=346mins)
Found >chr4 (overflows: 3942941 keys:262137 time=346mins)
Found >chr4_random (overflows: 4704560 keys:262139 time=412mins)
Found >chr5 (overflows: 4706889 keys:262139 time=412mins)
Found >chr5_random (overflows: 5536592 keys:262141 time=482mins)
Found >chr6 (overflows: 5537193 keys:262141 time=482mins)
Found >chr6_random (overflows: 6454185 keys:262141 time=553mins)
Found >chr7 (overflows: 6462576 keys:262141 time=554mins)
Found >chr7_random (overflows: 7532043 keys:262141 time=626mins)
Found >chr8 (overflows: 7534266 keys:262141 time=626mins)
Found >chr8_random (overflows: 8438001 keys:262141 time=691mins)
Found >chr9 (overflows: 8441078 keys:262141 time=692mins)
Found >chr9_random (overflows: 9325708 keys:262141 time=746mins)
Found >chrM (overflows: 9330073 keys:262141 time=747mins)
Found >chrX (overflows: 9330092 keys:262141 time=747mins)
Found >chrX_random (overflows: 10386612 keys:262141 time=839mins)
Found >chrY (overflows: 10401405 keys:262141 time=840mins)
overflows: 10583406 keys: 262141 time=853mins
source
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.List;
import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
/*
* javac -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar -sourcepath ~/workspace/SANDBOX/src -d classes ~/workspace/SANDBOX/src/IndexTheGenome.java
* java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome
*/
/**
 * Command-line tool that indexes fixed-length DNA words of FASTA files into a
 * Berkeley DB Java Edition database named <code>prefix2locs</code>.
 *
 * <p>Each key is a word of {@link #wordLength} consecutive unambiguous bases
 * (A, C, G or T, compared case-insensitively); each value is the list of
 * (sequence index, position) pairs at which that word was recorded. Words are
 * taken back to back (the word buffer is emptied after every stored word), and
 * any other non-whitespace character empties the buffer as well.</p>
 *
 * <p>Usage: <code>java IndexTheGenome -d bdb-dir -p index -f fasta-dir [-w word-length]</code></p>
 */
public class IndexTheGenome
{
    /** Home directory of the Berkeley DB JE environment (option <code>-d</code>). */
    private File directory;
    /** Number of bases per indexed word (option <code>-w</code>). */
    private int wordLength=9;
    /** Open JE environment, or null when nothing is open. */
    private Environment environment;
    /** Open word-to-locations database, or null when nothing is open. */
    private Database prefix2locs;
    /** A word stops accumulating locations once its list reaches this size. */
    private int maxHit=10000;
    /**
     * Number of FASTA headers seen so far by this instance, across all files.
     * Kept at instance level so that sequence indexes stay unique when several
     * files are indexed in one run.
     */
    private int sequenceCount=0;

    /** One occurrence of a word: which sequence, and where in it. */
    private static class LocIndex
    {
        /** Index of the sequence (order of its FASTA header), or -1 before any header. */
        byte seqIndex;
        /**
         * Count of non-whitespace characters read in the sequence before the
         * character that completed the word, i.e. the 0-based offset of the
         * word's LAST base.
         * NOTE(review): confirm that consumers expect the end offset rather
         * than the start offset of the word.
         */
        int position;
    }

    /**
     * Serializes a list of locations as: int count, then for each location a
     * byte (sequence index) followed by an int (position).
     * Static because it never touches the enclosing instance.
     */
    private static class LocIndexBinding
        extends TupleBinding<List<LocIndex>>
    {
        @Override
        public List<LocIndex> entryToObject(TupleInput in)
        {
            int n=in.readInt();
            List<LocIndex> list=new ArrayList<IndexTheGenome.LocIndex>(n);
            for(int i=0;i< n;++i)
            {
                LocIndex loc=new LocIndex();
                loc.seqIndex=in.readByte();
                loc.position=in.readInt();
                list.add(loc);
            }
            return list;
        }

        @Override
        public void objectToEntry(List<LocIndex> list, TupleOutput out)
        {
            out.writeInt(list.size());
            for(LocIndex loc:list)
            {
                out.writeByte(loc.seqIndex);
                out.writeInt(loc.position);
            }
        }
    }

    IndexTheGenome()
    {
    }

    /**
     * Opens (creating them if needed) the non-transactional environment in
     * {@link #directory} and the deferred-write database <code>prefix2locs</code>.
     * Anything previously opened by this instance is closed first.
     *
     * @throws Exception if the environment or the database cannot be opened
     */
    private void open() throws Exception
    {
        close();
        EnvironmentConfig envCfg=new EnvironmentConfig();
        envCfg.setAllowCreate(true);
        envCfg.setReadOnly(false);
        envCfg.setTransactional(false);
        // Maximum size of a JE log file (value of the LOG_FILE_MAX parameter).
        envCfg.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000");
        this.environment=new Environment(this.directory, envCfg);
        DatabaseConfig cfg=new DatabaseConfig();
        cfg.setAllowCreate(true);
        cfg.setReadOnly(false);
        cfg.setTransactional(false);
        // Deferred-write mode; doIndex() calls sync() at every FASTA header.
        cfg.setDeferredWrite(true);
        this.prefix2locs=this.environment.openDatabase(null, "prefix2locs", cfg);
    }

    /**
     * Closes the database and then the environment, if open. Safe to call
     * repeatedly. Failures are reported on stderr rather than thrown so that
     * the remaining handles are still released; both fields are always reset
     * to null.
     */
    private void close()
    {
        if(this.prefix2locs!=null)
        {
            try
            {
                this.prefix2locs.close();
            }
            catch(Exception e)
            {
                System.err.println("Error while closing database: "+e);
            }
            finally
            {
                this.prefix2locs=null;
            }
        }
        if(this.environment!=null)
        {
            try
            {
                this.environment.cleanLog();
            }
            catch(Exception e)
            {
                // Log cleaning is an optimization; still close the environment.
                System.err.println("Error while cleaning log: "+e);
            }
            try
            {
                this.environment.close();
            }
            catch(Exception e)
            {
                System.err.println("Error while closing environment: "+e);
            }
            finally
            {
                this.environment=null;
            }
        }
    }

    /**
     * Returns the index for the next sequence and advances the counter.
     *
     * @throws IllegalStateException if the index would no longer fit in the
     *         signed byte used by {@link LocIndex#seqIndex}
     */
    private byte nextSequenceIndex()
    {
        if(this.sequenceCount>Byte.MAX_VALUE)
        {
            throw new IllegalStateException(
                "Too many sequences: at most "+(Byte.MAX_VALUE+1)+" can be indexed");
        }
        return (byte)this.sequenceCount++;
    }

    /**
     * Reads one FASTA file and adds every complete word to the database.
     *
     * <p>At each header line (starting with '&gt;') the database is synced, the
     * log cleaned, a progress line printed, and the per-sequence state reset.
     * A word whose list already holds {@link #maxHit} locations is not extended
     * and is counted as an overflow. Saturated words are NOT removed here; see
     * {@link #purgeSaturatedKeys()}.</p>
     *
     * @param file FASTA file to read
     * @throws Exception on I/O or database failure, or if too many sequences are met
     */
    private void doIndex(File file) throws Exception
    {
        int countOverflows=0;
        int countKeys=0;
        LocIndexBinding binding=new LocIndexBinding();
        DatabaseEntry key=new DatabaseEntry();
        DatabaseEntry value=new DatabaseEntry();
        byte array[]=new byte[this.wordLength];
        int arraySize=0;
        // Residues found before any header of this file are attributed to -1.
        byte seqIndex=-1;
        // Count of non-whitespace characters read since the last header.
        int genome=0;
        String line;
        List<LocIndex> locs=null;
        System.err.println("Indexing "+file);
        long now=System.currentTimeMillis();
        BufferedReader in=new BufferedReader(new FileReader(file));
        try
        {
            while((line=in.readLine())!=null)
            {
                if(line.startsWith(">"))
                {
                    this.prefix2locs.sync();
                    this.environment.cleanLog();
                    System.err.println("Found "+line +" (overflows: "+countOverflows+" keys:"+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins)");
                    seqIndex=nextSequenceIndex();
                    arraySize=0;
                    genome=0;
                }
                else
                {
                    for(int i=0;i< line.length();++i)
                    {
                        char c=Character.toUpperCase(line.charAt(i));
                        if(Character.isWhitespace(c)) continue;
                        if(c=='A' || c=='T' || c=='G' || c=='C')
                        {
                            array[arraySize++]=(byte)c;
                            if(arraySize==array.length)
                            {
                                key.setData(array);
                                if(this.prefix2locs.get(null, key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
                                {
                                    locs=binding.entryToObject(value);
                                }
                                else
                                {
                                    locs=new ArrayList<IndexTheGenome.LocIndex>(1);
                                    countKeys++;
                                }
                                if(locs.size()<this.maxHit)
                                {
                                    LocIndex index=new LocIndex();
                                    index.seqIndex=seqIndex;
                                    index.position=genome;
                                    locs.add(index);
                                    binding.objectToEntry(locs, value);
                                    this.prefix2locs.put(null, key, value);
                                }
                                else
                                {
                                    ++countOverflows;
                                }
                                // Start a fresh word: words do not overlap.
                                arraySize=0;
                            }
                        }
                        else
                        {
                            // Ambiguous base (e.g. N): drop the partial word.
                            arraySize=0;
                        }
                        genome++;
                    }
                }
            }
        }
        finally
        {
            in.close();
        }
        System.err.println("overflows: "+countOverflows+" keys: "+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins");
    }

    /**
     * Deletes every word whose location list has reached {@link #maxHit}.
     * Meant to run once after all files are indexed, so that a saturated word
     * cannot be deleted and then re-created by a later file.
     *
     * @throws Exception on database failure
     */
    private void purgeSaturatedKeys() throws Exception
    {
        LocIndexBinding binding=new LocIndexBinding();
        DatabaseEntry key=new DatabaseEntry();
        DatabaseEntry value=new DatabaseEntry();
        Cursor c=this.prefix2locs.openCursor(null, null);
        try
        {
            while(c.getNext(key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
            {
                List<LocIndex> locs=binding.entryToObject(value);
                if(locs.size()>=this.maxHit) c.delete();
            }
        }
        finally
        {
            c.close();
        }
    }

    /** Prints the command-line synopsis on stderr. */
    private static void usage()
    {
        System.err.println("Usage: java IndexTheGenome -d <bdb-dir> -p index -f <fasta-dir> [-w <word-length>]");
        System.err.println(" -h  print this help and exit");
        System.err.println(" -d  <dir> Berkeley DB JE environment directory (required)");
        System.err.println(" -p  <program> action to run; only 'index' is defined");
        System.err.println(" -f  <dir> directory scanned for *.fa files (required by 'index')");
        System.err.println(" -w  <int> word length (default: 9)");
    }

    /**
     * Entry point. Parses the options, validates them, then opens the
     * environment, indexes every <code>*.fa</code> file of the FASTA directory
     * and removes saturated words. The environment is always closed on exit.
     *
     * @param args command-line arguments, see {@link #usage()}
     */
    public static void main(String[] args)
    {
        IndexTheGenome app=new IndexTheGenome();
        try
        {
            String program="undefined";
            File fastaDir=null;
            int optind=0;
            while(optind<args.length)
            {
                String opt=args[optind];
                if(opt.equals("-h"))
                {
                    usage();
                    return;
                }
                else if(opt.equals("-d") || opt.equals("-f") || opt.equals("-w") || opt.equals("-p"))
                {
                    // All four options take a value; refuse a trailing option.
                    if(optind+1>=args.length)
                    {
                        System.err.println("Missing argument for option: "+opt);
                        return;
                    }
                    String arg=args[++optind];
                    if(opt.equals("-d"))
                    {
                        app.directory=new File(arg);
                    }
                    else if(opt.equals("-f"))
                    {
                        fastaDir=new File(arg);
                    }
                    else if(opt.equals("-w"))
                    {
                        try
                        {
                            app.wordLength=Integer.parseInt(arg);
                        }
                        catch(NumberFormatException e)
                        {
                            System.err.println("Bad word length: "+arg);
                            return;
                        }
                        if(app.wordLength<1)
                        {
                            System.err.println("Word length must be at least 1: "+arg);
                            return;
                        }
                    }
                    else
                    {
                        program=arg;
                    }
                }
                else if(opt.equals("--"))
                {
                    optind++;
                    break;
                }
                else if(opt.startsWith("-"))
                {
                    System.err.println("Unknown option: "+opt);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if(app.directory==null)
            {
                System.err.println("Dir missing");
                return ;
            }
            // Validate everything before opening, so that a bad command line
            // does not create or touch the environment.
            if(!program.equals("index"))
            {
                System.err.println("undefined "+program);
                return ;
            }
            if(fastaDir==null)
            {
                System.err.println("FastaDir missing");
                return ;
            }
            File[] fastas=fastaDir.listFiles(new FilenameFilter()
                {
                @Override
                public boolean accept(File base, String s)
                    {
                    return s.endsWith(".fa");
                    }
                });
            // listFiles returns null when the path is not a directory or cannot be read.
            if(fastas==null)
            {
                System.err.println("Cannot list FASTA files in "+fastaDir);
                return ;
            }
            // listFiles guarantees no order; sort so that sequence indexes,
            // which follow processing order, are reproducible between runs.
            java.util.Arrays.sort(fastas);
            app.open();
            for(File fasta:fastas)
            {
                app.doIndex(fasta);
            }
            app.purgeSaturatedKeys();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            // No-op when nothing was opened.
            app.close();
        }
    }
}