User:Lindenb/Notebook/UMR915/20101117

=Integragen= creating custom tracks for Cedric

=Test indexing the genome= test indexing the human genome with BDB JE: wordLength:9 maxHit:10000

du -h bdb 1.5G	bdb

[lindenb@srv-clc-02 INDEXGENOME]$ java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome -d bdb -f /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes -p index Indexing /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes/hg18.fa Found >chr10 (overflows: 0 keys:0 time=0mins) Found >chr10_random (overflows: 17885 keys:257706 time=4mins) Found >chr11 (overflows: 17945 keys:257713 time=4mins) Found >chr11_random (overflows: 52029 keys:260625 time=13mins) Found >chr12 (overflows: 52052 keys:260627 time=13mins) Found >chr13 (overflows: 93117 keys:261473 time=27mins) Found >chr13_random (overflows: 129736 keys:261703 time=38mins) Found >chr14 (overflows: 129810 keys:261704 time=38mins) Found >chr15 (overflows: 171271 keys:261877 time=51mins) Found >chr15_random (overflows: 240043 keys:261978 time=64mins) Found >chr16 (overflows: 240621 keys:261979 time=65mins) Found >chr16_random (overflows: 346684 keys:262049 time=80mins) Found >chr17 (overflows: 347054 keys:262049 time=80mins) Found >chr17_random (overflows: 556439 keys:262086 time=98mins) Found >chr18 (overflows: 561164 keys:262088 time=99mins) Found >chr18_random (overflows: 706759 keys:262104 time=116mins) Found >chr19 (overflows: 706765 keys:262104 time=116mins) Found >chr19_random (overflows: 1002241 keys:262114 time=130mins) Found >chr1 (overflows: 1002603 keys:262114 time=130mins) Found >chr1_random (overflows: 1758416 keys:262128 time=188mins) Found >chr20 (overflows: 1763859 keys:262128 time=189mins) Found >chr21 (overflows: 1996082 keys:262131 time=201mins) Found >chr21_random (overflows: 2111511 keys:262132 time=212mins) Found >chr22 (overflows: 2115003 keys:262132 time=213mins) Found >chr22_random (overflows: 2299994 keys:262134 time=221mins) Found >chr2 (overflows: 2300766 keys:262134 time=221mins) Found >chr2_random (overflows: 3162953 keys:262136 time=287mins) Found >chr3 (overflows: 3164380 keys:262136 time=287mins) Found >chr3_random (overflows: 3941130 keys:262137 time=346mins) Found >chr4 
(overflows: 3942941 keys:262137 time=346mins) Found >chr4_random (overflows: 4704560 keys:262139 time=412mins) Found >chr5 (overflows: 4706889 keys:262139 time=412mins) Found >chr5_random (overflows: 5536592 keys:262141 time=482mins) Found >chr6 (overflows: 5537193 keys:262141 time=482mins) Found >chr6_random (overflows: 6454185 keys:262141 time=553mins) Found >chr7 (overflows: 6462576 keys:262141 time=554mins) Found >chr7_random (overflows: 7532043 keys:262141 time=626mins) Found >chr8 (overflows: 7534266 keys:262141 time=626mins) Found >chr8_random (overflows: 8438001 keys:262141 time=691mins) Found >chr9 (overflows: 8441078 keys:262141 time=692mins) Found >chr9_random (overflows: 9325708 keys:262141 time=746mins) Found >chrM (overflows: 9330073 keys:262141 time=747mins) Found >chrX (overflows: 9330092 keys:262141 time=747mins) Found >chrX_random (overflows: 10386612 keys:262141 time=839mins) Found >chrY (overflows: 10401405 keys:262141 time=840mins) overflows: 10583406 keys: 262141 time=853mins

source
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FilenameFilter; import java.util.ArrayList; import java.util.List;

import com.sleepycat.bind.tuple.TupleBinding; import com.sleepycat.bind.tuple.TupleInput; import com.sleepycat.bind.tuple.TupleOutput; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.Environment; import com.sleepycat.je.EnvironmentConfig; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus;

/* * javac -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar -sourcepath ~/workspace/SANDBOX/src -d classes ~/workspace/SANDBOX/src/IndexTheGenome.java * java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome */ public class IndexTheGenome {	private File directory; private int wordLength=9; private Environment environment; private Database prefix2locs; private int maxHit=10000; private static class LocIndex {		byte seqIndex; int position; }	private class LocIndexBinding extends TupleBinding> {		@Override public List entryToObject(TupleInput in) {			int n=in.readInt; List list=new ArrayList(n); for(int i=0;i< n;++i) {				LocIndex loc=new LocIndex; loc.seqIndex=in.readByte; loc.position=in.readInt; list.add(loc); }			return list; }		@Override public void objectToEntry(List list, TupleOutput out) {			out.writeInt(list.size); for(LocIndex loc:list) {				out.writeByte(loc.seqIndex); out.writeInt(loc.position); }			}		}	IndexTheGenome {		}	private void open throws Exception {		close; EnvironmentConfig envCfg=new EnvironmentConfig; envCfg.setAllowCreate(true); envCfg.setReadOnly(false); envCfg.setTransactional(false); envCfg.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000"); this.environment=new Environment(this.directory, envCfg); DatabaseConfig cfg=new DatabaseConfig; cfg.setAllowCreate(true); cfg.setReadOnly(false); cfg.setTransactional(false); cfg.setDeferredWrite(true); this.prefix2locs=this.environment.openDatabase(null, "prefix2locs", cfg); }	private void close throws Exception {		try { if(this.prefix2locs!=null) {				this.prefix2locs.close; }			} 		catch (Exception e)			{ this.prefix2locs=null; }		try { if(this.environment!=null) {				this.environment.cleanLog; this.environment.close; }			} 		catch (Exception e)			{ this.environment=null; }		}	private void doIndex(File file) throws Exception {		int countOverflows=0; int countKeys=0; LocIndexBinding binding=new LocIndexBinding; DatabaseEntry key=new DatabaseEntry; DatabaseEntry 
value=new DatabaseEntry; byte array[]=new byte[this.wordLength]; int arraySize=0; byte seqIndex=-1; int genome=0; String line; List locs=null; System.err.println("Indexing "+file); long now=System.currentTimeMillis; BufferedReader in=new BufferedReader(new FileReader(file)); while((line=in.readLine)!=null) {			if(line.startsWith(">")) {				this.prefix2locs.sync; this.environment.cleanLog; System.err.println("Found "+line +" (overflows: "+countOverflows+" keys:"+countKeys+" time="+ (System.currentTimeMillis-now)/(1000*60)+"mins)"); ++seqIndex; arraySize=0; genome=0; }			else {				for(int i=0;i< line.length;++i) {					char c=Character.toUpperCase(line.charAt(i)); if(Character.isWhitespace(c)) continue; if(c=='A' || c=='T' || c=='G' || c=='C') {						array[arraySize++]=(byte)c; if(arraySize==array.length) {							key.setData(array); if(this.prefix2locs.get(null, key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS) {								locs=binding.entryToObject(value); }							else {								locs=new ArrayList(1); countKeys++; }							if(locs.size<this.maxHit) {								LocIndex index=new LocIndex; index.seqIndex=seqIndex; index.position=genome; locs.add(index); binding.objectToEntry(locs, value); this.prefix2locs.put(null, key, value); }							else {								++countOverflows; }							arraySize=0; }						}					else {						arraySize=0; }					genome++; }				}			}		in.close; System.err.println("overflows: "+countOverflows+" keys: "+countKeys+" time="+ (System.currentTimeMillis-now)/(1000*60)+"mins"); key=new DatabaseEntry; Cursor c=this.prefix2locs.openCursor(null, null); while(c.getNext(key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS) {			locs=binding.entryToObject(value); if(locs.size>=this.maxHit) c.delete; }		c.close; }	public static void main(String[] args) {		try { String program="undefined"; File fastaDir=null; IndexTheGenome app=new IndexTheGenome; int optind=0; while(optind<args.length) {				if(args[optind].equals("-h")) {					return; }				else 
if(args[optind].equals("-d")) {					app.directory=new File(args[++optind]); }				else if(args[optind].equals("-f")) {					fastaDir=new File(args[++optind]); }				else if(args[optind].equals("-w")) {					app.wordLength=Integer.parseInt(args[++optind]); }				else if(args[optind].equals("-p")) {					program=args[++optind]; }				else if(args[optind].equals("--")) {					optind++; break; }				else if(args[optind].startsWith("-")) {					System.err.println("Unnown option: "+args[optind]); return; }				else {					break; }				++optind; }			if(app.directory==null) {				System.err.println("Dir missing"); return ; }			app.open; if(program.equals("index")) {				if(fastaDir==null) {					System.err.println("FastaDir missing"); return ; }				for(File fasta:fastaDir.listFiles(new FilenameFilter {					@Override public boolean accept(File base, String s)						{ return s.endsWith(".fa"); }}))					{					app.doIndex(fasta); }				}			else {				System.err.println("undefined "+program); }			app.close; }		catch (Exception e)			{ e.printStackTrace; }		}	}