User:Lindenb/Notebook/UMR915/20101117

From OpenWetWare

Jump to: navigation, search

20101116        Top        20101118       


Integragen

creating custom tracks for Cedric

Test indexing the genome

test indexing the human genome with BDB JE: wordLength:9 maxHit:10000

du -h bdb
1.5G	bdb


[lindenb@srv-clc-02 INDEXGENOME]$  java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome -d bdb -f /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes -p index
Indexing /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes/hg18.fa
Found >chr10 (overflows: 0 keys:0 time=0mins)
Found >chr10_random (overflows: 17885 keys:257706 time=4mins)
Found >chr11 (overflows: 17945 keys:257713 time=4mins)
Found >chr11_random (overflows: 52029 keys:260625 time=13mins)
Found >chr12 (overflows: 52052 keys:260627 time=13mins)
Found >chr13 (overflows: 93117 keys:261473 time=27mins)
Found >chr13_random (overflows: 129736 keys:261703 time=38mins)
Found >chr14 (overflows: 129810 keys:261704 time=38mins)
Found >chr15 (overflows: 171271 keys:261877 time=51mins)
Found >chr15_random (overflows: 240043 keys:261978 time=64mins)
Found >chr16 (overflows: 240621 keys:261979 time=65mins)
Found >chr16_random (overflows: 346684 keys:262049 time=80mins)
Found >chr17 (overflows: 347054 keys:262049 time=80mins)
Found >chr17_random (overflows: 556439 keys:262086 time=98mins)
Found >chr18 (overflows: 561164 keys:262088 time=99mins)
Found >chr18_random (overflows: 706759 keys:262104 time=116mins)
Found >chr19 (overflows: 706765 keys:262104 time=116mins)
Found >chr19_random (overflows: 1002241 keys:262114 time=130mins)
Found >chr1 (overflows: 1002603 keys:262114 time=130mins)
Found >chr1_random (overflows: 1758416 keys:262128 time=188mins)
Found >chr20 (overflows: 1763859 keys:262128 time=189mins)
Found >chr21 (overflows: 1996082 keys:262131 time=201mins)
Found >chr21_random (overflows: 2111511 keys:262132 time=212mins)
Found >chr22 (overflows: 2115003 keys:262132 time=213mins)
Found >chr22_random (overflows: 2299994 keys:262134 time=221mins)
Found >chr2 (overflows: 2300766 keys:262134 time=221mins)
Found >chr2_random (overflows: 3162953 keys:262136 time=287mins)
Found >chr3 (overflows: 3164380 keys:262136 time=287mins)
Found >chr3_random (overflows: 3941130 keys:262137 time=346mins)
Found >chr4 (overflows: 3942941 keys:262137 time=346mins)
Found >chr4_random (overflows: 4704560 keys:262139 time=412mins)
Found >chr5 (overflows: 4706889 keys:262139 time=412mins)
Found >chr5_random (overflows: 5536592 keys:262141 time=482mins)
Found >chr6 (overflows: 5537193 keys:262141 time=482mins)
Found >chr6_random (overflows: 6454185 keys:262141 time=553mins)
Found >chr7 (overflows: 6462576 keys:262141 time=554mins)
Found >chr7_random (overflows: 7532043 keys:262141 time=626mins)
Found >chr8 (overflows: 7534266 keys:262141 time=626mins)
Found >chr8_random (overflows: 8438001 keys:262141 time=691mins)
Found >chr9 (overflows: 8441078 keys:262141 time=692mins)
Found >chr9_random (overflows: 9325708 keys:262141 time=746mins)
Found >chrM (overflows: 9330073 keys:262141 time=747mins)
Found >chrX (overflows: 9330092 keys:262141 time=747mins)
Found >chrX_random (overflows: 10386612 keys:262141 time=839mins)
Found >chrY (overflows: 10401405 keys:262141 time=840mins)
overflows: 10583406 keys: 262141 time=853mins

source

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.List;

import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

/*
 * javac -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar -sourcepath ~/workspace/SANDBOX/src -d classes  ~/workspace/SANDBOX/src/IndexTheGenome.java
 *  java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome
 */
public class IndexTheGenome
	{
	private File directory;
	private int wordLength=9;
	private Environment environment;
	private Database prefix2locs;
	private int maxHit=10000;
	
	
	private static class LocIndex
		{
		byte seqIndex;
		int position;
		}
	
	private class LocIndexBinding
		extends TupleBinding<List<LocIndex>>
		{
		@Override
		public List<LocIndex> entryToObject(TupleInput in)
			{
			int n=in.readInt();
			List<LocIndex> list=new ArrayList<IndexTheGenome.LocIndex>(n);
			for(int i=0;i< n;++i)
				{
				LocIndex loc=new LocIndex();
				loc.seqIndex=in.readByte();
				loc.position=in.readInt();
				list.add(loc);
				}
			return list;
			}
		@Override
		public void objectToEntry(List<LocIndex> list, TupleOutput out)
			{
			out.writeInt(list.size());
			for(LocIndex loc:list)
				{
				out.writeByte(loc.seqIndex);
				out.writeInt(loc.position);
				}
			}
		}
	
	IndexTheGenome()
		{
		
		}
	private void open() throws Exception
		{
		close();
		EnvironmentConfig envCfg=new EnvironmentConfig();
		envCfg.setAllowCreate(true);
		envCfg.setReadOnly(false);
		envCfg.setTransactional(false);
		
		envCfg.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000");
		this.environment=new Environment(this.directory, envCfg);
		DatabaseConfig cfg=new DatabaseConfig();
		cfg.setAllowCreate(true);
		cfg.setReadOnly(false);
		cfg.setTransactional(false);
		cfg.setDeferredWrite(true);
		this.prefix2locs=this.environment.openDatabase(null, "prefix2locs", cfg);
		}
	private void close() throws Exception
		{
		try {
			if(this.prefix2locs!=null)
				{
				this.prefix2locs.close();
				}
			} 
		catch (Exception e)
			{
			this.prefix2locs=null;
			}
		
		try {
			if(this.environment!=null)
				{
				this.environment.cleanLog();
				this.environment.close();
				}
			} 
		catch (Exception e)
			{
			this.environment=null;
			}
		}
	
	private void doIndex(File file) throws Exception
		{
		int countOverflows=0;
		int countKeys=0;
		LocIndexBinding binding=new LocIndexBinding();
		DatabaseEntry key=new DatabaseEntry();
		DatabaseEntry value=new DatabaseEntry();
		byte array[]=new byte[this.wordLength];
		int arraySize=0;
		byte seqIndex=-1;
		int genome=0;
		String line;
		List<LocIndex> locs=null;
		System.err.println("Indexing "+file);
		long now=System.currentTimeMillis();
		BufferedReader in=new BufferedReader(new FileReader(file));
		while((line=in.readLine())!=null)
			{
			if(line.startsWith(">"))
				{
				this.prefix2locs.sync();
				this.environment.cleanLog();
				System.err.println("Found "+line +" (overflows: "+countOverflows+" keys:"+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins)");
				++seqIndex;
				arraySize=0;
				genome=0;
				}
			else
				{
				for(int i=0;i< line.length();++i)
					{
					char c=Character.toUpperCase(line.charAt(i));
					if(Character.isWhitespace(c)) continue;
					if(c=='A' || c=='T' || c=='G' || c=='C')
						{
						array[arraySize++]=(byte)c;
						if(arraySize==array.length)
							{
							key.setData(array);
							if(this.prefix2locs.get(null, key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
								{
								locs=binding.entryToObject(value);
								}
							else
								{
								locs=new ArrayList<IndexTheGenome.LocIndex>(1);
								countKeys++;
								}
							
							if(locs.size()<this.maxHit)
								{
								LocIndex index=new LocIndex();
								index.seqIndex=seqIndex;
								index.position=genome;
								locs.add(index);
								binding.objectToEntry(locs, value);
								this.prefix2locs.put(null, key, value);
								}
							else
								{
								++countOverflows;
								}
							arraySize=0;
							}
						}
					else
						{
						arraySize=0;
						}
					genome++;
					}
				}
			}
		in.close();
		System.err.println("overflows: "+countOverflows+" keys: "+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins");
		
		key=new DatabaseEntry();
		Cursor c=this.prefix2locs.openCursor(null, null);
		while(c.getNext(key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
			{
			locs=binding.entryToObject(value);
			if(locs.size()>=this.maxHit) c.delete();
			}
		c.close();
		}
	
	public static void main(String[] args)
		{
		try {
			String program="undefined";
			File fastaDir=null;
			IndexTheGenome app=new IndexTheGenome();
			int optind=0;
			while(optind<args.length)
				{
				if(args[optind].equals("-h"))
					{
					return;
					}
				else if(args[optind].equals("-d"))
					{
					app.directory=new File(args[++optind]);
					}
				else if(args[optind].equals("-f"))
					{
					fastaDir=new File(args[++optind]);
					}
				else if(args[optind].equals("-w"))
					{
					app.wordLength=Integer.parseInt(args[++optind]);
					}
				else if(args[optind].equals("-p"))
					{
					program=args[++optind];
					}
				else if(args[optind].equals("--"))
					{
					optind++;
					break;
					}
				else if(args[optind].startsWith("-"))
					{
					System.err.println("Unnown option: "+args[optind]);
					return;
					}
				else
					{
					break;
					}
				++optind;
				}
			if(app.directory==null)
				{
				System.err.println("Dir missing");
				return ;
				}
			app.open();
			if(program.equals("index"))
				{
				if(fastaDir==null)
					{
					System.err.println("FastaDir missing");
					return ;
					}
				
				for(File fasta:fastaDir.listFiles(new FilenameFilter()
					{
					@Override
					public boolean accept(File base, String s)
						{
						return s.endsWith(".fa");
						}}))
					{
					app.doIndex(fasta);
					}
				}
			else
				{
				System.err.println("undefined "+program);
				}
			
			app.close();
			}
		catch (Exception e)
			{
			e.printStackTrace();
			}
		}
	}
Personal tools