Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/bioinform/varsim
Browse files Browse the repository at this point in the history
  • Loading branch information
johnmu committed Nov 19, 2014
2 parents 3880d68 + 11e2bb8 commit 84418b7
Show file tree
Hide file tree
Showing 14 changed files with 516 additions and 92 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ target/**
*.jar

*.iml
/target/
10 changes: 7 additions & 3 deletions src/main/java/com/binatechnologies/varsim/DGVparser.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Random;


/**
Expand All @@ -15,6 +16,8 @@
public class DGVparser extends variantFileParser {
private final static Logger log = Logger.getLogger(DGVparser.class.getName());

Random _rand = null;

private SimpleReference _reference;

/**
Expand All @@ -23,7 +26,9 @@ public class DGVparser extends variantFileParser {
* @param fileName DGV flat file filename
* @param reference Reference genome
*/
public DGVparser(String fileName, SimpleReference reference) {
public DGVparser(String fileName, SimpleReference reference, Random rand) {
_rand = rand;

try {
_br = new BufferedReader(new InputStreamReader(decompressStream(fileName)));
readLine(); // skip the first line
Expand Down Expand Up @@ -182,8 +187,7 @@ public Variant parseLine() {
byte[] phase = {1, 1};
return new Variant(chr_name, chr_idx, start_loc, refs.length, refs,
alts, phase, false, var_id, "PASS", String.valueOf((char) _reference
.byteAt(chr_idx, start_loc - 1))
);
.byteAt(chr_idx, start_loc - 1)),_rand);
}

}
34 changes: 25 additions & 9 deletions src/main/java/com/binatechnologies/varsim/RandBED2VCF.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ public class RandBED2VCF extends randVCFgenerator {
@Option(name = "-max_len", usage = "Maximum variant length ["+MAX_LEN_ARG+"], inclusive")
int max_length_lim = MAX_LEN_ARG;

static final int SEED_ARG = 333;
static final long SEED_ARG = 333;
@Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]")
int seed = 333;
static long seed = 333;

@Option(name = "-ref", usage = "Reference Genome [Required]",metaVar = "file",required = true)
String reference_filename;
Expand All @@ -57,6 +57,12 @@ public class RandBED2VCF extends randVCFgenerator {
var_idx = 0;
}

RandBED2VCF(long seed) {
super(seed);
num_novel_added = 0;
var_idx = 0;
}

/**
* @param args
*/
Expand All @@ -81,11 +87,16 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var) throws IOException {
// remember BED is 0-based
Variant parse_bed_line(String line, Variant.Type type) {
String[] ll = line.split("\t");
if (ll.length < 4) return new Variant();
if (ll.length < 4) return new Variant(_rand);

int chr_idx = variantFileParser.getChromIndex(ll[0]);
int pos = Integer.parseInt(ll[1]);
int end = Integer.parseInt(ll[2]);

if(chr_idx <= 0){
return null;
}

int pos = Integer.parseInt(ll[1]) + 1; //0-indexed
//int end = Integer.parseInt(ll[2]);
String[] meta = ll[3].split(",");
byte[] ins_seq = null;
int len = 1;
Expand All @@ -108,7 +119,12 @@ Variant parse_bed_line(String line, Variant.Type type) {
if (type == Variant.Type.Deletion) {
alts[0] = new FlexSeq();
var_idx_str = "del_";
ref_seq = ref.byteRange(chr_idx, pos, end);
ref_seq = ref.byteRange(chr_idx, pos, pos + len);

if(ref_seq == null){
log.error("Range error: " + line);
}

} else if (type == Variant.Type.Insertion) {
if(ins_seq != null) {
alts[0] = new FlexSeq(ins_seq);
Expand All @@ -128,11 +144,11 @@ Variant parse_bed_line(String line, Variant.Type type) {
var_idx_str += var_idx;
var_idx++;

Genotypes geno = new Genotypes(chr_idx, 1, rand);
Genotypes geno = new Genotypes(chr_idx, 1, _rand);

return new Variant(ll[0], chr_idx, pos, ref_seq.length, ref_seq, alts,
geno.geno, false, var_idx_str, "PASS", String.valueOf(ref
.charAt(chr_idx, pos - 1)));
.charAt(chr_idx, pos - 1)),_rand);

}

Expand Down Expand Up @@ -188,7 +204,7 @@ public void run(String[] args) {
log.error("Bad lengths, max < min");
}

rand = new Random(seed);
_rand = new Random(seed);

log.info("Reading reference");
ref = new SimpleReference(reference_filename);
Expand Down
25 changes: 15 additions & 10 deletions src/main/java/com/binatechnologies/varsim/RandDGV2VCF.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public class RandDGV2VCF extends randVCFgenerator {

static final int SEED_ARG = 333;
@Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]")
int seed = SEED_ARG;
static int seed = SEED_ARG;

static final int NUM_INS_ARG = 2000;
@Option(name = "-num_ins", usage = "Number of insertion SV to sample ["+NUM_INS_ARG+"]")
Expand Down Expand Up @@ -63,6 +63,11 @@ public class RandDGV2VCF extends randVCFgenerator {
num_novel_added = 0;
}

RandDGV2VCF(long seed) {
super(seed);
num_novel_added = 0;
}

/**
* @param args command line arguments
*/
Expand All @@ -81,7 +86,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,
int num_alt = var.get_num_alt();

// determine whether this one is novel
double rand_num = rand.nextDouble();
double rand_num = _rand.nextDouble();
if (rand_num <= ratio_novel) {
// make the variant novel, simply modify it
// TODO maybe modifying it is bad
Expand All @@ -96,7 +101,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,
int end_val = Math.max(chr_len - buffer, Math.min(buffer, chr_len));

int time_out = 0;
int new_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1;
int new_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1;
while (!var.setNovelPosition(new_pos, ref)) {
if (time_out > 100) {
log.warn("Error, cannot set novel position: " + (end_val - start_val + 1));
Expand All @@ -107,7 +112,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,

log.info(time_out + " : " + new_pos + " : " + var.deletion());

new_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1;
new_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1;
time_out++;
}

Expand Down Expand Up @@ -152,7 +157,7 @@ public void run(String[] args) {
System.exit(1);
}

rand = new Random(seed);
_rand = new Random(seed);

log.info("Reading reference");
SimpleReference ref = new SimpleReference(reference_filename);
Expand Down Expand Up @@ -190,8 +195,8 @@ public void run(String[] args) {
int total_lines = 0;
int total_duplicate = 0;
int total_out_of_range = 0;
DGVparser parser_one = new DGVparser(dgv_filename, ref);
Variant prev_var = new Variant();
DGVparser parser_one = new DGVparser(dgv_filename, ref,_rand);
Variant prev_var = new Variant(_rand);

// Read through a first time to generate the counts for sampling without replacement
while (parser_one.hasMoreInput()) {
Expand All @@ -204,7 +209,7 @@ public void run(String[] args) {
int chr_idx = var.chromosome();
int num_alt = var.get_num_alt();

Genotypes geno = new Genotypes(chr_idx, num_alt, rand);
Genotypes geno = new Genotypes(chr_idx, num_alt, _rand);
selected_geno.add(geno);
total_lines++;

Expand Down Expand Up @@ -288,8 +293,8 @@ public void run(String[] args) {
Sample_params INV_params = new Sample_params();

int geno_idx = 0;
parser_one = new DGVparser(dgv_filename, ref);
prev_var = new Variant();
parser_one = new DGVparser(dgv_filename, ref,_rand);
prev_var = new Variant(_rand);


// Read through and do the sampling
Expand Down
18 changes: 9 additions & 9 deletions src/main/java/com/binatechnologies/varsim/RandVCF2VCF.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,
int chr_idx = var.chromosome();

// determine whether this one is novel
double rand_num = rand.nextDouble();
double rand_num = _rand.nextDouble();
if (rand_num <= ratio_novel) {
// make the variant novel, simply modify it
// TODO maybe modifying it is bad
Expand All @@ -106,7 +106,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,
int end_val = Math.max(chr_len - buffer, Math.min(buffer, chr_len));

int time_out = 0;
int rand_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1;
int rand_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1;
while (!var.setNovelPosition(rand_pos, ref)) {
if (time_out > 100) {
log.warn("Error: cannot set novel position: " + (end_val - start_val + 1));
Expand All @@ -115,7 +115,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var,
break;
}

rand_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1;
rand_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1;
//log.info(time_out + " : " + var.deletion());

time_out++;
Expand Down Expand Up @@ -158,7 +158,7 @@ public void run(String[] args) {

SimpleReference ref = new SimpleReference(reference_filename);

rand = new Random(seed);
_rand = new Random(seed);

// read through VCF file and count the
log.info("Counting variants and assigning genotypes");
Expand All @@ -171,8 +171,8 @@ public void run(String[] args) {
int total_num_COMPLEX = 0;
int total_num_other = 0;
int total_num = 0;
VCFparser parser_one = new VCFparser(vcf_filename, null, false);
Variant prev_var = new Variant();
VCFparser parser_one = new VCFparser(vcf_filename, null, false,_rand);
Variant prev_var = new Variant(_rand);

// read though once to count the totals, this is so we don't have
// to store an array of the variants for sampling without replacement
Expand All @@ -186,7 +186,7 @@ public void run(String[] args) {
int chr_idx = var.chromosome();
int num_alt = var.get_num_alt();

Genotypes geno = new Genotypes(chr_idx, num_alt, rand,prop_het);
Genotypes geno = new Genotypes(chr_idx, num_alt, _rand,prop_het);
selected_geno.add(geno);

if (prev_var.equals(var)) {
Expand Down Expand Up @@ -268,8 +268,8 @@ public void run(String[] args) {

int geno_idx = 0;

parser_one = new VCFparser(vcf_filename, null, false);
prev_var = new Variant();
parser_one = new VCFparser(vcf_filename, null, false,_rand);
prev_var = new Variant(_rand);

// Read through it a second time, this time we do the sampling
while (parser_one.hasMoreInput()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@

import net.sf.picard.reference.FastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import org.apache.log4j.Logger;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;


public class SimpleReference {
private final static Logger log = Logger.getLogger(SimpleReference.class.getName());

// chr_idx -> reference_string
HashMap<Integer, Sequence> data;

Expand Down Expand Up @@ -147,7 +150,7 @@ byte[] byteRange(int chr_idx, int start_loc, int end_loc) {

Sequence contig = data.get(chr_idx);
if (contig == null) {
System.err.println("Contig not found: " + chr_idx);
log.error("Contig not found: " + chr_idx);
return null;
} else {
return contig.subSeq(start_loc, end_loc);
Expand Down
24 changes: 17 additions & 7 deletions src/main/java/com/binatechnologies/varsim/VCF2diploid.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
public class VCF2diploid {
private final static Logger log = Logger.getLogger(VCF2diploid.class.getName());

private Random _rand = null;

public enum GenderType {
FEMALE, MALE
}
Expand All @@ -42,6 +44,10 @@ public enum GenderType {
@Option(name = "-pass", usage = "Only accept the PASS variants")
private boolean _pass = false;

static final long SEED_ARG = 3333;
@Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]")
long seed = 3333;

private final static char DELETED_BASE = '~';


Expand All @@ -51,14 +57,14 @@ public enum GenderType {

public VCF2diploid() {


_rand = new Random(seed);

}

public void run(String[] args){
String VERSION = "VarSim " + getClass().getPackage().getImplementationVersion();

String usage = "Creater a diploid genome as associated files from a reference genome\n"
String usage = "Create a diploid genome as associated files from a reference genome\n"
+"and some VCF files. \n";

CmdLineParser cmd_parser = new CmdLineParser(this);
Expand Down Expand Up @@ -94,7 +100,7 @@ public void run(String[] args){
}

for (int i = 0; i < _vcfFiles.size(); i++) {
VCFparser parser = new VCFparser(_vcfFiles.get(i), _id, _pass);
VCFparser parser = new VCFparser(_vcfFiles.get(i), _id, _pass, _rand);
int n_ev = 0, var_nucs = 0;
while (parser.hasMoreInput()) {
Variant var = parser.parseLine();
Expand All @@ -106,6 +112,12 @@ public void run(String[] args){
log.warn("Not maternal nor paternal");
continue;
}

if (var.maternal() < 0 || var.paternal() < 0) {
// randomize the genotype
var.randomizeGenotype();
}

int chr = var.chromosome();
if (chr <= 0 || chr > _variants.length) {
log.warn("Chr out of range, probably unplaced");
Expand Down Expand Up @@ -203,10 +215,8 @@ public void makeDiploid() {
maternal_seq[c - 1] = paternal_seq[c - 1] = ref_seq.byteAt(c);
}

Hashtable<Integer, FlexSeq> pat_ins_seq = new Hashtable<Integer, FlexSeq>(
150);
Hashtable<Integer, FlexSeq> mat_ins_seq = new Hashtable<Integer, FlexSeq>(
150);
Hashtable<Integer, FlexSeq> pat_ins_seq = new Hashtable<Integer, FlexSeq>(150);
Hashtable<Integer, FlexSeq> mat_ins_seq = new Hashtable<Integer, FlexSeq>(150);

int n_var_pat = 0, n_var_mat = 0;
int n_base_pat = 0, n_base_mat = 0;
Expand Down
Loading

0 comments on commit 84418b7

Please sign in to comment.