diff --git a/.gitignore b/.gitignore index 13b0e8aa..f3f424ee 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ target/** *.jar *.iml +/target/ diff --git a/src/main/java/com/binatechnologies/varsim/DGVparser.java b/src/main/java/com/binatechnologies/varsim/DGVparser.java index 1e6a6a7b..37034067 100644 --- a/src/main/java/com/binatechnologies/varsim/DGVparser.java +++ b/src/main/java/com/binatechnologies/varsim/DGVparser.java @@ -7,6 +7,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.util.Random; /** @@ -15,6 +16,8 @@ public class DGVparser extends variantFileParser { private final static Logger log = Logger.getLogger(DGVparser.class.getName()); + Random _rand = null; + private SimpleReference _reference; /** @@ -23,7 +26,9 @@ public class DGVparser extends variantFileParser { * @param fileName DGV flat file filename * @param reference Reference genome */ - public DGVparser(String fileName, SimpleReference reference) { + public DGVparser(String fileName, SimpleReference reference, Random rand) { + _rand = rand; + try { _br = new BufferedReader(new InputStreamReader(decompressStream(fileName))); readLine(); // skip the first line @@ -182,8 +187,7 @@ public Variant parseLine() { byte[] phase = {1, 1}; return new Variant(chr_name, chr_idx, start_loc, refs.length, refs, alts, phase, false, var_id, "PASS", String.valueOf((char) _reference - .byteAt(chr_idx, start_loc - 1)) - ); + .byteAt(chr_idx, start_loc - 1)),_rand); } } diff --git a/src/main/java/com/binatechnologies/varsim/RandBED2VCF.java b/src/main/java/com/binatechnologies/varsim/RandBED2VCF.java index 6b4cc511..ac927067 100644 --- a/src/main/java/com/binatechnologies/varsim/RandBED2VCF.java +++ b/src/main/java/com/binatechnologies/varsim/RandBED2VCF.java @@ -32,9 +32,9 @@ public class RandBED2VCF extends randVCFgenerator { @Option(name = "-max_len", usage = "Maximum variant length ["+MAX_LEN_ARG+"], inclusive") int max_length_lim = MAX_LEN_ARG; - static final int SEED_ARG = 333; + static final long SEED_ARG = 333; @Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]") - int seed = 333; + static long seed = 333; @Option(name = "-ref", usage = "Reference Genome [Required]",metaVar = "file",required = true) String reference_filename; @@ -57,6 +57,12 @@ public class RandBED2VCF extends randVCFgenerator { var_idx = 0; } + RandBED2VCF(long seed) { + super(seed); + num_novel_added = 0; + var_idx = 0; + } + /** * @param args */ @@ -81,11 +87,16 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var) throws IOException { // remember BED is 0-based Variant parse_bed_line(String line, Variant.Type type) { String[] ll = line.split("\t"); - if (ll.length < 4) return new Variant(); + if (ll.length < 4) return new Variant(_rand); int chr_idx = variantFileParser.getChromIndex(ll[0]); - int pos = Integer.parseInt(ll[1]); - int end = Integer.parseInt(ll[2]); + + if(chr_idx <= 0){ + return null; + } + + int pos = Integer.parseInt(ll[1]) + 1; //0-indexed + //int end = Integer.parseInt(ll[2]); String[] meta = ll[3].split(","); byte[] ins_seq = null; int len = 1; @@ -108,7 +119,12 @@ Variant parse_bed_line(String line, Variant.Type type) { if (type == Variant.Type.Deletion) { alts[0] = new FlexSeq(); var_idx_str = "del_"; - ref_seq = ref.byteRange(chr_idx, pos, end); + ref_seq = ref.byteRange(chr_idx, pos, pos + len); + + if(ref_seq == null){ + log.error("Range error: " + line); + } + } else if (type == Variant.Type.Insertion) { if(ins_seq != null) { alts[0] = new FlexSeq(ins_seq); @@ -128,11 +144,11 @@ Variant parse_bed_line(String line, Variant.Type type) { var_idx_str += var_idx; var_idx++; - Genotypes geno = new Genotypes(chr_idx, 1, rand); + Genotypes geno = new Genotypes(chr_idx, 1, _rand); return new Variant(ll[0], chr_idx, pos, ref_seq.length, ref_seq, alts, geno.geno, false, var_idx_str, "PASS", String.valueOf(ref - .charAt(chr_idx, pos - 1))); + .charAt(chr_idx, pos - 1)),_rand); } @@ -188,7 +204,7 @@ public void run(String[] args) { log.error("Bad lengths, max < min"); } - rand = new Random(seed); + _rand = new Random(seed); log.info("Reading reference"); ref = new SimpleReference(reference_filename); diff --git a/src/main/java/com/binatechnologies/varsim/RandDGV2VCF.java b/src/main/java/com/binatechnologies/varsim/RandDGV2VCF.java index bc8514ba..e62d5d65 100644 --- a/src/main/java/com/binatechnologies/varsim/RandDGV2VCF.java +++ b/src/main/java/com/binatechnologies/varsim/RandDGV2VCF.java @@ -17,7 +17,7 @@ public class RandDGV2VCF extends randVCFgenerator { static final int SEED_ARG = 333; @Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]") - int seed = SEED_ARG; + static int seed = SEED_ARG; static final int NUM_INS_ARG = 2000; @Option(name = "-num_ins", usage = "Number of insertion SV to sample ["+NUM_INS_ARG+"]") @@ -63,6 +63,11 @@ public class RandDGV2VCF extends randVCFgenerator { num_novel_added = 0; } + RandDGV2VCF(long seed) { + super(seed); + num_novel_added = 0; + } + /** * @param args command line arguments */ @@ -81,7 +86,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, int num_alt = var.get_num_alt(); // determine whether this one is novel - double rand_num = rand.nextDouble(); + double rand_num = _rand.nextDouble(); if (rand_num <= ratio_novel) { // make the variant novel, simply modify it // TODO maybe modifying it is bad @@ -96,7 +101,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, int end_val = Math.max(chr_len - buffer, Math.min(buffer, chr_len)); int time_out = 0; - int new_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1; + int new_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1; while (!var.setNovelPosition(new_pos, ref)) { if (time_out > 100) { log.warn("Error, cannot set novel position: " + (end_val - start_val + 1)); @@ -107,7 +112,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, log.info(time_out + " : " + new_pos + " : " + var.deletion()); - new_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1; + new_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1; time_out++; } @@ -152,7 +157,7 @@ public void run(String[] args) { System.exit(1); } - rand = new Random(seed); + _rand = new Random(seed); log.info("Reading reference"); SimpleReference ref = new SimpleReference(reference_filename); @@ -190,8 +195,8 @@ public void run(String[] args) { int total_lines = 0; int total_duplicate = 0; int total_out_of_range = 0; - DGVparser parser_one = new DGVparser(dgv_filename, ref); - Variant prev_var = new Variant(); + DGVparser parser_one = new DGVparser(dgv_filename, ref,_rand); + Variant prev_var = new Variant(_rand); // Read through a first time to generate the counts for sampling without replacement while (parser_one.hasMoreInput()) { @@ -204,7 +209,7 @@ public void run(String[] args) { int chr_idx = var.chromosome(); int num_alt = var.get_num_alt(); - Genotypes geno = new Genotypes(chr_idx, num_alt, rand); + Genotypes geno = new Genotypes(chr_idx, num_alt, _rand); selected_geno.add(geno); total_lines++; @@ -288,8 +293,8 @@ public void run(String[] args) { Sample_params INV_params = new Sample_params(); int geno_idx = 0; - parser_one = new DGVparser(dgv_filename, ref); - prev_var = new Variant(); + parser_one = new DGVparser(dgv_filename, ref,_rand); + prev_var = new Variant(_rand); // Read through and do the sampling diff --git a/src/main/java/com/binatechnologies/varsim/RandVCF2VCF.java b/src/main/java/com/binatechnologies/varsim/RandVCF2VCF.java index 2a349935..98bf5eed 100644 --- a/src/main/java/com/binatechnologies/varsim/RandVCF2VCF.java +++ b/src/main/java/com/binatechnologies/varsim/RandVCF2VCF.java @@ -92,7 +92,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, int chr_idx = var.chromosome(); // determine whether this one is novel - double rand_num = rand.nextDouble(); + double rand_num = _rand.nextDouble(); if (rand_num <= ratio_novel) { // make the variant novel, simply modify it // TODO maybe modifying it is bad @@ -106,7 +106,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, int end_val = Math.max(chr_len - buffer, Math.min(buffer, chr_len)); int time_out = 0; - int rand_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1; + int rand_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1; while (!var.setNovelPosition(rand_pos, ref)) { if (time_out > 100) { log.warn("Error: cannot set novel position: " + (end_val - start_val + 1)); @@ -115,7 +115,7 @@ void rand_output_vcf_record(BufferedWriter bw, Variant var, break; } - rand_pos = rand.nextInt(end_val - start_val + 1) + start_val + 1; + rand_pos = _rand.nextInt(end_val - start_val + 1) + start_val + 1; //log.info(time_out + " : " + var.deletion()); time_out++; @@ -158,7 +158,7 @@ public void run(String[] args) { SimpleReference ref = new SimpleReference(reference_filename); - rand = new Random(seed); + _rand = new Random(seed); // read through VCF file and count the log.info("Counting variants and assigning genotypes"); @@ -171,8 +171,8 @@ public void run(String[] args) { int total_num_COMPLEX = 0; int total_num_other = 0; int total_num = 0; - VCFparser parser_one = new VCFparser(vcf_filename, null, false); - Variant prev_var = new Variant(); + VCFparser parser_one = new VCFparser(vcf_filename, null, false,_rand); + Variant prev_var = new Variant(_rand); // read though once to count the totals, this is so we don't have // to store an array of the variants for sampling without replacement @@ -186,7 +186,7 @@ public void run(String[] args) { int chr_idx = var.chromosome(); int num_alt = var.get_num_alt(); - Genotypes geno = new Genotypes(chr_idx, num_alt, rand,prop_het); + Genotypes geno = new Genotypes(chr_idx, num_alt, _rand,prop_het); selected_geno.add(geno); if (prev_var.equals(var)) { @@ -268,8 +268,8 @@ public void run(String[] args) { int geno_idx = 0; - parser_one = new VCFparser(vcf_filename, null, false); - prev_var = new Variant(); + parser_one = new VCFparser(vcf_filename, null, false,_rand); + prev_var = new Variant(_rand); // Read through it a second time, this time we do the sampling while (parser_one.hasMoreInput()) { diff --git a/src/main/java/com/binatechnologies/varsim/SimpleReference.java b/src/main/java/com/binatechnologies/varsim/SimpleReference.java index 3066f389..32b4ace3 100644 --- a/src/main/java/com/binatechnologies/varsim/SimpleReference.java +++ b/src/main/java/com/binatechnologies/varsim/SimpleReference.java @@ -7,6 +7,7 @@ import net.sf.picard.reference.FastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; +import org.apache.log4j.Logger; import java.io.File; import java.util.ArrayList; @@ -14,6 +15,8 @@ public class SimpleReference { + private final static Logger log = Logger.getLogger(SimpleReference.class.getName()); + // chr_idx -> reference_string HashMap data; @@ -147,7 +150,7 @@ byte[] byteRange(int chr_idx, int start_loc, int end_loc) { Sequence contig = data.get(chr_idx); if (contig == null) { - System.err.println("Contig not found: " + chr_idx); + log.error("Contig not found: " + chr_idx); return null; } else { return contig.subSeq(start_loc, end_loc); diff --git a/src/main/java/com/binatechnologies/varsim/VCF2diploid.java b/src/main/java/com/binatechnologies/varsim/VCF2diploid.java index a97604cf..72f54355 100644 --- a/src/main/java/com/binatechnologies/varsim/VCF2diploid.java +++ b/src/main/java/com/binatechnologies/varsim/VCF2diploid.java @@ -22,6 +22,8 @@ public class VCF2diploid { private final static Logger log = Logger.getLogger(VCF2diploid.class.getName()); + private Random _rand = null; + public enum GenderType { FEMALE, MALE } @@ -42,6 +44,10 @@ public enum GenderType { @Option(name = "-pass", usage = "Only accept the PASS variants") private boolean _pass = false; + static final long SEED_ARG = 3333; + @Option(name = "-seed", usage = "Seed for random sampling ["+SEED_ARG+"]") + long seed = 3333; + private final static char DELETED_BASE = '~'; @@ -51,14 +57,14 @@ public enum GenderType { public VCF2diploid() { - + _rand = new Random(seed); } public void run(String[] args){ String VERSION = "VarSim " + getClass().getPackage().getImplementationVersion(); - String usage = "Creater a diploid genome as associated files from a reference genome\n" + String usage = "Create a diploid genome as associated files from a reference genome\n" +"and some VCF files. \n"; CmdLineParser cmd_parser = new CmdLineParser(this); @@ -94,7 +100,7 @@ public void run(String[] args){ } for (int i = 0; i < _vcfFiles.size(); i++) { - VCFparser parser = new VCFparser(_vcfFiles.get(i), _id, _pass); + VCFparser parser = new VCFparser(_vcfFiles.get(i), _id, _pass, _rand); int n_ev = 0, var_nucs = 0; while (parser.hasMoreInput()) { Variant var = parser.parseLine(); @@ -106,6 +112,12 @@ public void run(String[] args){ log.warn("Not maternal nor paternal"); continue; } + + if (var.maternal() < 0 || var.paternal() < 0) { + // randomize the genotype + var.randomizeGenotype(); + } + int chr = var.chromosome(); if (chr <= 0 || chr > _variants.length) { log.warn("Chr out of range, probably unplaced"); @@ -203,10 +215,8 @@ public void makeDiploid() { maternal_seq[c - 1] = paternal_seq[c - 1] = ref_seq.byteAt(c); } - Hashtable pat_ins_seq = new Hashtable( - 150); - Hashtable mat_ins_seq = new Hashtable( - 150); + Hashtable pat_ins_seq = new Hashtable(150); + Hashtable mat_ins_seq = new Hashtable(150); int n_var_pat = 0, n_var_mat = 0; int n_base_pat = 0, n_base_mat = 0; diff --git a/src/main/java/com/binatechnologies/varsim/VCFcompare.java b/src/main/java/com/binatechnologies/varsim/VCFcompare.java index ea66b946..fe59319f 100644 --- a/src/main/java/com/binatechnologies/varsim/VCFcompare.java +++ b/src/main/java/com/binatechnologies/varsim/VCFcompare.java @@ -167,19 +167,19 @@ private ArrayList convert_var_to_var_list(Variant var, boolean end) { if (var.getType() == Variant.OverallType.SNP) { no_split = true; } - if (var.paternal() == 0 && var.maternal() == 0) { + if (var.getgood_paternal() == 0 && var.getgood_maternal() == 0) { no_split = true; } - if (var.paternal() > 0 && var.getAlt(var.paternal()).getType() != FlexSeq.Type.SEQ) { + if (var.getgood_paternal() > 0 && var.getAlt(var.getgood_paternal()).getType() != FlexSeq.Type.SEQ) { no_split = true; } - if (var.maternal() > 0 && var.getAlt(var.maternal()).getType() != FlexSeq.Type.SEQ) { + if (var.getgood_maternal() > 0 && var.getAlt(var.getgood_maternal()).getType() != FlexSeq.Type.SEQ) { no_split = true; } - if (var.paternal() > 0 && var.getAlt(var.paternal()).length() == 0 && var.getRef().length == 0) { + if (var.getgood_paternal() > 0 && var.getAlt(var.getgood_paternal()).length() == 0 && var.getRef().length == 0) { no_split = true; } - if (var.maternal() > 0 && var.getAlt(var.maternal()).length() == 0 && var.getRef().length == 0) { + if (var.getgood_maternal() > 0 && var.getAlt(var.getgood_maternal()).length() == 0 && var.getRef().length == 0) { no_split = true; } @@ -189,8 +189,8 @@ private ArrayList convert_var_to_var_list(Variant var, boolean end) { } - if (var.getType(var.paternal()) != Variant.Type.Reference - && var.getType(var.maternal()) != Variant.Type.Reference) { + if (var.getType(var.getgood_paternal()) != Variant.Type.Reference + && var.getType(var.getgood_maternal()) != Variant.Type.Reference) { int[] allele = {var.get_allele(0), var.get_allele(1)}; byte[][] alt = {var.getAlt(allele[0]).getSeq(), var.getAlt(allele[1]).getSeq()}; @@ -507,6 +507,7 @@ public void setNum_true_correct(EnumStatsRatioCounter num_t // add to interval tree for (Variant curr_var : var_list) { + int curr_len = curr_var.max_len(); if (curr_len > max_len) { @@ -674,7 +675,10 @@ public void setNum_true_correct(EnumStatsRatioCounter num_t full_validated_count[idx.full_idx] += curr_var.max_len(); // this 'should' be overlap len validated_len += curr_var.max_len(); }else if(compute_as_split){ - output_blob.getNum_true_correct().addFP(curr_var.getType(), var.max_len()); + output_blob.getNum_true_correct().addFP(curr_var.getType(), curr_var.max_len()); + if(curr_var.getType() == Variant.OverallType.SNP && curr_var.max_len() > 1){ + log.warn("SNP with bad length: " + curr_var); + } FP_writer.println(var); } } @@ -683,6 +687,9 @@ public void setNum_true_correct(EnumStatsRatioCounter num_t if (!compute_as_split && validated_len < (total_len*overlap_ratio)) { // this is a false positive! output_blob.getNum_true_correct().addFP(curr_var_type, var.max_len()); + if(curr_var_type == Variant.OverallType.SNP && var.max_len() > 1){ + log.warn("SNP with bad length: " + var); + } FP_writer.println(var); } @@ -706,6 +713,7 @@ public void setNum_true_correct(EnumStatsRatioCounter num_t if (validated_len >= (overlap_ratio * total_len)) { // validated + output_blob.getNum_true_correct().addTP(var.getType(), var.max_len()); TP_writer.println(var); } else { @@ -961,9 +969,9 @@ public int compare_variant(Variant var, int geno, BitSet validated) { // check genotype if (true_var.isHom()) { // position is correct, check genotype - if (true_var.getType(true_var.paternal()) == Variant.Type.SNP + if (true_var.getType(true_var.getgood_paternal()) == Variant.Type.SNP && var.position() == true_var.position()) { - if (val == true_var.getAlt(true_var.paternal()).getSeq()[0]) { + if (val == true_var.getAlt(true_var.getgood_paternal()).getSeq()[0]) { matches_hom.add(new dual_idx(idx, full_idx)); } has_snp = true; diff --git a/src/main/java/com/binatechnologies/varsim/VCFparser.java b/src/main/java/com/binatechnologies/varsim/VCFparser.java index b584ea0c..a04de5b4 100644 --- a/src/main/java/com/binatechnologies/varsim/VCFparser.java +++ b/src/main/java/com/binatechnologies/varsim/VCFparser.java @@ -6,6 +6,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; +import java.util.Random; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -13,6 +14,8 @@ public class VCFparser extends variantFileParser { private final static Logger log = Logger.getLogger(VCFparser.class.getName()); + private Random _rand = null; + private int _id_ind = -1; private String _id = null; private boolean _pass = false; @@ -29,7 +32,8 @@ private VCFparser(){ * @param id ID of individual, essentially selects a column, null to use first ID column * @param pass If true, only output pass lines */ - public VCFparser(String fileName, String id, boolean pass) { + public VCFparser(String fileName, String id, boolean pass, Random rand) { + _rand = rand; try { _br = new BufferedReader(new InputStreamReader(decompressStream(fileName))); readLine(); @@ -45,6 +49,27 @@ public VCFparser(String fileName, String id, boolean pass) { } } + /** + * Reads a VCF file line by line + * + * @param fileName VCF file, doesn't have to be sorted or indexed + * @param id ID of individual, essentially selects a column, null to use first ID column + * @param pass If true, only output pass lines + */ + public VCFparser(String fileName, String id, boolean pass) { + this(fileName, id, pass, null); + } + + /** + * Reads a VCF file line by line, if there are multiple individuals, takes the first one + * + * @param fileName VCF file, doesn't have to be sorted or indexed + * @param pass If true, only output pass lines + */ + public VCFparser(String fileName, boolean pass, Random rand) { + this(fileName, null, pass, rand); + } + /** * Reads a VCF file line by line, if there are multiple individuals, takes the first one * @@ -52,7 +77,7 @@ public VCFparser(String fileName, String id, boolean pass) { * @param pass If true, only output pass lines */ public VCFparser(String fileName, boolean pass) { - this(fileName, null, pass); + this(fileName, null, pass, null); } /** @@ -268,11 +293,6 @@ else if (index == 9) { // Output format //return null; // reference alleles... ignore them for now.... } - if (phase_val[0] < 0 || phase_val[1] < 0) { - phase_val[0] = 1; - phase_val[1] = 1; - } - // determine copy-number // TODO need to be able to deal with unphased copy-numbers? byte[] copy_num_val = new byte[2]; // paternal-maternal @@ -314,13 +334,13 @@ else if (index == 9) { // Output format } // TODO this assumes only one alt return new Variant(chr_name, chr, pos, Math.abs(inv_lens[0]), refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else if (end_loc > 0) { int inv_len = Math.max(Math.abs(end_loc - pos + 1),1); alts = new FlexSeq[1]; alts[0] = new FlexSeq(FlexSeq.Type.INV, inv_len); return new Variant(chr_name, chr, pos, inv_len, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else { log.error("No length information for INV:"); log.error(line); @@ -356,7 +376,7 @@ else if (index == 9) { // Output format } return new Variant(chr_name, chr, pos, Math.abs(dup_lens[0]), refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else if (end_loc > 0) { int dup_len = Math.max(Math.abs(end_loc - pos + 1),1); alts = new FlexSeq[1]; @@ -364,7 +384,7 @@ else if (index == 9) { // Output format copy_num_val[0], copy_num_val[1])); return new Variant(chr_name, chr, pos, dup_len, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else { log.error("No length information for DUP:"); log.error(line); @@ -394,13 +414,13 @@ else if (index == 9) { // Output format alts[i] = new FlexSeq(FlexSeq.Type.INS,len_val); } return new Variant(chr_name, chr, pos, 0, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else if (end_loc > 0) { int ins_len = Math.max(Math.abs(end_loc - pos),1); alts = new FlexSeq[1]; alts[0] = new FlexSeq(FlexSeq.Type.INS, ins_len); return new Variant(chr_name, chr, pos, 0, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else { log.error("No length information for INS:"); log.error(line); @@ -426,13 +446,13 @@ else if (index == 9) { // Output format } return new Variant(chr_name, chr, pos, Math.abs(del_lens[0]), refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else if (end_loc > 0) { int del_len = end_loc - pos + 1; alts = new FlexSeq[1]; alts[0] = new FlexSeq(FlexSeq.Type.DEL, 0); return new Variant(chr_name, chr, pos, del_len, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } else { log.error("No length information for DEL:"); log.error(line); @@ -543,7 +563,7 @@ else if (index == 9) { // Output format return new Variant(chr_name, chr, pos, refs.length, refs, alts, - phase_val, is_phased, var_id, FILTER, ref_deleted); + phase_val, is_phased, var_id, FILTER, ref_deleted,_rand); } } diff --git a/src/main/java/com/binatechnologies/varsim/VCFstats.java b/src/main/java/com/binatechnologies/varsim/VCFstats.java index 30a95951..7e94cba1 100644 --- a/src/main/java/com/binatechnologies/varsim/VCFstats.java +++ b/src/main/java/com/binatechnologies/varsim/VCFstats.java @@ -171,8 +171,8 @@ class Parent_record { } public void add(Variant var, BedFile bed_file) { - int paternal_allele = var.paternal(); - int maternal_allele = var.maternal(); + int paternal_allele = var.getgood_paternal(); + int maternal_allele = var.getgood_maternal(); boolean added = false; if (bed_file == null @@ -284,7 +284,7 @@ private void run(String[] args) { if (var == null) { continue; } - if (var.maternal() == 0 && var.paternal() == 0) { + if (var.getgood_maternal() == 0 && var.getgood_paternal() == 0) { continue; } data.add(var, bed_file); diff --git a/src/main/java/com/binatechnologies/varsim/Variant.java b/src/main/java/com/binatechnologies/varsim/Variant.java index c3ee88e6..537ea6a2 100644 --- a/src/main/java/com/binatechnologies/varsim/Variant.java +++ b/src/main/java/com/binatechnologies/varsim/Variant.java @@ -11,7 +11,7 @@ public class Variant { private final static Logger log = Logger.getLogger(Variant.class.getName()); // use a seed for reproducibility, should be an option or global - private static final Random _rand = new Random(3333l); + private Random _rand = null; public int idx = 0; // this is hopefully a unique index, for the split variants public int full_idx = 0; // this is hopefully a unique index, for the whole variants @@ -31,13 +31,22 @@ public class Variant { // if it is the same as the first alt base private String _ref_deleted; - public Variant() { + public Variant(Random rand) { // TODO define some methods to determine if a Variant is uninitialised + _rand = rand; } public Variant(String chr_name, int chr, int pos, int del, byte[] ref, FlexSeq[] alts, byte[] phase, boolean isPhased, String var_id, String filter, String ref_deleted) { + this(chr_name,chr,pos,del,ref,alts,phase,isPhased,var_id,filter,ref_deleted,null); + } + + public Variant(String chr_name, int chr, int pos, int del, byte[] ref, + FlexSeq[] alts, byte[] phase, boolean isPhased, String var_id, String filter, + String ref_deleted, Random rand) { + this(rand); + _filter = filter; _chr_name = chr_name; _var_id = var_id; @@ -84,6 +93,7 @@ public Variant(final Variant var) { _paternal = var._paternal; _maternal = var._maternal; _isPhased = var._isPhased; + _rand = var._rand; } /** @@ -263,7 +273,12 @@ public Interval1D get_var_interval(int ind) { return new Interval1D(_pos, _pos); } - return new Interval1D(_pos, _pos + max_len(ind) - 1); + // TODO hmm unsafe + if(max_len(ind) == Integer.MAX_VALUE){ + return new Interval1D(_pos, _pos); + }else { + return new Interval1D(_pos, _pos + max_len(ind) - 1); + } }catch(RuntimeException e){ log.error("Bad variant interval: " + toString()); log.error("_pos: " + _pos); @@ -276,15 +291,15 @@ public Interval1D get_var_interval(int ind) { // union of intervals from the genotypes public Interval1D get_geno_interval() { - return get_interval(_paternal).union(get_interval(_maternal)); + return get_interval(getgood_paternal()).union(get_interval(getgood_maternal())); } public Interval1D get_geno_var_interval() { - return get_var_interval(_paternal).union(get_var_interval(_maternal)); + return get_var_interval(getgood_paternal()).union(get_var_interval(getgood_maternal())); } public Genotypes getGeno() { - return new Genotypes(_paternal, _maternal); + return new Genotypes(getgood_paternal(), getgood_maternal()); } /* @@ -294,9 +309,9 @@ public Genotypes getGeno() { */ public int get_allele(int parent) { if (parent == 0) { - return _paternal; + return getgood_paternal(); } else if (parent == 1) { - return _maternal; + return getgood_maternal(); } return -1; } @@ -555,6 +570,12 @@ public int get_num_alt() { } public void randomizeHaplotype() { + if(_rand == null){ + log.error("Cannot randomize haplotype"); + log.error(toString()); + System.exit(1); + } + if (_rand.nextDouble() > 0.5) { return; } @@ -564,6 +585,19 @@ public void randomizeHaplotype() { return; } + public void randomizeGenotype() { + if(_rand == null){ + log.error("Cannot randomize genotype"); + log.error(toString()); + System.exit(1); + } + + Genotypes g = new Genotypes(_chr,_alts.length,_rand); + _paternal = g.geno[0]; + _maternal = g.geno[1]; + return; + } + /* Returns true if the variant is homozygous */ @@ -692,9 +726,9 @@ private void buildVCFstr(StringBuilder sbStr) { } if (hasCN()) { - sbStr.append(String.valueOf(getCN(paternal()))); + sbStr.append(String.valueOf(getCN(getgood_paternal()))); sbStr.append("|"); - sbStr.append(String.valueOf(getCN(maternal()))); + sbStr.append(String.valueOf(getCN(getgood_maternal()))); sbStr.append(":"); } @@ -709,9 +743,9 @@ public String toString() { buildVCFstr(sbStr); - sbStr.append(paternal()); + sbStr.append(getgood_paternal()); sbStr.append("|"); - sbStr.append(maternal()); + sbStr.append(getgood_maternal()); return sbStr.toString(); } @@ -727,13 +761,29 @@ public String toString(int paternal, int maternal) { buildVCFstr(sbStr); // for this one we need to work out which one is added - sbStr.append(_paternal); + sbStr.append(paternal); sbStr.append("|"); - sbStr.append(_maternal); + sbStr.append(maternal); return sbStr.toString(); } + byte getgood_paternal(){ + if(_paternal < 0){ + return 1; + }else{ + return _paternal; + } + } + + byte getgood_maternal(){ + if(_maternal < 0){ + return 1; + }else{ + return _maternal; + } + } + // type for one allele public enum Type { diff --git a/src/main/java/com/binatechnologies/varsim/randVCFgenerator.java b/src/main/java/com/binatechnologies/varsim/randVCFgenerator.java index 00e11067..58e2f45e 100644 --- a/src/main/java/com/binatechnologies/varsim/randVCFgenerator.java +++ b/src/main/java/com/binatechnologies/varsim/randVCFgenerator.java @@ -101,20 +101,20 @@ public String toString() { abstract public class randVCFgenerator { - Random rand; + Random _rand; /** * This sets a default seed, ideally different between different runs */ randVCFgenerator() { - rand = new Random(3333); + _rand = new Random(3333); } /** * @param seed User specified seed for random numbers */ randVCFgenerator(long seed) { - rand = new Random(seed); + _rand = new Random(seed); } @@ -130,7 +130,7 @@ abstract public class randVCFgenerator { byte sample_genotype(byte geno, Sample_params seen_added, int num_sample, int num_total) { if (seen_added.added_num < num_sample) { - double rand_num = rand.nextDouble(); + double rand_num = _rand.nextDouble(); if ((num_total - seen_added.seen_num) * rand_num >= (num_sample - seen_added.added_num)) { seen_added.seen_num++; return 0; @@ -162,13 +162,13 @@ void fill_in_seq(Variant var, byte[] insert_seq, int geno) { int seg_len = (int) Math.ceil(insert_seq.length / 10.0); for (int i = 0; i < len; i += seg_len) { // choose random start loc - int rand_start = rand.nextInt(new_seq.length - seg_len); + int rand_start = _rand.nextInt(new_seq.length - seg_len); for (int j = i; (j < len && j < (i + seg_len)); j++) { new_seq[j] = insert_seq[rand_start + j - i]; } } } else { - int rand_start = rand.nextInt(insert_seq.length - len); + int rand_start = _rand.nextInt(insert_seq.length - len); for (int j = 0; j < len; j++) { new_seq[j] = insert_seq[rand_start + j]; } @@ -193,6 +193,14 @@ void fill_in_seq(Variant var, byte[] insert_seq, int geno) { void output_vcf_record(BufferedWriter bw, Variant var, int geno0, int geno1) throws IOException { + // ignore ACTGN + String ref = var.getOrig_Ref().toUpperCase(); + String alt = var.alt_string().toString().toUpperCase(); + + if(!ref.matches("[ACTGN]*") || !alt.matches("[ACTGN,]*")){ + return; // don't output if it is not ACTGN + } + // chromosome name bw.write(var.getChr_name()); bw.write("\t"); @@ -203,10 +211,10 @@ void output_vcf_record(BufferedWriter bw, Variant var, int geno0, int geno1) bw.write(var.getVar_id()); bw.write("\t"); // ref allele - bw.write(var.getOrig_Ref()); + bw.write(ref); bw.write("\t"); // alt alleles - bw.write(var.alt_string().toString()); + bw.write(alt); bw.write("\t"); // variant quality bw.write(".\t"); diff --git a/varsim.py b/varsim.py index 6160c2a1..77dea56f 100755 --- a/varsim.py +++ b/varsim.py @@ -36,7 +36,7 @@ def get_contigs_list(reference): if not os.path.isfile(default_liftover): default_liftover = None if not os.path.isfile(default_vcfstats): default_vcfstats = None -main_parser = argparse.ArgumentParser(description="VarSim: An accurate reads simulator", formatter_class=argparse.ArgumentDefaultsHelpFormatter) +main_parser = argparse.ArgumentParser(description="VarSim: An accurate variant and reads simulator", formatter_class=argparse.ArgumentDefaultsHelpFormatter) main_parser.add_argument("--out_dir", metavar="Out directory", help="Output directory", required=False, default="out") main_parser.add_argument("--work_dir", metavar="Work directory", help="Work directory", required=False, default="work") main_parser.add_argument("--log_dir", metavar="Log directory", help="Directory to log to", required=False, default="log") @@ -232,7 +232,10 @@ def check_executable(fpath): vcf2diploid_stderr = open(os.path.join(args.log_dir, "vcf2diploid.err"), "w") vcf_arg_list = sum([["-vcf", v] for v in args.vcfs], []) filter_arg_list = ["-pass"] if args.filter else [] - vcf2diploid_command = ["java", "-jar", os.path.realpath(args.vcf2diploid_jar.name), "-t", args.sex, "-id", args.id, "-chr", os.path.realpath(args.reference.name)] + filter_arg_list + vcf_arg_list + vcf2diploid_command = ["java", "-jar", os.path.realpath(args.vcf2diploid_jar.name), + "-t", args.sex, + "-id", args.id, + "-chr", os.path.realpath(args.reference.name)] + filter_arg_list + vcf_arg_list p_vcf2diploid = subprocess.Popen(vcf2diploid_command, stdout=vcf2diploid_stdout, stderr=vcf2diploid_stderr, cwd=args.out_dir) logger.info("Executing command " + " ".join(vcf2diploid_command) + " with pid " + str(p_vcf2diploid.pid)) diff --git a/varsim_somatic.py b/varsim_somatic.py new file mode 100755 index 00000000..eb1f59d2 --- /dev/null +++ b/varsim_somatic.py @@ -0,0 +1,296 @@ +#!/usr/bin/python + +# this is a temporary attempt to get a workable somatic workflow + +import argparse +import os +import sys +import subprocess +import logging +import shutil +import time +import signal +import multiprocessing as mp +from multiprocessing import Process + +def get_contigs_list(reference): + with open("%s.fai" % (reference)) as fai_file: + contigs = [line.strip().split()[0] for line in fai_file.readlines()] + return contigs + +my_dir = os.path.dirname(os.path.realpath(__file__)) + +default_vcf2diploid = os.path.join(my_dir, "target/build/vcf2diploid.jar") +default_randvcf2vcf = os.path.join(my_dir, "target/build/randvcf2vcf.jar") +default_liftover = os.path.join(my_dir, "target/build/fastq_liftover.jar") +default_vcfstats = os.path.join(my_dir, "target/build/vcfstats.jar") +default_varsim = os.path.join(my_dir, "varsim.py") + +require_randvcf2vcf = not os.path.isfile(default_randvcf2vcf) +require_vcf2diploid = not os.path.isfile(default_vcf2diploid) +require_liftover = not os.path.isfile(default_liftover) +require_vcfstats = not os.path.isfile(default_vcfstats) +require_varsim = not os.path.isfile(default_varsim) + +if not os.path.isfile(default_randvcf2vcf): default_randvcf2vcf = None +if not os.path.isfile(default_vcf2diploid): default_vcf2diploid = None +if not os.path.isfile(default_liftover): default_liftover = None +if not os.path.isfile(default_vcfstats): default_vcfstats = None +if not os.path.isfile(default_varsim): require_varsim = None + +main_parser = argparse.ArgumentParser(description="VarSim: somatic workflow", formatter_class=argparse.ArgumentDefaultsHelpFormatter) +main_parser.add_argument("--out_dir", metavar="Out directory", help="Output directory", required=False, default="somatic_out") +main_parser.add_argument("--work_dir", metavar="Work directory", help="Work directory", required=False, default="somatic_work") +main_parser.add_argument("--log_dir", metavar="Log directory", help="Directory to log to", required=False, default="somatic_log") +main_parser.add_argument("--reference", metavar="Reference", help="Reference file", required=True, type=file) +main_parser.add_argument("--seed", metavar="seed", help="Random number seed", type=int, default=0) +main_parser.add_argument("--sex", metavar="Sex", help="Sex of the person (male/female)", required=False, type=str, choices=["male", "female"], default="male") +main_parser.add_argument("--id", metavar="id", help="Sample ID", required=True) +main_parser.add_argument("--simulator", metavar="simulator", help="Read simulator", required=False, type=str, choices=["art", "dwgsim"], default="art") +main_parser.add_argument("--vcf2diploid_jar", metavar="vcf2diploid_jar", help="vcf2diploid jar", type=file, default=default_vcf2diploid, required=require_vcf2diploid) +main_parser.add_argument("--read_length", metavar="read_length", help="Length of reads", default=100, type=int) +main_parser.add_argument("--nlanes", metavar="nlanes", help="Number of lanes to generate", default=3, type=int) +main_parser.add_argument("--total_coverage", metavar="total_coverage", help="Total coverage", default=1.0, type=float) +main_parser.add_argument("--mean_fragment_size", metavar="mean_fragment_size", help="Mean fragment size", default=350, type=int) +main_parser.add_argument("--sd_fragment_size", metavar="sd_fragment_size", help="Standard deviation of fragment size", default=50, type=int) +main_parser.add_argument("--cosmic_vcf", metavar="cosmic_vcf", help="COSMIC database VCF", default=[], required=True) +main_parser.add_argument("--normal_vcf", metavar="normal_vcf", help="Normal VCF from previous VarSim run", default=[], required=True) +main_parser.add_argument("--liftover_jar", metavar="liftover_jar", help="LiftOver jar", type=file, default=default_liftover, required=require_liftover) +main_parser.add_argument("--force_five_base_encoding", action="store_true", help="Force bases to be ACTGN") +main_parser.add_argument("--filter", action="store_true", help="Only use PASS variants") +main_parser.add_argument("--keep_temp", action="store_true", help="Keep temporary files") +main_parser.add_argument("--vcfstats_jar", metavar="JAR", help="VCFStats jar", type=file, default=default_vcfstats, required=require_vcfstats) +main_parser.add_argument("--varsim_py", metavar="PYTHON", help="VarSim python script", type=file, default=default_varsim, required=require_varsim) + +pipeline_control_group = main_parser.add_argument_group("Pipeline control options. Disable parts of the pipeline.") +pipeline_control_group.add_argument("--disable_rand_vcf", action="store_true", help="Disable RandVCF2VCF somatic") +pipeline_control_group.add_argument("--disable_vcf2diploid", action="store_true", help="Disable vcf2diploid") +pipeline_control_group.add_argument("--disable_sim", action="store_true", help="Disable read simulation") + +#RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf +rand_vcf_group = main_parser.add_argument_group("RandVCF2VCF somatic options") +rand_vcf_group.add_argument("--som_num_snp", metavar="num_snp", help="Number of somatic SNPs", default=9000, type=int) +rand_vcf_group.add_argument("--som_num_ins", metavar="num_ins", help="Number of somatic insertions", default=1000, type=int); +rand_vcf_group.add_argument("--som_num_del", metavar="num_del", help="Number of somatic deletions", default=1000, type=int); +rand_vcf_group.add_argument("--som_num_mnp", metavar="num_mnp", help="Number of somatic MNPs", default=100, type=int) +rand_vcf_group.add_argument("--som_num_complex", metavar="num_complex", help="Number of somatic complex variants", default=100, type=int) +#rand_vcf_group.add_argument("--som_percent_novel", metavar="percent_novel", help="Percent novel", default=0, type=float) +rand_vcf_group.add_argument("--som_min_length_lim", metavar="min_length_lim", help="Min length lim", default=0, type=int) +rand_vcf_group.add_argument("--som_max_length_lim", metavar="max_length_lim", help="Max length lim", default=49, type=int) +#rand_vcf_group.add_argument("--som_vcf", metavar="in_vcf", help="Input somatic variant database VCF", type=file, required=False) +rand_vcf_group.add_argument("--som_prop_het", metavar="vc_prop_het", help="Proportion of somatic heterozygous variants", default=1.0, type=float) +rand_vcf_group.add_argument("--rand_vcf_jar", metavar="rand_vcf_jar", help="RandVCF2VCF jar", type=file, default=default_randvcf2vcf, required=require_randvcf2vcf) + + +dwgsim_group = main_parser.add_argument_group("DWGSIM options") +dwgsim_group.add_argument("--dwgsim_start_e", metavar="first_base_error_rate", help="Error rate on the first base", default=0.0001, type=float) +dwgsim_group.add_argument("--dwgsim_end_e", metavar="last_base_error_rate", help="Error rate on the last base", default=0.0015, type=float) +dwgsim_group.add_argument("--dwgsim_options", help="DWGSIM command-line options", default="", required=False) +dwgsim_group.add_argument("--dwgsim", metavar="dwgsim_executable", help="DWGSIM executable", type=file, required=False, default=None) + + +art_group = main_parser.add_argument_group("ART options") +art_group.add_argument("--profile_1", metavar="profile_file1", help="Profile for first end", default=None, type=file) +art_group.add_argument("--profile_2", metavar="profile_file2", help="Profile for second end", default=None, type=file) +art_group.add_argument("--art_options", help="ART command-line options", default="", required=False) +art_group.add_argument("--art", metavar="art_executable", help="ART executable", type=file, required=False, default=None) + +args = main_parser.parse_args() + +for d in [args.log_dir, args.out_dir, args.work_dir]: + if not os.path.exists(d): + os.makedirs(d) + +FORMAT = '%(levelname)s %(asctime)-15s %(message)s' +logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=logging.DEBUG, format=FORMAT) +logger = logging.getLogger(__name__) + +def run_shell_command(cmd, cmd_stdout, cmd_stderr, cmd_dir="."): + subproc = subprocess.Popen(cmd, stdout=cmd_stdout, stderr=cmd_stderr, cwd=cmd_dir, shell=True, preexec_fn=os.setsid) + retcode = subproc.wait() + sys.exit(retcode) + +def monitor_multiprocesses(processes, logger): + for p in processes: + p.join() + if p.exitcode != 0: + logger.error("Process with pid %d failed with exit code %d" % (p.pid, pid.exitcode)) + else: + logger.info("Process with pid %d finished successfully" % (p.pid)) + +def monitor_processes(processes, logger): + while processes: + time.sleep(1) + kill_all = False + processes_running = [] + for p in processes: + status = p.poll() + if status is not None: + logger.info("Process %s exited with code %d" % (p.pid, status)) + if status != 0: + kill_all = True + logger.error("Process %s failed. Will kill the remaining processes." % (p.pid)) + else: + processes_running.append(p) + if kill_all: + for p in processes: + status = p.poll() + if status is None: + try: + os.killpg(p.pid, signal.SIGTERM) + except OSError, e: + try: + p.terminate() + except OSError, ex: + logger.write ("Could not kill the process\n") + sys.exit(1) + processes = processes_running + return [] + +def check_executable(fpath): + if not os.path.isfile(fpath): + sys.stderr.write("ERROR: File %s does not exist\n" % (fpath)) + sys.exit(1) + if not os.access(fpath, os.X_OK): + sys.stderr.write("ERROR: File %s is not executable\n" % (fpath)) + sys.exit(1) + +if not args.disable_sim: + if args.simulator == "dwgsim": + if args.dwgsim is None: + sys.stderr.write("ERROR: Please specify the DWGSIM binary with --dwgsim option\n") + sys.exit(1) + check_executable(args.dwgsim.name) + if args.simulator == "art": + if args.art is None: + sys.stderr.write("ERROR: Please specify the ART binary with --art option\n") + sys.exit(1) + check_executable(args.art.name) + +processes = [] + +t_s = time.time() + +vcf_files = [os.path.realpath(args.normal_vcf)] + +if not args.disable_rand_vcf: + rand_vcf_stdout = open(os.path.join(args.out_dir, "random.cosmic.vcf"), "w") + rand_vcf_stderr = open(os.path.join(args.log_dir, "random.cosmic.err"), "w") + vcf_files.insert(0,os.path.realpath(rand_vcf_stdout.name)) + + rand_vcf_command = ["java", "-jar", os.path.realpath(args.rand_vcf_jar.name), "-seed", str(args.seed), + "-num_snp", str(args.som_num_snp), + "-num_ins", str(args.som_num_ins), + "-num_del", str(args.som_num_del), + "-num_mnp", str(args.som_num_mnp), + "-num_complex", str(args.som_num_complex), + #"-novel", str(args.som_percent_novel), + "-min_len", str(args.som_min_length_lim), + "-max_len", str(args.som_max_length_lim), + "-ref", os.path.realpath(args.reference.name), + "-prop_het", str(args.som_prop_het), + "-vcf", os.path.realpath(args.cosmic_vcf)] + + p_rand_vcf = subprocess.Popen(rand_vcf_command, stdout=rand_vcf_stdout, stderr=rand_vcf_stderr) + logger.info("Executing command " + " ".join(rand_vcf_command) + " with pid " + str(p_rand_vcf.pid)) + processes.append(p_rand_vcf) + +processes = monitor_processes(processes, logger) + +processes = [] +for in_vcf in vcf_files: + out_prefix = os.path.basename(in_vcf) + vcfstats_stdout = open(os.path.join(args.out_dir, "%s.stats" % (out_prefix)), "w") + vcfstats_stderr = open(os.path.join(args.log_dir, "%s.vcfstats.err" % (out_prefix)), "w") + vcfstats_command = ["java", "-Xmx1g", "-Xms1g", "-jar", os.path.realpath(args.vcfstats_jar.name), "-vcf", in_vcf] + p_vcfstats = subprocess.Popen(vcfstats_command, stdout=vcfstats_stdout, stderr=vcfstats_stderr) + logger.info("Executing command " + " ".join(vcfstats_command) + " with pid " + str(p_vcfstats.pid)) + processes.append(p_vcfstats) + + +# Run VarSim +varsim_stdout = open(os.path.join(args.log_dir, "som_varsim.out"), "w") +varsim_stderr = open(os.path.join(args.log_dir, "som_varsim.log"), "w") + +vcf_arg_list = ["--vcfs"] + vcf_files + +# need to fix the store true ones +filter_arg_list = ["--filter"] if args.filter else [] +disable_sim_arg_list = ["--disable_sim"] if args.disable_sim else [] +force_five_base_encoding_arg_list = ["--force_five_base_encoding"] if args.force_five_base_encoding else [] +keep_temp_arg_list = ["--keep_temp"] if args.keep_temp else [] +dwgsim_arg_list = ["--dwgsim", args.dwgsim] if args.dwgsim is not None else [] +profile_1_arg_list = ["--profile_1", args.profile_1] if args.profile_1 is not None else [] +profile_2_arg_list = ["--profile_2", args.profile_2] if args.profile_2 is not None else [] +art_arg_list = ["--art", args.art.name] if args.art is not None else [] +varsim_command = ["python", os.path.realpath(args.varsim_py.name), + "--out_dir", str(os.path.realpath(args.out_dir)), + "--work_dir", str(os.path.realpath(args.work_dir)), + "--log_dir", str(os.path.realpath(args.log_dir)), + "--reference", str(os.path.realpath(args.reference.name)), + "--seed", str(args.seed), + "--sex", str(args.sex), + "--id", str(args.id), + "--simulator", str(args.simulator), + "--vcf2diploid_jar", str(os.path.realpath(args.vcf2diploid_jar.name)), + "--read_length", str(args.read_length), + "--nlanes", str(args.nlanes), + "--total_coverage", str(args.total_coverage), + "--mean_fragment_size", str(args.mean_fragment_size), + "--sd_fragment_size", str(args.sd_fragment_size), + "--liftover_jar", str(os.path.realpath(args.liftover_jar.name)), + "--vcfstats_jar", str(os.path.realpath(args.vcfstats_jar.name)), + "--dwgsim_start_e", str(args.dwgsim_start_e), + "--dwgsim_end_e", str(args.dwgsim_end_e), + "--dwgsim_options", str(args.dwgsim_options), + "--art_options", str(args.art_options), + "--disable_rand_vcf", + "--disable_rand_dgv" ] + vcf_arg_list + filter_arg_list + disable_sim_arg_list + force_five_base_encoding_arg_list + keep_temp_arg_list + dwgsim_arg_list + profile_1_arg_list + profile_2_arg_list + art_arg_list +p_varsim = subprocess.Popen(varsim_command, stdout=varsim_stdout, stderr=varsim_stderr) +logger.info("Executing command " + " ".join(varsim_command) + " with pid " + str(p_varsim.pid)) +processes.append(p_varsim) + + +processes = monitor_processes(processes, logger) + +# grep out the cosmic variants +# This is a bit dodgy +# grep -v "COS" art_cosmic/out/sv.truth.vcf > out/sv_norm.vcf & +# grep "COS" art_cosmic/out/sv.truth.vcf > out/sv_cosmic.vcf & + +grep_norm_stdout = open(os.path.join(args.out_dir, str(args.id) + "_norm.vcf"), "w") +grep_norm_stderr = open(os.path.join(args.log_dir, str(args.id) + "_norm.err"), "w") + +grep_norm_command = ["grep", "-v", "COS", os.path.realpath(str(args.out_dir) + "/" + str(args.id) + ".truth.vcf")] + +p_grep_norm = subprocess.Popen(grep_norm_command, stdout=grep_norm_stdout, stderr=grep_norm_stderr) +logger.info("Executing command " + " ".join(grep_norm_command) + " with pid " + str(p_grep_norm.pid)) +processes.append(p_grep_norm) + +grep_cos_stdout = open(os.path.join(args.out_dir, str(args.id) + "_somatic.vcf"), "w") +grep_cos_stderr = open(os.path.join(args.log_dir, str(args.id) + "_somatic.err"), "w") + +grep_cos_command = ["grep", "COS", os.path.realpath(str(args.out_dir) + "/" + str(args.id) + ".truth.vcf")] + +p_grep_cos = subprocess.Popen(grep_cos_command, stdout=grep_cos_stdout, stderr=grep_cos_stderr) +logger.info("Executing command " + " ".join(grep_cos_command) + " with pid " + str(p_grep_cos.pid)) +processes.append(p_grep_cos) + + +processes = monitor_processes(processes, logger) + +vcfstats_stdout = open(os.path.join(args.out_dir, "%s.norm.vcf.stats" % (args.id)), "w") +vcfstats_stderr = open(os.path.join(args.log_dir, "%s.norm.vcf.vcfstats.err" % (args.id)), "w") +p_vcfstats = subprocess.Popen(["java", "-Xmx1g", "-Xms1g", "-jar", os.path.realpath(args.vcfstats_jar.name), "-vcf", os.path.join(args.out_dir, str(args.id) + "_norm.vcf")], stdout=vcfstats_stdout, stderr=vcfstats_stderr) +logger.info("Executing command " + " ".join(vcfstats_command) + " with pid " + str(p_vcfstats.pid)) +processes.append(p_vcfstats) + +vcfstats_stdout = open(os.path.join(args.out_dir, "%s.somatic.vcf.stats" % (args.id)), "w") +vcfstats_stderr = open(os.path.join(args.log_dir, "%s.somatic.vcf.vcfstats.err" % (args.id)), "w") +p_vcfstats = subprocess.Popen(["java", "-Xmx1g", "-Xms1g", "-jar", os.path.realpath(args.vcfstats_jar.name), "-vcf", os.path.join(args.out_dir, str(args.id) + "_somatic.vcf")], stdout=vcfstats_stdout, stderr=vcfstats_stderr) +logger.info("Executing command " + " ".join(vcfstats_command) + " with pid " + str(p_vcfstats.pid)) +processes.append(p_vcfstats) + +monitor_processes(processes, logger) + +logger.info("Done! (%g hours)" % ((time.time() - t_s)/3600.0))