Skip to content

Commit

Permalink
Merge pull request #88 from bioinform/improve-quickstart
Browse files Browse the repository at this point in the history
Improve quickstart
  • Loading branch information
marghoob authored Jan 10, 2017
2 parents 9a6fa93 + dbd3ba7 commit 5b627ca
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 29 deletions.
20 changes: 14 additions & 6 deletions quickstart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,32 @@ set -x

b37_source="ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
dbsnp_source="ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b147_GRCh37p13/VCF/All_20160601.vcf.gz"
varsim_version="0.7.1"
samtools_version="1.3.1"

# Download varsim
wget https://github.com/bioinform/varsim/releases/download/v0.7.1/varsim-0.7.1.tar.gz
tar xfz varsim-0.7.1.tar.gz
wget https://github.com/bioinform/varsim/releases/download/v$varsim_version/varsim-$varsim_version.tar.gz
tar xfz varsim-$varsim_version.tar.gz

# Download reference and variant databases
wget $b37_source -O - | gunzip -c > hs37d5.fa
wget http://web.stanford.edu/group/wonglab/varsim/insert_seq.txt
wget http://web.stanford.edu/group/wonglab/varsim/GRCh37_hg19_supportingvariants_2013-07-23.txt
wget $dbsnp_source -O All.vcf.gz

# Download samtools and install ART simulator
mkdir -pv samtools ART

# Download samtools and index reference
mkdir samtools
pushd .
cd samtools
wget http://sourceforge.net/projects/samtools/files/samtools/1.0/samtools-bcftools-htslib-1.0_x64-linux.tar.bz2
tar xfj samtools-bcftools-htslib-1.0_x64-linux.tar.bz2
cd ..
samtools/samtools-bcftools-htslib-1.0_x64-linux/bin/samtools faidx hs37d5.fa
wget https://github.com/samtools/samtools/releases/download/$samtools_version/samtools-$version.tar.bz2
tar xfj samtools-$samtools_version.tar.bz2
cd samtools-$samtools_version && make
popd

samtools/samtools-$samtools_version/samtools faidx hs37d5.fa

# Download ART
mkdir ART
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.bina.varsim.VarSimToolNamespace;
import com.bina.varsim.types.Genotypes;
import com.bina.varsim.types.variant.Variant;
import com.bina.varsim.util.SimpleReference;
import com.bina.varsim.util.VCFparser;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
Expand All @@ -24,6 +25,9 @@ public class RandSequenceVCF extends RandVCFgenerator {
@Option(name = "-out_vcf", usage = "Output VCF to generate")
File outFile = null;

@Option(name = "-ref", usage = "Reference FASTA", required = true)
File ref = null;

public RandSequenceVCF(final String command, final String description) {
super(command, description);
}
Expand All @@ -37,6 +41,8 @@ public void run(String[] args) throws IOException {
return;
}

final SimpleReference reference = new SimpleReference(ref.getPath());

rand = new Random(seed);

final byte[] samplingSequence = fileToByteArray(sequenceFile);
Expand All @@ -50,6 +56,8 @@ public void run(String[] args) throws IOException {
if (var == null) {
continue;
}
var.calculateExtraBase(reference.getSequence(var.getChr()));

final Genotypes geno = var.getGenotypes();
fillInSeq(var, samplingSequence, geno.geno[0]);
fillInSeq(var, samplingSequence, geno.geno[1]);
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/com/bina/varsim/types/variant/Variant.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ public Variant(final Random rand) {

/**
* whether have flag ISINV?
* @param idx
* @return
*/
public boolean isInversed() {
Expand Down Expand Up @@ -978,12 +977,18 @@ public String getLengthString() {
}

public void calculateExtraBase(final Sequence refSeq) {
if (refDeleted.length() > 0) {
return;
}
for (final Alt alt : alts) {
if (alt.isSeq() && alt.length() == 0 && getPos() + referenceAlleleLength < refSeq.length()) {
//why extrabase is only 1-bp long?
extraBase = String.valueOf((char) refSeq.byteAt(getPos() + referenceAlleleLength));
}
}
if (ref.length == 0) {
extraBase = String.valueOf((char) refSeq.byteAt(getPos()));
}
}

/**
Expand Down
12 changes: 2 additions & 10 deletions src/main/java/com/bina/varsim/util/VCFparser.java
Original file line number Diff line number Diff line change
Expand Up @@ -589,13 +589,7 @@ a translocation is decomposed into a duplication (a special translocation) and a

//what does clipLength represent?
int clipLength = 0;
for (int j = 0; j < alternativeAlleleLength; j++) {

// make sure there is at least something in ref
if (referenceAlleleLength <= j) {
clipLength = j;
break;
}
for (int j = 0; j < Math.min(alternativeAlleleLength, referenceAlleleLength); j++) {
//this is based on the assumption that all characters are ASCII characters
if (REF.charAt(referenceAlleleLength - j - 1) != alts[i].byteAt(alternativeAlleleLength - j - 1)) {
clipLength = j;
Expand All @@ -604,9 +598,7 @@ a translocation is decomposed into a duplication (a special translocation) and a
clipLength = j + 1;
}

if (minClipLength > clipLength) {
minClipLength = clipLength;
}
minClipLength = Math.min(clipLength, minClipLength);
}

/*
Expand Down
18 changes: 11 additions & 7 deletions varsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from multiprocessing import Process
from liftover_restricted_vcf_map import lift_vcfs, lift_maps

VERSION = "0.7.1"
MY_DIR = os.path.dirname(os.path.realpath(__file__))
VARSIMJAR = os.path.realpath(os.path.join(MY_DIR, "VarSim.jar"))
DEFAULT_VARSIMJAR = os.path.join(MY_DIR, "VarSim.jar")
Expand Down Expand Up @@ -178,13 +177,15 @@ def check_executable(fpath):
sys.exit(os.EX_NOINPUT)


def fill_missing_sequences(vcf, seq_file, work_dir, log_dir):
def fill_missing_sequences(vcf, seq_file, reference, work_dir, log_dir):
logger = logging.getLogger(fill_missing_sequences.__name__)

out_vcf = os.path.join(work_dir, os.path.basename(vcf))
if out_vcf.endswith(".gz"):
out_vcf = out_vcf[:-3]
out_log = os.path.join(log_dir, "%s_fill_missing.log" % (os.path.basename(vcf)))

command = ["java", "-Xmx1g", "-Xms1g", "-jar", VARSIMJAR, "randsequencevcf", "-in_vcf", vcf, "-seq", seq_file, "-out_vcf", out_vcf]
command = ["java", "-Xmx10g", "-Xms10g", "-jar", VARSIMJAR, "randsequencevcf", "-in_vcf", vcf, "-seq", seq_file, "-out_vcf", out_vcf, "-ref", reference]
with open(out_log, "w") as log_fd:
logger.info("Running command " + " ".join(command))
subprocess.check_call(" ".join(command), shell=True, stderr=log_fd)
Expand Down Expand Up @@ -224,7 +225,12 @@ def run_randvcf(sampling_vcf, out_vcf_fd, log_file_fd, seed, sex, num_snp, num_i
return p_rand_vcf


def get_version():
return subprocess.check_output("java -jar {} -version".format(VARSIMJAR), shell=True).strip()


if __name__ == "__main__":
check_java()

main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand Down Expand Up @@ -266,7 +272,7 @@ def run_randvcf(sampling_vcf, out_vcf_fd, log_file_fd, seed, sex, num_snp, num_i
main_parser.add_argument("--filter", action="store_true", help="Only use PASS variants for simulation")
main_parser.add_argument("--keep_temp", action="store_true", help="Keep temporary files after simulation")
main_parser.add_argument("--lift_ref", action="store_true", help="Liftover chromosome names from restricted reference")
main_parser.add_argument('--version', action='version', version='VarSim: %(prog)s ' + VERSION)
main_parser.add_argument('--version', action='version', version=get_version())
main_parser.add_argument('--log_to_stderr', action='store_true', help='Output log to stderr instead of log_dir/varsim.log')
main_parser.add_argument("--loglevel", help="Set logging level", choices=["debug", "warn", "info"], default="info")

Expand Down Expand Up @@ -365,8 +371,6 @@ def run_randvcf(sampling_vcf, out_vcf_fd, log_file_fd, seed, sex, num_snp, num_i
logging.basicConfig(level=loglevel, format=FORMAT)
logger = logging.getLogger(__name__)

check_java()

# Make sure we can actually execute the executable
if not args.disable_sim:
check_executable(args.simulator_executable.name)
Expand All @@ -380,7 +384,7 @@ def run_randvcf(sampling_vcf, out_vcf_fd, log_file_fd, seed, sex, num_snp, num_i
for i, vcf in enumerate(args.vcfs):
tool_work_dir = os.path.join(args.out_dir, "filled_in", str(i))
makedirs([tool_work_dir])
in_vcfs.append(fill_missing_sequences(vcf, os.path.realpath(args.sv_insert_seq.name), tool_work_dir, tool_work_dir))
in_vcfs.append(fill_missing_sequences(vcf, os.path.realpath(args.sv_insert_seq.name), args.reference.name, tool_work_dir, tool_work_dir))
args.vcfs = map(os.path.realpath, in_vcfs)

open_fds = []
Expand Down
9 changes: 4 additions & 5 deletions varsim_somatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import subprocess
import logging
import time
from varsim import VERSION, MY_DIR, VARSIMJAR, DEFAULT_VARSIMJAR, REQUIRE_VARSIMJAR
from varsim import check_java, makedirs, monitor_processes, check_executable, run_vcfstats, run_randvcf
from varsim import MY_DIR, VARSIMJAR, DEFAULT_VARSIMJAR, REQUIRE_VARSIMJAR
from varsim import check_java, makedirs, monitor_processes, check_executable, run_vcfstats, run_randvcf, get_version

VARSIM_PY = os.path.join(MY_DIR, "varsim.py")

if __name__ == "__main__":
check_java()

main_parser = argparse.ArgumentParser(description="VarSim: somatic workflow",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand Down Expand Up @@ -50,7 +51,7 @@
main_parser.add_argument("--force_five_base_encoding", action="store_true", help="Force bases to be ACTGN")
main_parser.add_argument("--filter", action="store_true", help="Only use PASS variants")
main_parser.add_argument("--keep_temp", action="store_true", help="Keep temporary files")
main_parser.add_argument('--version', action='version', version='VarSim: %(prog)s ' + VERSION)
main_parser.add_argument('--version', action='version', version=get_version())


input_vcf_group = main_parser.add_argument_group("Input VCFs options")
Expand Down Expand Up @@ -103,8 +104,6 @@
logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=logging.DEBUG, format=FORMAT)
logger = logging.getLogger(__name__)

check_java()

if not args.disable_sim:
if not args.simulator_executable:
logger.error("Please specify %s binary with --simulator_executable option" % args.simulator)
Expand Down

0 comments on commit 5b627ca

Please sign in to comment.