From cb2c57459dd2192a6044b380ae2598d05c2df1d4 Mon Sep 17 00:00:00 2001 From: Mike Dacre Date: Thu, 17 May 2018 14:45:50 -0700 Subject: [PATCH] Version 2.0.0b2 This commit merges the updates branch and adds the following features: - More generalized file handling to allow user to run with only VCF plus BAM - Snakemake pipeline to allow easy running - Script to combine pipeline outputs across multiple samples - Ability to install via pip This is a squashed commit of the following: commit 9dfa3e25ca49a1aa5d63dfffa041beabfe887dd5 Author: Mike Dacre Date: Thu May 17 14:37:28 2018 -0700 Add pipeline components. commit 99a9eb12485dffa2a925d0779d7a31a1d4c95977 Author: Mike Dacre Date: Wed May 16 17:37:14 2018 -0700 Add get_snake command to cisVar.py commit 1d7d116fa211bc7e4bda3fc6fc7b239ed787bb52 Author: Mike Dacre Date: Wed May 16 17:36:50 2018 -0700 Make README current commit 38e2db192c9542a2eb17c2e36ea78b8b10cbf970 Author: Mike Dacre Date: Wed May 16 17:36:23 2018 -0700 Make installable with pip commit 9f5f4db89904206648ecc8a8f0cdfc948f366168 Author: Mike Dacre Date: Wed May 16 15:18:21 2018 -0700 Add complete documentation commit 10c145f48c08b606de4afa79382635fbad8a4f6a Author: Mike Dacre Date: Wed May 16 10:57:01 2018 -0700 Minor bugfix commit adc73ec9588c39921b0afd14e877559743e5492b Author: Mike Dacre Date: Tue May 15 17:54:44 2018 -0700 Add script to combine dataframes commit 680d75bf2dea59a2d5ebc080883895e0d0433b21 Author: Mike Dacre Date: Mon May 14 17:37:08 2018 -0700 Add standardized Snakemake pipeline for cisVar commit 2f43e2924d32d56db007cc3dfb20c65af7e362f2 Author: Mike Dacre Date: Mon May 14 17:36:37 2018 -0700 Moved external scripts into main script commit 87e7b40e500270c089c55b30e2162e4f16ffeeed Author: Mike Dacre Date: Mon May 14 10:52:25 2018 -0700 Integrated outside functions to core code commit bbc8d57aaf9ada25d832281f90a2e61fcf8181df Author: Mike Dacre Date: Mon Apr 30 16:02:03 2018 -0700 Increase readability of the regression code commit 0724e73d261cf3318f99ea19a339d43afcf62e15 Author: Mike Dacre Date: Tue Apr 24 17:59:49 2018 -0700 Version 2.0.0 Update to for POST and GENO sorting to be identical. Also simplify and integrate code to make it work on any dataset. commit fe7062b429056dcaf96d2f847c6cb953ffc4cc9e Author: Mike Dacre Date: Mon Apr 23 16:42:43 2018 -0700 Restructured argparse to make it more sane commit 5694fe530b41d75b8eae0e0d13f222486ec656fa Author: Mike Dacre Date: Wed Apr 11 10:10:10 2018 -0700 More robust handling of the R script plus bugfixes commit 86f82af2523f1c5a166515d6c5cc765dc4bb54c5 Author: Mike Dacre Date: Wed Feb 21 18:13:36 2018 -0800 Add documentation to genoExtract function commit f55c5c0faf6f965e6d23df28e3c4e6cbe65e2184 Author: Mike Dacre Date: Wed Feb 21 18:12:54 2018 -0800 Minor speed bugfix commit 81b4984e4bc53c0c2da26fb1027b3d38454f3ca9 Author: Mike Dacre Date: Wed Feb 21 18:09:30 2018 -0800 Alter genoExtract function to check data integrity Also reduces memory usage from ~150GB down to ~10GB and increases speed from ~10 hours to ~10 minutes. Also added documentation to the function. 
commit f7d578b33accb1d601a66d5ad08ac42095c7e9d6 Author: Mike Dacre Date: Wed Feb 21 13:13:29 2018 -0800 Attempted low memory solution, too slow commit 9c40c73b293536148326d8a437d0c10af17e90ab Merge: f2d9bc3 0669969 Author: Mike Dacre Date: Tue Feb 20 11:04:49 2018 -0800 Merge branch 'updates' of github.com:TheFraserLab/cisVar into updates commit f2d9bc3354870f40648529acc8f8c733e50263c5 Author: Mike Dacre Date: Tue Feb 20 11:04:37 2018 -0800 Add qqlot plotting commit 06699693443f4334d198baab43e38e3066a01a2b Merge: 2bbf0db 0ae247c Author: Mike Dacre Date: Mon Feb 12 13:38:05 2018 -0800 Merge branch 'updates' of github.com:TheFraserLab/cisVar into updates commit 2bbf0db2cc7fd2bc250742bfb5677d92960e0f83 Author: Mike Dacre Date: Mon Feb 12 13:38:02 2018 -0800 Changes to multiprocessing commit 0ae247ce25a56ef2a5e935719db2c1a34b1d6b4e Author: Mike Dacre Date: Mon Feb 12 13:37:13 2018 -0800 Bugfixes commit d3158dc064353e42c33d04e252f3bc61482e793d Author: Mike Dacre Date: Fri Jan 26 13:51:03 2018 -0800 Minor bugfix commit 31e5c8c92643c6ccc7efd0ac6d5b50e1cef252c5 Author: Mike Dacre Date: Wed Jan 17 15:26:02 2018 -0800 Misc stability updates and plotting code commit c080388a09f4770d91b3826230ebbf5e0e379828 Author: Mike Dacre Date: Mon Jan 15 16:58:10 2018 -0800 Made genotype parsing bed and vcf compatible and more robust commit dcad85a9909f42253c8e450f3e9e4cf3347050b8 Author: Mike Dacre Date: Tue Dec 19 15:06:45 2017 -0800 Generalized code and added progress messages Generalized the regression code to work with a wider range of data, particularly by removing or making more obvious most hard-coded limits in the R regression. Also added better progress tracking messages to all code. vcf_to_indi_and_geno has multiple changes to make it parrallelize better. --- .gitignore | 4 + README.md | 502 ++++++++- cisVar.py | 1686 ++++++++++++++++++++++++---- pipeline/Snakefile | 513 +++++++++ pipeline/cisvar_config.json | 18 + pipeline/cisvar_samples.json | 1 + pipeline/cluster.json | 9 + regression_qtls.R | 174 +-- scripts/combine.py | 275 +++++ scripts/convert_vcf2bed.sh | 8 - scripts/make_alleles.py | 92 -- scripts/plot_fun.R | 34 + scripts/post_process_regression.py | 259 ----- scripts/rmdup.py | 187 +-- scripts/vcf_to_indi_and_geno.py | 165 --- setup.py | 72 ++ 16 files changed, 3043 insertions(+), 956 deletions(-) mode change 100644 => 100755 cisVar.py create mode 100644 pipeline/Snakefile create mode 100644 pipeline/cisvar_config.json create mode 100644 pipeline/cisvar_samples.json create mode 100644 pipeline/cluster.json create mode 100755 scripts/combine.py delete mode 100755 scripts/convert_vcf2bed.sh delete mode 100755 scripts/make_alleles.py create mode 100644 scripts/plot_fun.R delete mode 100755 scripts/post_process_regression.py delete mode 100755 scripts/vcf_to_indi_and_geno.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 65a6cd6..b610d55 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Ipython .ipynb_checkpoints +# Vim +Session.vim +.ropeproject + # Apple's junk .DS_Store diff --git a/README.md b/README.md index 756ef58..e2adccb 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,471 @@ # README -`cisVar.py` is my pipeline written in python3 and uses `regression_qtls.R`. +cisVar is a pipeline to estimate pre-ChIP frequencies from pooled post-ChIP +frequencies via a regression on the genotypes of the individuals in the pool. 
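To make the description above concrete, the sketch below is a minimal toy illustration (in Python with numpy, one of the pipeline's dependencies) of the general idea: regress observed pooled allele frequencies on the individual genotypes and use the fitted values as pre-ChIP frequency estimates. It is not the model implemented in `regression_qtls.R`, and every name and number in it is made up.

```python
import numpy as np

# Toy data: 0/1/2 alt-allele dosages for a pool of individuals, plus the
# (unknown in practice) proportion each individual contributes to the pool.
rng = np.random.default_rng(0)
n_snps, n_inds = 1000, 10
genos = rng.integers(0, 3, size=(n_snps, n_inds))   # dosage matrix
weights = rng.dirichlet(np.ones(n_inds))             # pool proportions
pre_freq = genos @ weights / 2                        # true pre-ChIP frequencies
post_freq = np.clip(pre_freq + rng.normal(0, 0.02, n_snps), 0, 1)  # observed

# Regress observed (post-ChIP) frequencies on the genotype matrix; the fitted
# values act as estimates of the pre-ChIP frequencies.
coefs, *_ = np.linalg.lstsq(genos / 2, post_freq, rcond=None)
est_pre_freq = (genos / 2) @ coefs
print(np.corrcoef(pre_freq, est_pre_freq)[0, 1])      # ~1 in this noise-only toy
```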
+To use this code you must have a mapped BAM file for each pool (a single pool
+is fine) as well as genotype info for all individuals in the pool in VCF
+format. Additional optional inputs can also be provided; see below for details.

-`regression_qtls.R` is called inside of `cisVar.py` so you will not run it
-directly.

+For an overview of this method, or to cite this work, please see the following:
+
+Tehranchi, Ashley K., Marsha Myrthil, Trevor Martin, Brian L. Hie, David Golan,
+and Hunter B. Fraser. “[Pooled ChIP-Seq Links Variation in Transcription Factor
+Binding to Complex Disease Risk](https://doi.org/10.1016/j.cell.2016.03.041).”
+Cell 165, no. 3 (April 21, 2016): 730–41.
+
+This code was written by Ashley Tehranchi with minor modifications by Mike
+Dacre. It was produced at Stanford University, but is released here under the
+MIT license.
+
+Current version: 2.0.0b2
+
+## Overview
+
+`cisVar.py` is the main pipeline script, written in Python 3; it uses
+`regression_qtls.R` internally, so you will not run the R script directly.
+
+In addition, the code at `scripts/combine.py` can be used to create combined
+and merged (per-SNP) pandas dataframes from the outputs of `cisVar.py` for
+multiple samples.
+
+A complete example Snakemake pipeline is provided in `pipeline`. You should be
+able to run it by modifying only the `cisvar_config.json` file; see the
+Snakemake section below.

 Right now the default minor allele frequency filter is `0.1>MAF<0.99`.
-To change these and other constants edit the `regressions_qtls.R` script.
+To change these and other regression constants, edit the `regression_qtls.R`
+script.
+
+## Installation
+
+Install via PyPI:
+
+```shell
+pip install cisVar
+```
+
+Or install from github:
+
+```shell
+pip install https://github.com/TheFraserLab/cisVar/tarball/master
+```
+
+Alternatively, you can just clone the repo and use it directly from there.
+
+```shell
+git clone https://github.com/TheFraserLab/cisVar.git
+```
+
+It should work with python 2 or 3, but python 3 is highly recommended.
+
+### Prereqs
+
+Requires `samtools v1.9` and `bedtools v2.26.0` as well as the following python
+modules (installed automatically by pip):
+
+`pandas`, `numpy`, `psutil`, `snakemake`, `wget`
+
+Additionally requires `R` (with `Rscript`) and the `ggplot2` R package.
+
+## Usage
+
+To run this code on your own data you need:
+
+- VCF(s) with genotypes and ref/alt alleles for every SNP you wish to test
+- A mapped BAM file, ideally one filtered with
+  [Hornet](https://github.com/TheFraserLab/Hornet) to remove reference mapping
+  bias.
+ +In addition, the following data can be helpful: + +- A list of individuals to filter genotypes with (one newline separated file of + individuals per pool) +- A file with ref and alt alleles for your SNPs of interest, to modify those in + the genotype VCF files (BED/VCF/txt) +- A file to limit the SNPs to consider to a subset of those in the VCF + (BED/VCF/txt) Example pipeline: ```shell -cisVar.py mpileup -F SampleName -f fastaFile -p mpileupBEDfile -B sortedBam +cisVar.py prep -F SampleName -i individuals.txt.gz --chrom-format chr /path/to/geno/*vcf.gz + +cisVar.py mpileup -F SampleName -f fastaFile -B sortedBam cisVar.py post -F SampleName -r readDepth -a allelesFile cisVar.py geno -F SampleName -r readDepth -i individualsFile -g genotypesFile cisVar.py qtls -F SampleName -r readDepth -n numberIndividuals + +cisVar.py tidy -F SampleName -r readDepth + +scripts/combine.py {sample}.readDepth.regression.pd +``` + +A readDepth of 20 is generally optimal, the sample name can be whatever you +want, but should be unique per sample. `{sample}` is a placeholder to allow any +number of samples to be combined in the last step. + + +### Snakemake + +The above pipeline can be automated with +[Snakemake](https://snakemake.readthedocs.io/en/stable/). + +To use, install cisVar, navigate to the root of your project, and run `cisVar.py +get_snake` to copy the Snakefile and config file over . Then edit the +`cisvar_config.json` file to match your needs. + +You will also need to edit the `Snakefile` to set the `script_prep` string to +match what is needed by your system. + +The following are the config options for that file: + +| Option | Description | +|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| name | A general name for this run, file prefix will be .. | +| sample_name | The name of the sample, default is population. Used only in the combination of multiple samples. | +| samples | A list of samples, can just be a list, or a dictionary of {sample:group}, the 'group' in this case allows the use of the same genotype files for multiple samples, can also be a path to a separate json file | +| read_depth | An integer depth to require for each SNP to be considered | +| max_cores | Used only when parsing VCFs, if you have multiple VCF files (e.g. per chromosome), they will be parsed in parallel up to this many cores (or max avaialable on machine) | +| sort_vcfs | Either 1 or 0, if 1 assumes that VCF files contain a `chr#` string in the file name, and sorts the order of files to be chr1->22,X,Y,MT. Don't use if your VCFs don't have `chr#` in the name | +| chrom_format | 'chr', 'num', 'ignore': Force format of chromosome name to be `chr#` or `#`. This ensures that all input files have the same format. Use ignore to do nothing. | +| bams | A path to the mapped BAM files, must contain the `{sample}` string (unless you only have one bam), e.g. `/path/to/{sample}.sorted.bam`, `{sample}` must be in samples | +| cisVar | Path to the cisVar repository | +| vcfs | Can be a single path (for one vcf), a list of vcfs, or a glob string (e.g. `/path/to/vcfs/*.vcf.comm.gz`) | +| genome_fa | Path to a FastA file of the genome you mapped to, single file only. | +| inds | Optional: used to filter VCFs so that the genotype files contain only the individuals in the sample, e.g. `/path/to/inds/{sample}.ind.txt.gz`. 
Newline separated file of individuals. | +| locs | Optional: a BED/VCF/text file of SNP locations to consider, used to limit the total to be a subset of the genotype file. | +| alleles | Optional: a BED/VCF/text file of alternate ref/alt alleles. Must be a subset of the genotype VCFs. If there is an entry in this file, it's ref/alt alleles will be used instead of those in the genotype file | + +Note the last three files are optional, also if `samples` is a dict, then the +value will be used in place of the sample. For example, if you have two samples +for the same population that are `yri1` and `yri2`, but they both use the same +genotype file `yri.geno.vcf`, you can make samples `{'yri1': 'yri', 'yri2': +'yri'}` and then `yri` will be used to pick the ind, loc, and allele files + + +#### Cluster + +To run on a cluster, run `cisVar.py get_snake` with `-x` and edit the +`cluster.json` file to match your cluster environment, then run e.g.: + +```shell +snakemake -j 100 --cluster-config cluster.json --cluster sbatch -n {threads} \ +-t {params.time} --mem={resources.mem_mb} -p {cluster.queue} \ +-o {cluster.out} -e {cluster.err} ``` -Requires `samtools v1.9<` and `bedtools v2.26.0<`. - -File formats are described in the script help section +or + +```shell +snakemake -j 100 --cluster-config cluster.json --cluster qsub \ +-l nodes=1:ppn={threads} -l walltime={params.time} -l mem={resources.mem_mb} \ +-o {cluster.out} -e {cluster.err} +``` + +To set the maximum allowed memory per job, add the argument +`--resources mem_mb=32000`. + +To also combine files, add `combine` to the end of the command. + ## Script help +Below are help options available on the command line for cisVar, all these steps +are run by the above snakemake pipeline. + +### cisVar.py + +``` +usage: cisVar.py [-h] {prep,mpileup,post,geno,qtls,tidy,get_snake} ... + +cisVar: Find cis QTLs based on an experimental selection method + +Ashley Tehranchi + +Stanford University + +Version: 2.0.0b1 +Created: 2015-12-12 +Updated: 2018-05-16 + +Example usage: +cisVar prep -F test_new -i individuals.txt.gz --chrom-format chr +cisVar mpileup -F -f -B +cisVar post -F -r +cisVar geno -F -r -i +cisVar qtls -F -r -n +cisVar tidy -F -r -p out.dataframe.pandas -t out.dataframe.txt + +Note: +The qtls regression step will use approximately 32GB of memory on an averaged- +sized dataset. + +The geno step will use approximately 20GB of memory on the same dataset. + +positional arguments: + {mpileup,post,geno,qtls} + prep Prepare genotype files + mpileup (m) Run mpileup + post (p) Run POST frequency calculation + geno (g) Munge genotypes to prepare for regression + qtls (q, regression, r) + Run the regression + tidy (t) Tidy up regression, call open/clsoed + +optional arguments: + -h, --help show this help message and exit +``` + +#### prep + +This step converts VCFs into genotype and individual files that can be used by +the pipeline. + +``` +usage: cisVar.py prep [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] [-i ALL_INDS] + [-l LIMIT_FILE] [-a ALLELE_FILE] + [--chrom-format {chr,num,ignore}] [--include-indels] + [-c CORES] + vcf_files [vcf_files ...] 
+ +Prepare genotype files + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +Prep Options: + -i ALL_INDS, --all-inds ALL_INDS + File of individuals in all groups, one per line + -l LIMIT_FILE, --limit-file LIMIT_FILE + BED/VCF/txt file of SNPs to consider + -a ALLELE_FILE, --allele-file ALLELE_FILE + BED/VCF/txt file of alleles to override VCF allels + (subset of vcf) + --chrom-format {chr,num,ignore} + chr: make format "chr#", num: make format "#", ignore: + do nothing (default: ignore) + --include-indels Do not skip indels + -c CORES, --cores CORES + Number of cores to use (default: all) + vcf_files VCF files with genotypes +``` + +#### mpileup + +This is just a simple wrapper for samtools mpileup + +``` +usage: cisVar.py mpileup [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] -f + ALLCHRFASTA -B SORTEDBAM [-p MPILEUPBEDFILE] + +Run mpileup + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +mpileup Options: + -f ALLCHRFASTA, --fasta ALLCHRFASTA + fasta file with all chromosomes (Required) + -B SORTEDBAM, --BAMfile SORTEDBAM + sorted BAM file (Required) + -p MPILEUPBEDFILE, --mpileupBEDfile MPILEUPBEDFILE + BED to use instead of the BED generated in the prep + phase (Do not use if possible, use prep with limit + instead) +``` + +#### post + +This step actually calculates the POST-frequencies for the data. + +``` +usage: cisVar.py post [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] [-a GENOSFILE] + +Run POST frequency calculation + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +POST Options (Deprecated): + -a GENOSFILE, --allelesFile GENOSFILE + The genotypes file, (Optional, default is file created + in prep) +``` + +#### geno + +This step converts the genotype file made in the prep step into a matrix that +can be used in the regression. It is important that this genotype file is +perfectly sorted to match the outputs of the POST step. + +``` +usage: cisVar.py geno [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] [-g GENOSFILE] + [-i INDIVIDUALSLIST] + +Munge genotypes to prepare for regression + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +Genotype Options: + -g GENOSFILE, --genoFile GENOSFILE + The genotypes file, (Optional, default is file created + in prep) + -i INDIVIDUALSLIST, --individualsFile INDIVIDUALSLIST + list of individuals matching genotype matrix; one indv + per line +``` + +#### qtls + +This is the actual regression step, it makes sure all the files are in the right +place and then calls `regression_qtls.R` to do the actual regression. 
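Should you ever need to re-run only the regression by hand, the qtls step ultimately reduces to a single `Rscript` call; the sketch below mirrors the `regPVAL` function added to `cisVar.py` in this patch. The `cis_var.YRI` prefix, read depth of 20, and pool size of 60 are placeholders for your own values.

```python
import subprocess

# Rough equivalent of what `cisVar.py qtls` runs internally (see regPVAL in
# cisVar.py); prefix, depth, and pool size below are placeholders.
prefix = "cis_var.YRI.20"                    # <SampleName>.<readDepth>
subprocess.check_call([
    "Rscript", "regression_qtls.R",
    prefix + ".POSTth.txt",                  # trimmed POST frequency table
    prefix + ".genotypes.txt",               # transposed genotype matrix
    prefix,                                  # output prefix
    "60",                                    # number of individuals in the pool
])
```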
+ +``` +usage: cisVar.py qtls [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] [-n NUMINDV] + +Run the regression + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +Regression Options: + -n NUMINDV, --numberIndividuals NUMINDV + The number of individuals in the pool (if omitted, + calculated from genotype file length) +``` + +The regression produces z-scores and p-values, and additionally writes +coefficients and some simple summary plots in separate files. + +#### tidy + +This step calls the open/closed alleles and produces a final integrated file +with all available data as both a tad-delimited file and as a pandas dataframe. + +``` +usage: cisVar.py tidy [-h] [-F PREFIX_NAME] [-r TRIAL_DEPTHS] [-b BEDFILE] + [-t TEXTFILE] [-p PANDASFILE] + +Tidy up regression, call open/closed + +optional arguments: + -h, --help show this help message and exit + +Run Options: + -F PREFIX_NAME, --SampleName PREFIX_NAME + sample/population name (default: cis_var) + -r TRIAL_DEPTHS, --readDepth TRIAL_DEPTHS + minimum read depth per variant (default: 20) + +inputs: + -b BEDFILE, --bedfile BEDFILE + BED file to extract rsIDs from (optional) + +outputs: + -t TEXTFILE, --textfile TEXTFILE + Parsed output + -p PANDASFILE, --pandasfile PANDASFILE + Parsed dataframe +``` + +#### get_snake + +This option just downloads the Snakefile and config files from this repo, for +easy access when code is installed via pip. + +``` +usage: cisVar.py get_snake [-h] [-x] + +Download Snakefile and config to current dir + +optional arguments: + -h, --help show this help message and exit + -x, --extra Get additional sample and cluster configs +``` + +### combine.py + +This script is separate and is in the `scripts` folder. It takes a search string +as an input and produces both combined and merged DataFrames. The combined +dataframe is just all dataframes combined in order with sample data added as a +column and to the index. The merged dataframe is a collapsed dataframe that has +one entry per SNP with p-values combined using Fisher's method and supporting +population data. It also includes information on the level of support for the +open and closed calls. + +The search string should match your prefix and depth from the main pipeline. For +example, if you used a name of 'cis_var' plus a sample name (the variable part) +of e.g. CEU and YRI, and a read depth of 20, your search string would be: +`cis_var.{sample}.20.regression.pd`. + +The script will write `cis_var.combined.20.regression.pd` and +`cis_var.merged.20.regression.pd`. + ``` --h help - --g The genotypes file is a tab separated file, sorted by individuals(columns) and by snps (rows). - - EX: - chr position ref alt NA18486 NA18489 ... - chr1 10583 g a 0 1 ... - chr1 10611 C G 0 0 ... - chr1 13302 c t 1 0 … - --i The individualsFile is just a list of the individuals (one per line) in your pool that are to be extracted from a larger list of individuals. - - EX: - NA18486 - NA18489 - … --a The alleles file is a tab-sep file with no header (chr#, position, Ref_allele, Alt_allele). This file should match the length of the pileup file. - - EX: - chr1 10505 A T - chr1 10506 C G - chr1 10511 G A - --f fasta file with all chromosomes concatenated and sorted. 
- --p The mpileup.bed file is a tab-sep standard bed file with no header (where the SNP location is the third column and the second is SNP-1. - EX: - - chr1 10504 10505 - chr1 10505 10506 - chr1 10510 10511 - --B sortedBam is the sorted without the -n flag i.e; samtools sort [in.bam] -``` - -## Additional Scripts - -The `scripts` folder includes a few additional scripts that may be useful - -- `convert_vcf2bed.sh` — Used bedops to convert vcfs to bed files (which can then be concatenated to make the required bed -- `make_alleles.py` — Makes the alleles file described above from the mpileup file -- `vcf_to_indi_and_geno.py` — Make the individual and genotype files from VCFs -- `post_process_regression.py` — Use pandas to process the output of the qtls step of the pipeline to add open/closed alleles and rsIDs -- `rmdup.py` — Remove duplicates from a bam file, picks duplicate to keep randomly, works with paried end data +usage: combine.py [-h] [-c COLUMN_NAME] [--no-merge] search_str + +Combine a bunch of cisVar pandas files by sample (e.g. population). + +Requires a search string such as prefix.{sample}.regression.pd. + +Writes +------ +prefix.combined.regression.pd + A simple merger of all DataFrames +prefix.merged.regression.pd + A per-snp merger based on p-value + +positional arguments: + search_str e.g. name.{sample}.regression.pd, used to find files + +optional arguments: + -h, --help show this help message and exit + -c COLUMN_NAME, --column-name COLUMN_NAME + Name for combine column, e.g. population + --no-merge Produce only a combined dataframe, not a merged + dataframe. merging can add half an hour over + combination, which takes seconds +``` + +### plot + +There is an additional script in `scripts` called `plot_fun.R` that takes a +single argument—the output of the regression step (e.g. +`cis_var.YRI.20.totals.txt`) and creates a simple density pre-freq vs post freq +plot. diff --git a/cisVar.py b/cisVar.py old mode 100644 new mode 100755 index 296a706..68ca913 --- a/cisVar.py +++ b/cisVar.py @@ -1,89 +1,404 @@ -#!/usr/bin/python3 - +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- """ -#============================================================================== -# -# FILE: cisVar (python 3) -# AUTHOR: Ashley Tehranchi, tehranchi576@gmail.com -# ORGANIZATION: Stanford University -# CREATED: 2015-12-12 11:33 -# Last modified: 2017-10-09 11:30 -# -# -#============================================================================== - - -cisVar mpileup -F -f -p -B -cisVar post -F -r -a -cisVar geno -F -r -i -g -cisVar qtls -F -r -n +cisVar: Find cis QTLs based on an experimental selection method -""" +Ashley Tehranchi -###################################################################################### -# ARGUEMENTS -###################################################################################### +Stanford University +Version: 2.0.0b1 +Created: 2015-12-12 +Updated: 2018-05-16 -import argparse -import sys +Example usage: +cisVar prep -F test_new -i individuals.txt.gz --chrom-format chr +cisVar mpileup -F -f -B +cisVar post -F -r +cisVar geno -F -r -i +cisVar qtls -F -r -n +cisVar tidy -F -r -p out.dataframe.pandas -t out.dataframe.txt + +Note: +The qtls regression step will use approximately 32GB of memory on an averaged- +sized dataset. + +The geno step will use approximately 20GB of memory on the same dataset. 
+ +""" +from __future__ import print_function import os +import sys +import bz2 +import gzip +import argparse import operator import subprocess -import scipy -import numpy as np -from scipy import stats -import pandas as pd - - -parser = argparse.ArgumentParser(description='Find cis QTLs based on an experimental selection method') -parser.add_argument('inputCommand', help=' ') -parser.add_argument('-F', '--SampleName', required=False, dest='prefix_name', help='sample/population name') -parser.add_argument('-f', '--fasta', required=False, dest='allCHRfasta', help='fasta file with all chromosomes') -parser.add_argument('-B', '--BAMfile', required=False, dest='sortedBam', help='sorted BAM file') -parser.add_argument('-r', '--readDepth', required=False, type=int, dest='trial_depths', help='minimum read depth per variant') -parser.add_argument('-i', '--individualsFile', required=False, dest='individualslist', help='list of individuals matching genotype matrix; one indv per line') -parser.add_argument('-a', '--allelesFile', required=False, dest='allelesFileName', help='format: chr1 10583 G A') -parser.add_argument('-n', '--numberIndividuals', required=False, dest='numIndv', help='The number of individuals in the pool') -parser.add_argument('-p', '--mpileupBEDfile', required=False, dest='mpileupBEDfile', help='The mpileup BED file') -parser.add_argument('-g', '--genoFile', required=False, dest='genosFile', help='The genotypes file') -args = parser.parse_args() - - -WorkingDirectory = os.getcwd() - -###################################################################################### -# MPILEUP FROM BAM FILES USING HG19 MASKED GENOME WITH BLACKLISTED REGIONS REMOVED -###################################################################################### - -def mpileup(allCHRfasta, mpileupBEDfile, sortedBam, prefix_name): - os.system("samtools mpileup -Q 25 -f " + allCHRfasta +" -l "+ mpileupBEDfile +" " + sortedBam + " > " + prefix_name + ".mpileup.txt") - ## outputs prefix.mpileup.txt - return() - - -###################################################################################### -#POST-CHIP CALCULATION AND FILE TRIMMIMG PLUS HEADER ADDITION -###################################################################################### - -def postcalc(prefix_name, trial_depths, allelesFileName): - os.system("module load samtools") - print(prefix_name) - print("Post-ChIP calculation for min reads ",trial_depths) - infilename = prefix_name + ".mpileup.txt" - inalleles = allelesFileName +import multiprocessing as mp + +from time import sleep +from datetime import datetime + +from collections import defaultdict + +# psutil is used to print memory usage if installed, otherwise ignored +try: + import psutil +except ImportError: + psutil = None + +CORES = mp.cpu_count() + +__version__ = '2.0.0b2' + +########################################################################### +# Create Necessary Input Files from VCFs # +########################################################################### + + +def prep_files(vcf_files, prefix_name, inds=None, limit_file=None, + alleles_file=None, chrom_format=0, skip_indels=True, cores=4): + """Create genotype, bed, and allele files for pipeline. + + Bed is used by mpileup to define SNPs to pileup. + Alleles file is used to set default ref and alt in POST. + + Note, this only needs to be created once per experiment, not once per + group/pool. The same files can be used for each population/group/pool, + they are further filtered in the genotype step. 
+ + This code first loads all locations from the limit file (if present) and + any alleles to override. These two lists are kept in memory. It then loops + through each vcf_file in parallel, line-by-line, writing temp files, before + concatenating the results into a single file. + + Params + ------ + vcf_files : list + A list of VCF files to parse, should contain genotypes, ref and alt. + prefix_name : str + A name for the output files. + inds : list or path, optional + Either a list of individuals or a list of paths to individuals files. + Should include all individuals in this experiment, in all pools. + limit_file : path, optional + Path to a bed/vcf/simple file of SNPs to consider. Should be a subset + of SNPs in the VCF files. BED3 format is fine, simple format is just + chromosome and position (1-based) to exclude, tab separated. + alleles_file : path, optional + Path to a VCF/bed/txt file with alleles, it text, should be + chr position (base-1). Can be same file as limit file if desired. + chrom_format : {0, 1, 2}, optional + 0 = leave + 1 = force chr# + 2 = force # + skip_indels : bool, optional + Don't include rows with indels + cores : int, optional + Number of parallel cores to use on multiple vcf files. + + Writes + ------ + prefix.genotypes.txt.gz + A tab separated file, including header with individuals, of: + chrom, position (base-1), ref (upper), alt (upper), genotypes (0/1/2) + prefix.locations.bed.gz + A BED file of all of the locations to be tested, including names + prefix.individuals.txt.gz + A newline separated file of individuals, replicates inds if provided, + overwrites even if already exists. + """ + if isinstance(vcf_files, str): + vcf_files = [vcf_files] + for fl in vcf_files: + if not os.path.isfile(fl): + raise OSError('Could not find file {}'.format(fl)) + assert isinstance(prefix_name, str) + par = False if len(vcf_files) == 1 or cores < 2 else True + if limit_file: + sys.stderr.write( + "Loading test locations from {}\n".format(limit_file) + ) + limit_locs = parse_loc_file(limit_file) + else: + limit_locs = None + if alleles_file: + sys.stderr.write( + "Loading alleles from {}\n".format(alleles_file) + ) + alleles = parse_loc_file(alleles_file, mode='alleles') + else: + alleles = None + if par: + pool = mp.Pool(cores) + + # Individuals + if os.path.isfile(inds): + with open_zipped(inds) as fl: + inds = fl.read().strip().split('\n') + if not inds: + inds = geno_file(vcf_files[0], get_inds=True) + final_inds = list(inds) + + # Write the final individuals, order preserved + ind_output = '{}.individuals.txt.gz'.format(prefix_name) + with open_zipped(ind_output, 'w') as fout: + fout.write('\n'.join(final_inds)) + + # Primary parsing + sys.stderr.write("Begining genotype file parse\n") + + # Header + header = geno_file(vcf_files[0], get_header=True) + + count = 0 + jobs = [] + geno_output = '{}.genotypes.txt.gz'.format(prefix_name) + bed_output = '{}.locations.bed.gz'.format(prefix_name) + for fl in vcf_files: + ofl = '{}.{}.gz'.format(geno_output, count) + bfl = '{}.{}.gz'.format(bed_output, count) + args = ( + fl, ofl, header, bfl, final_inds, limit_locs, + alleles, skip_indels, chrom_format + ) + if par: + jobs.append([ofl, bfl, pool.apply_async(parse_file, args)]) + else: + parse_file(*args) + jobs.append((ofl, bfl, None)) + count += 1 + + ofls = [i[0] for i in jobs] + bfls = [i[1] for i in jobs] + jobs = [i[2] for i in jobs] + + if par: + for job in jobs: + job.wait() + sleep(2) + + for ofl in ofls: + done_file = ofl + '.done' + if not 
os.path.isfile(done_file): + sleep(1) + if not os.path.isfile(done_file): + print('Cannot find {}, assuming failure'.format(done_file)) + raise Exception('Missing done file') + os.remove(done_file) + + for bfl in bfls: + done_file = bfl + '.done' + if not os.path.isfile(done_file): + sleep(1) + if not os.path.isfile(done_file): + print('Cannot find {}, assuming failure'.format(done_file)) + raise Exception('Missing done file') + os.remove(done_file) + + # Merge + print('Merging files') + with open_zipped(geno_output, 'w') as fout: + fout.write( + 'chrom\tposition\tref\talt\t{}\n'.format('\t'.join(final_inds)) + ) + subprocess.check_call( + 'zcat {} | gzip >> {}'.format(' '.join(ofls), geno_output), + shell=True + ) + subprocess.check_call( + 'zcat {} | gzip > {}'.format(' '.join(bfls), bed_output), shell=True + ) + + print('Removing tmp files') + for temp_file in ofls + bfls: + if os.path.isfile(temp_file): + os.remove(temp_file) + + +def parse_file(geno, outfile, header, bedfile, inds=None, limit_locs=None, + alleles=None, skip_indels=False, chrom_format=0, log=None, + fail_if_too_few_inds=True): + """File parser, see parse_files for information.""" + if not log: + log = outfile + '.parse.log' + kept = 0 + indels = 0 + skipped = 0 + superbad = [] + print('Creating {} and {} from {}'.format(outfile, bedfile, geno)) + if chrom_format == 0: + def chromit(x): + """Return x.""" + return x + elif chrom_format == 1: + def chromit(x): + """Force x to start with chr.""" + return x if x.startswith('chr') else 'chr' + x + elif chrom_format == 2: + def chromit(x): + """Force x to not start with chr.""" + return x if not x.startswith('chr') else x[3:] + else: + raise ValueError('chrom_format must be one of 0, 1, 2') + + if not limit_locs: + limit_locs = dict() + if not alleles: + alleles = dict() + + lg = open_zipped(log, 'a') + with open_zipped(outfile, 'w') as gn, open_zipped(bedfile, 'w') as bd: + lg.write('Parse initiated\n') + for f in geno_file(geno, check_header=header, ind_list=inds, log=lg): + # Write core records + if skip_indels and (len(f[2]) > 1 or len(f[3]) > 1): + indels += 1 + continue + chrom = chromit(f[0]) + pos = int(f[1]) + # Drop excluded positions + try: + if pos not in limit_locs[chrom]: + skipped += 1 + continue + except KeyError: + pass + + # Override alleles + try: + if pos in alleles[chrom]: + ref, alt = alleles[chrom][pos] + else: + ref = f[2] + alt = f[3] + except KeyError: + ref = f[2] + alt = f[3] + + name = f[4] + gt_inds = f[5:] + + outstr = ( + '{chr}\t{pos}\t{ref}\t{alt}'.format( + chr=chrom, pos=pos, ref=ref, alt=alt + ) + ) + # Write genotypes + if inds: + gt_ind_l = len(gt_inds) + ind_l = len(inds) + if gt_ind_l < ind_l: + lg.write( + '{chr}.{pos} had too few ({ind}) individuals\n' + .format(chr=chrom, pos=f[1], ind=len(gt_inds)) + ) + if fail_if_too_few_inds: + superbad.append(f) + continue + if gt_ind_l > ind_l: + lg.write( + '{chr}.{pos} had too many ({ind}) individuals\n' + .format(chr=chrom, pos=f[1], ind=len(gt_inds)) + ) + superbad.append(f) + continue + gn.write('{}\t{}\n'.format(outstr, '\t'.join(gt_inds))) + bd.write('{}\t{}\t{}\t{}\n'.format(chrom, pos-1, pos, name)) + kept += 1 + lg.write( + 'Skipped: {}\nIndels: {}\nKept: {}\nParse Complete\n' + .format(skipped, indels, kept) + ) + print( + 'Finished {}. Kept {} SNPs. 
Dropped {} indels and skipped {}' + .format(outfile, kept, indels, skipped) + ) + if superbad: + msg = 'Length failure despite length filtering:\n{}\n'.format(superbad) + lg.write(msg) + print(msg) + + print('Writing done files') + with open(outfile + '.done', 'w') as fout: + fout.write('Done\n') + with open(bedfile + '.done', 'w') as fout: + fout.write('Done\n') + + lg.close() + return + + +############################################################################ +# mpileup BAM files with hg19 masked genome with blacklist regions removed # +############################################################################ + + +def mpileup(allCHRfasta, sortedBam, prefix_name, mpileupBEDfile=None): + """Run mpileup on the provided files. + + Params + ------ + allCHRfasta : str + Path to genome wide fasta file + sortedBam : str + Path to BAM file sorted by position + prefix_name : str + Prefix to use for default file names + mpileupBEDfile : str, optional + Path to BED file for mpileup, defaults to file output by prep. + + Writes + ------ + prefix_name.mpileup.txt.gz + """ + if not mpileupBEDfile: + mpileupBEDfile = '{}.locations.bed.gz'.format(prefix_name) + cmnd = ( + "samtools mpileup -Q 25 -f {} -l {} {} | gzip > {}.mpileup.txt.gz" + .format(allCHRfasta, mpileupBEDfile, sortedBam, prefix_name) + ) + sys.stderr.write('Running {}\n'.format(cmnd)) + subprocess.check_call(cmnd, shell=True) + + +########################################################################### +# POST-ChIP Calculation and File Trimming # +########################################################################### + + +def postcalc(prefix_name, inalleles, trial_depths): + """Calculate POST frequencies from mpileup results.""" + print("Post-ChIP calculation for min reads {}".format(trial_depths)) + + infilename = prefix_name + ".mpileup.txt.gz" outfilename = prefix_name + "." + str(trial_depths) + ".POST.txt" depth = int(trial_depths) pileup_dictionary = {} count = 1 - outfile = open(outfilename, 'w') - with open(infilename, 'r') as pileup: + bad = 0 + # mpileup format: + # chromosome, 1-based coordinate, reference base, # reads covering the + # site, read bases, and base qualities + with open_zipped(infilename, 'r') as pileup: for line in pileup: - #print count count = count + 1 - rows = line.rstrip('\n').split("\t") - if (int(rows[3]) < depth): + rows = line.rstrip().split("\t") + if not len(rows) == 6: + bad += 1 + continue + + # SNP name is just chr.position + snp = rows[0] + "." 
+ rows[1] + # cov is the total coverage for the SNP + cov = int(rows[3]) + # Normalize base (called read here) to lowercase to make counting + # easier + read = str(rows[4]).lower() + + # Only work on those with coverage exceeding the minimum depth + if cov < depth: Acount = 'NA' Ccount = 'NA' Gcount = 'NA' @@ -94,210 +409,1097 @@ def postcalc(prefix_name, trial_depths, allelesFileName): RefFreq = 'NA' AltFreq = 'NA' else: - if (int(rows[3]) >= depth): - read = str(rows[4]).lower() # isolate overlapping positions from reads, make all letters lowercase to count them properly - Acount = read.count('a') # count all A's, A is plus strand, a in negative strand, count both - Ccount = read.count('c') - Gcount = read.count('g') - Tcount = read.count('t') - countslist = [Acount, Ccount, Gcount, Tcount] #make list of all counts - index, ALTdepth = max(enumerate(countslist), key=operator.itemgetter(1)) #find index of largest number and the value - #print("index ", index) - indels = rows[4].count('-') + rows[4].count('+') - REFdepth = int(rows[3]) - int(Acount) - int(Ccount) - int(Gcount) - int(Tcount) - int(indels) - #print("indels, REFdepth ", indels, REFdepth) - ##added 9/2/14 - #print(rows) - rows[3] = str(int(REFdepth) + int(ALTdepth)) ## Adjusts totaldepth to remove third alleles/sequencing errors - #print(rows) - ## + # Count all A's, A is plus strand, a in negative strand, + # count both + Acount = read.count('a') + Ccount = read.count('c') + Gcount = read.count('g') + Tcount = read.count('t') + + # Make list of all counts + countslist = [Acount, Ccount, Gcount, Tcount] + # Find index of largest number and the value + # ALT is assumed to be most common of the 4 bases + index, ALTdepth = max( + enumerate(countslist), key=operator.itemgetter(1) + ) + + indels = rows[4].count('-') + rows[4].count('+') + REFdepth = cov - Acount - Ccount - Gcount - Tcount - indels + + # Adjusts total_depth/cov to remove third alleles/sequencing + # errors + snp_depth = REFdepth + ALTdepth + rows[3] = str(snp_depth) + + if index == 0: + ALTallele = 'A' + elif index == 1: + ALTallele = 'C' + elif index == 2: + ALTallele = 'G' + elif index == 3: + ALTallele = 'T' + else: ALTallele = "NA" - if (index == 0): - ALTallele = 'A' - if (index == 1): - ALTallele = 'C' - if (index == 2): - ALTallele = 'G' - if (index == 3): - ALTallele = 'T' - RefFreq = "NA" - AltFreq = "NA" - if (rows[3] == 0): - RefFreq = str(0) - print("mistake") - else: - if (float(rows[3]) > depth) and ((float(REFdepth) / float(rows[3])) >= 0) : ## depth greater than 0 and postfreq > 0 - RefFreq = str(REFdepth / float(rows[3])) # to get ref freq, totaldepth - altdepth = refdepth / totaldepth - AltFreq = str(1 - float(RefFreq)) - pileup_dictionary[rows[0] + "." + rows[1]] = rows[2:] + [Acount, Ccount, Gcount, Tcount, ALTdepth, REFdepth, ALTallele, RefFreq, AltFreq] - #print("ALTallele, row ",ALTallele, rows) - assert len(pileup_dictionary[rows[0] + "." 
+ rows[1]]) == 13 - linecounter =1 - with open(inalleles) as alleles: + + RefFreq = "NA" + AltFreq = "NA" + + if snp_depth == 0: + RefFreq = str(0) + print( + "Reference frequency is 0 for {}, probably a mistake" + .format(snp) + ) + else: + postfreq = float(REFdepth) / float(snp_depth) + if snp_depth > depth and postfreq >= 0: + # To get ref freq: + # totaldepth - altdepth = refdepth / totaldepth + RefFreq = str(postfreq) + AltFreq = str(1 - postfreq) + + pileup_dictionary[snp] = ( + rows[2:] + [ + Acount, Ccount, Gcount, Tcount, ALTdepth, + REFdepth, ALTallele, RefFreq, AltFreq + ] + ) + assert len(pileup_dictionary[snp]) == 13 + + print('{} mpileup rows were too short and thus skipped'.format(bad)) + + linecounter = 1 + with open_zipped(inalleles) as alleles, open_zipped(outfilename, 'w') as outfile: for line in alleles: - row = line.rstrip().split('\t') + row = line.rstrip().split('\t')[:4] + snp = row[0] + '.' + row[1] + + if snp in pileup_dictionary: + matchingVector = [str(x) for x in pileup_dictionary[snp]] + else: + continue + genoRefAllele = row[2] genoAltAllele = row[3] - try: matchingVector = pileup_dictionary[row[0] + '.' + row[1]] - except: continue - matchingVector = [str(x) for x in matchingVector] - if (matchingVector[8] == str(0)): # for alt alleles with 0 reads, force alt allele to be imputed geno alt allele + + # For alt alleles with 0 reads, force alt allele to be imputed geno + # alt allele + if matchingVector[8] == str(0): matchingVector[10] = genoAltAllele - if (matchingVector[9] == str(0)): # for ref alleles with 0 reads, force ref allele to be imputed geno ref allele + + # For ref alleles with 0 reads, force ref allele to be imputed geno + # ref allele + if matchingVector[9] == str(0): matchingVector[0] = genoRefAllele + postChipRefAllele = matchingVector[0].upper() - postChipRefFreq = matchingVector[11] + postChipRefFreq = matchingVector[11] postChipAltAllele = matchingVector[10].upper() - postChipAltFreq = matchingVector[12] - if (postChipRefAllele == genoRefAllele) and (postChipAltAllele == genoAltAllele): + postChipAltFreq = matchingVector[12] + + if postChipRefAllele == genoRefAllele and postChipAltAllele == genoAltAllele: outAllele = postChipRefAllele outfreq = postChipRefFreq - elif postChipRefAllele == genoAltAllele and (postChipAltAllele == genoRefAllele): + elif postChipRefAllele == genoAltAllele and postChipAltAllele == genoRefAllele: outAllele = postChipAltAllele outfreq = postChipAltFreq linecounter = linecounter +1 else: outAllele = 'NA' outfreq = 'NA' - finalout = row[0] + "\t" + row[1] + "\t" + '\t'.join(matchingVector) + "\t" + str(outAllele) + "\t" + str(outfreq) + "\n" - outfile.write(finalout) - outfile.close() + + outfile.write( + '\t'.join( + [row[0], row[1], '\t'.join(matchingVector), + str(outAllele), str(outfreq)] + ) + '\n' + ) + print("post-frequency calculation DONE") - return () -###################################################################################### -# POST.TXT FILE TRIMMING, ADDITION OF HEADER, BED FILE CREATION -###################################################################################### +########################################################################### +# POST.txt file triming, addition of header, BED file creation # +########################################################################### + def postTrim(prefix_name, trial_depths): - headerOUT = open("mpileup.header.txt", 'w') - headertemp = 
'Chr'+'\t'+'position'+'\t'+'REFallele'+'\t'+'Depth'+'\t'+'Acount'+'\t'+'Ccount'+'\t'+'Gcount'+'\t'+'Tcount'+'\t'+'ALTdepth'+'\t'+'REFDepth'+'\t'+'ALTallele'+'\t'+'REFfreq'+'\t'+'ALTfreq'+'\t'+'POSTallele'+'\t'+'POSTfreq'+'\n' - headerOUT.write(headertemp) - headerOUT.close() - - print("File trimming ") - os.system("sed '/NA/d' " + prefix_name + "." + str(trial_depths) + ".POST.txt | cut -f-4,7- > " + prefix_name + "." + str(trial_depths) + ".POSTt.txt") - os.system("cat mpileup.header.txt " + prefix_name + "." + str(trial_depths) + ".POSTt.txt > " + prefix_name + "." + str(trial_depths) + ".POSTth.txt") - print(os.system("head " + prefix_name + "." + str(trial_depths) + ".POSTth.txt")) - postin = prefix_name + "." + str(trial_depths) + ".POSTt.txt" - bedoutfile = prefix_name + "." + str(trial_depths) + ".bed" - with open(postin) as openfile, open(bedoutfile, 'w') as bedOUT: - for line in openfile: - rowtmp = line.rstrip().split('\t') - chrom = rowtmp[0] - snp = rowtmp[1] - startpos = int(snp) - 1 - OUTrow = str(chrom) + '\t' + str(startpos) + '\t' + str(snp) + '\n' - bedOUT.write(OUTrow) + """Tidy the POST frequency outputs.""" + + with open_zipped("mpileup.header.txt", 'w') as headerOUT: + headerOUT.write( + '\t'.join( + ['Chr', 'position', 'REFallele', 'Depth', 'Acount', 'Ccount', + 'Gcount', 'Tcount', 'ALTdepth', 'REFDepth', 'ALTallele', + 'REFfreq', 'ALTfreq', 'POSTallele', 'POSTfreq'] + ) + '\n' + ) + + # File names + prfx = "{}.{}".format(prefix_name, trial_depths) + post_fl = "{}.POSTth.txt".format(prfx) + bed_fl = "{}.bed".format(prfx) + + print("File trimming") + + # Filter out unneeded columns + script = ( + "cat mpileup.header.txt <(" + "sed '/NA/d' {prefix}.POST.txt | cut -f-4,7- " + ") > {prefix}.POSTth.txt" + ).format(prefix=prfx) + subprocess.check_call('bash -c "{}"'.format(script), shell=True) + print("File trimming DONE") - return() + print("Writing bed file") + + # Make bed file + with open_zipped(post_fl) as openfile, open_zipped(bed_fl, 'w') as bedOUT: + openfile.readline() # Discard header + for line in openfile: + ln = line.rstrip().split('\t') + snp = ln[1] + bedOUT.write('\t'.join([ln[0], str(int(snp) - 1), snp]) + '\n') + + print("Bed file writing complete") + + +########################################################################### +# Genotype Extraction for POST-calc SNPs # +########################################################################### -###################################################################################### -# GENOTYPE EXTRACTION OF POSTCALC SNPS -###################################################################################### -def genoExtract(prefix_name, trial_depths, individualslist, genosFile): +def genoExtract(prefix_name, trial_depths, genosFile, individualslist=None): + """Filter genotypes by SNP and individual. + + First loops through genotypes and filters to a temp file, then transposes, + then filters transposed lines by individual. + + Takes approximately 10 minutes and uses 10GB of memory on a genotype file + of 77 million SNPs and an individuals files of 66 individuals. + + Params + ------ + prefix_name : str + trial_depths : int + genosFile : str + Path to genotype file to parse. Must be in the format: + chrom\\tpos\\tref\\talt\\tgenotypes... + Genotypes must be in 0/1/2 format, and the file must contain a header + with the individual names. + Note: Genotype file *must* be sorted by position. 
+ individualslist : str, optional + Path to file of individuals to include separated by newlines + """ new_prefix = prefix_name + "." + str(trial_depths) - postdataIN = new_prefix + ".POSTth.txt" - outName = new_prefix + ".genotypes.txt" - print("file: ", new_prefix) + postdata = new_prefix + ".POSTth.txt" + out_name = new_prefix + ".genotypes.txt" + print("File prefix used:", new_prefix) + + # Track memory usage + process = show_mem() # Get list of sites with POSTfreq data - print("Getting SNPs") - postlocs = pd.read_csv(postdataIN, sep='\t') - postlocs['idx'] = postlocs[postlocs.columns[0]].astype(str) + '.' + postlocs[postlocs.columns[1]].astype(str) - print("Got SNPs") - - print("Reading file Locations") - with open(individualslist , 'r') as indv: - indv_list = frozenset(indv.read().strip().split('\n')) - dtypes = {i: int for i in indv_list} - sidx = {} - c = 0 - for i in indv_list: - sidx[i] = c - c += 1 - print("Individual Locations Read Complete") - - # Load genotypes - print("Loading genotypes") - genos = pd.read_csv(genosFile, sep='\t', dtype=dtypes) - genos['idx'] = genos[genos.columns[0]].astype(str) + '.' + genos[genos.columns[1]].astype(str) - print("Genotypes loaded") - print("Dropping indels") - l1 = len(genos) - genos = genos[(genos.alt.str.len() == 1) & (genos.ref.str.len() == 1)] - l2 = len(genos) - print("Dropped {} indels".format(l1-l2)) - print("Dropping duplicates") - genos = genos.drop_duplicates('idx', keep=False) - l3 = len(genos) - print("Dropped {} duplicates".format(l2-l3)) - print("Filtering by SNP") - genos = genos[genos.idx.isin(postlocs.idx)] - postlocs = postlocs[postlocs.idx.isin(genos.idx)] - print("Sorting") - genos = genos.sort_values(list(genos.columns[:2])) - postlocs = postlocs.sort_values(list(postlocs.columns[:2])) - genos = genos.set_index('idx', drop=True) - postlocs = postlocs.set_index('idx', drop=True) - print('Len {}: {}'.format(postdataIN, len(postlocs))) - print('Len genotypes: {}'.format(len(genos))) - print("Writing", postdataIN) - postlocs.to_csv(postdataIN, sep='\t', index=False) + print("Getting SNPs from POST data") + with open_zipped(postdata, 'r') as postin: + # Drop first line if it is a header (position is not numberic) + header = postin.readline().strip().split('\t') + if header[1].isdigit(): + postin.seek(0) + postlocs = [ + rows[0] + '.' + rows[1] for rows in [ + line.split('\t') for line in postin.readlines() + ] + ] + post_snps = frozenset(postlocs) + l = len(postlocs) + # The POST data must be unique per SNP, if it isn't we have a big problem + # somewhere in the earlier steps + assert l == len(post_snps) + + print("Got {} SNPs".format(l)) + show_mem(proc=process) + + done_snps = set() + add_to_done = done_snps.add + extras = 0 + indels = 0 + dups = 0 + + print("Filtering genotypes by SNP") + print("Time: {}".format(datetime.now())) + # Geno will have the same number of columns as the POST data has rows but + # will contain numeric matrix only, one column per SNP, and one row per + # individual *sorted to match individuals.txt* + with open_zipped(genosFile) as genos_in: + # Parse header + header = genos_in.readline().strip().split('\t') + if header[1].isdigit(): + raise Exception( + 'Genotype file {} has no header, cannot parse individuals' + .format(genosFile) + ) + # Extract the individuals from the header + inds = header[4:] + + # Parse the file + gt_dict = {} + for line in genos_in: + row = line.strip().split('\t') + # Filter by SNP + snp = row[0] + '.' 
+ row[1] + if snp not in post_snps: + extras += 1 + continue + if snp in done_snps: + dups += 1 + continue + # Filter out INDELS + if len(row[2]) > 1 or len(row[3]) > 1: + if len(row[2]) > 1: + raise Exception( + 'Genotype file has more than one char for ref at ' + 'line:\n{}'.format(line) + ) + for allele in row[3].split(','): + if len(allele) > 1: + indels += 1 + continue + # Keep only the genotype data, not the position data + gt_dict[snp] = row[4:] + # Done by adding this at the end, we can still get duplicates + # where the first entry was an indel or something else. + add_to_done(snp) + + print("Genotype filtering complete") + show_mem(proc=process) + + print("Time: {}".format(datetime.now())) + print("Dropped {} unneeded SNPs".format(extras)) + print("Dropped {} indels".format(indels)) + print("Dropped {} duplicates".format(dups)) + + print("Checking all POST SNPs matched") + if len(postlocs) != len(done_snps): + err_file = new_prefix + ".missing.snps" + with open_zipped(err_file, "w") as fout: + fout.write( + '\n'.join(sorted([i for i in postlocs if i not in done_snps])) + ) + raise Exception( + "{} SNPs in POST were not in the genotype file, written to {}" + .format(len(postlocs)-len(done_snps), err_file) + ) print("Done") - print("Writing", prefix_name+"geno.txt") - genos.to_csv(prefix_name+"geno.txt", sep='\t') + + # Get individual locations + if individualslist: + print( + "Getting individuals from individuals file ({})." + .format(individualslist) + ) + with open_zipped(individualslist) as fin: + individuals = [i.strip() for i in fin.read().strip().split('\n')] + ind_locs = [] + missing = [] + for ind in individuals: + if ind in inds: + ind_locs.append(inds.index(ind)) + else: + missing.append(ind) + if len(missing) > 0: + sys.stderr.write( + 'The following individuals are missing ' + 'from the genotype file:\n' + ) + sys.stderr.write('\n'.join(missing)) + raise IndexError('Missing some individuals from genotypes') + assert len(ind_locs) == len(individuals) + else: + print('No individuals file provided, keeping all individuals') + ind_locs = None + + print("Creating sorted matrix of only required individuals") + mat = [] + for snp in postlocs: + if ind_locs: + gt = gt_dict[snp] + mat.append([gt[i] for i in ind_locs]) + else: + mat.append(gt_dict[snp]) print("Done") - print("Transposing") - genos = genos.drop(genos.columns[:4], axis=1) - trans = genos.T - # print("Writing", prefix_name+"transpose.txt") - # trans.to_csv(prefix_name+"transpose.txt", sep='\t') - # print("Done") - - print("Filtering by individual") - trans = trans[trans.index.to_series().isin(indv_list)] - trans['sort'] = trans.index.to_series().map(sidx) - trans = trans.sort_values('sort') - trans = trans.drop('sort', axis=1) - print("Writing numeric matrix:", outName) - trans.to_csv(outName, sep='\t', header=False, index=False) + show_mem(proc=process) + del gt_dict + print("Number of individuals included: {}".format(len(mat[0]))) + + print("Transposing and writing") + with open_zipped(out_name, 'w') as fout: + fout.write( + '\n'.join( + ['\t'.join(i) for i in zip(*mat)] + ) + ) print("Done") - return + show_mem(proc=process) + del mat + print("Genotype filtering complete") + show_mem(proc=process) -###################################################################################### -# REGRESSION WITH Z-SCORE, P-VALUE -###################################################################################### -def regPVAL(prefix_name, trial_depths, numIndv): 
+########################################################################### +# Regressuion with Z-score and P-value # +########################################################################### + + +def regPVAL(prefix_name, trial_depths, numIndv=None): + """Run the actual regression R code.""" new_prefix = prefix_name + "." + str(trial_depths) - postdataIN = new_prefix + ".POSTth.txt" - genosoutName = new_prefix + ".genotypes.txt" + postdata = new_prefix + ".POSTth.txt" + genosout_name = new_prefix + ".genotypes.txt" + if not numIndv: + wc = subprocess.check_output(['wc', '-l', genosout_name]) + numIndv = wc.decode().split(' ')[0] + numIndv = str(int(numIndv)) + ### make sure location of R script is correct - commandLine = "sed -e 's!postdataIN!" + postdataIN + "!g'" + " regression_qtls.R | sed -e 's!genosoutName!" + genosoutName + "!g' | sed -e 's!prefix!" + new_prefix + "!g' | sed -e 's!numIndiv!" + numIndv + "!g'> " +prefix_name+ "1.R" - print(commandLine) - subprocess.check_call(commandLine, shell=True) - commandout = "Rscript " +prefix_name+ "1.R" - subprocess.check_call("chmod u+x " +prefix_name+ "1.R", shell=True) - subprocess.check_call(commandout, shell=True) - os.system("rm " +prefix_name+ "1.R") + r_script = os.path.join(os.path.dirname(__file__), 'regression_qtls.R') + if not os.path.isfile(r_script): + raise OSError('File not found: {}'.format(r_script)) + + subprocess.check_call( + ['Rscript', r_script, postdata, genosout_name, new_prefix, numIndv] + ) + print("regression DONE") - return() - - -################## -# COMMANDS # -################## - -if args.inputCommand == 'mpileup': - mpileup(args.allCHRfasta, args.mpileupBEDfile, args.sortedBam, args.prefix_name) -elif args.inputCommand == 'post': - postcalc(args.prefix_name, args.trial_depths, args.allelesFileName) - postTrim(args.prefix_name, args.trial_depths) -elif args.inputCommand == 'geno': - genoExtract(args.prefix_name, args.trial_depths, args.individualslist, args.genosFile) -elif args.inputCommand == 'qtls': - regPVAL(args.prefix_name, args.trial_depths, args.numIndv) + + +########################################################################### +# Tidy Results # +########################################################################### + +orig_headers = [ + 'Chr', 'position', 'REFallele', 'Depth', 'Acount', 'Ccount', 'Gcount', + 'Tcount', 'ALTdepth', 'REFDepth', 'ALTallele', 'REFfreq', 'ALTfreq', + 'POSTallele', 'POSTfreq', 'prechipfreq', 'pvalue', 'zvalue', + 'prevar', 'postvar', 'SNPpostfreq', 'SNPprefreq' +] +new_headers = [ + 'chrom', 'position', 'ref', 'depth', 'a_count', 'c_count', 'g_count', + 't_count', 'alt_depth', 'ref_depth', 'alt', 'ref_freq', 'alt_freq', + 'post_allele', 'post_freq', 'pre_freq', 'p_value', 'z_value', + 'pre_variance', 'post_variance', 'snp_postfreq', 'snp_prefreq' +] +final_headers = [ + 'chrom', 'position', 'rsid', 'open_allele', 'closed_allele', + 'pre_freq', 'post_freq', 'pre_variance', 'post_variance', 'beta', + 'p_value', 'z_value', 'ref', 'alt', 'depth', 'ref_depth', + 'alt_depth', 'ref_freq', 'alt_freq', 'snp_postfreq', 'snp_prefreq', + 'a_count', 'c_count', 'g_count', 't_count' +] +float_dtypes = { + 'position': 'object', + 'REFfreq': 'float128', + 'ALTfreq': 'float128', + 'POSTfreq': 'float128', + 'prechipfreq': 'float128', + 'pvalue': 'float128', + 'zvalue': 'float128', + 'prevar': 'float128', + 'postvar': 'float128', + 'SNPpostfreq': 'float128', + 'SNPprefreq': 'float128', +} + + +def parse_regression(regfile, textfile=None, pandasfile=None, bed=None): + """Parse a 
regression file into a pandas datafram. + + Optionally output either pickled panda or textfile. + + For all files, gzipped files or open file handle are permissable. + + Parameters + ---------- + regfile : file name + Tab delimited output from regression, header strictly enforced + textfile : file name, optional + Path to write tab delimited formatted output + pandasfile : file name, optional + Path to write pickled pandas dataframe + bed : file name, optional + Path to a bed file that contains rsids + + Returns + ------- + pandas.DataFrame + Parsed DataFrame + """ + import numpy as np + import pandas as pd + print('Loading regression file') + df = pd.read_csv(regfile, sep='\t', low_memory=False, dtype=float_dtypes, + float_precision=128) + assert df.columns.tolist() == orig_headers + df.columns = new_headers + # Set the index to chr.position + df['idx'] = df.chrom.astype(str) + '.' + df.position.astype(str) + df = df.set_index('idx', drop=True) + + # Add rsID if possible + if bed: + print('Loading rsids') + df['rsid'] = bed_to_series(bed) + else: + print('No bed file, all rsids will be "Unknown"') + df['rsid'] = 'Unknown' + + # Sort the data by position (default is by best pvalue) + print('Sorting') + df = df.sort_values(['chrom', 'position']) + + # Force datatypes and uppercase + df['position'] = df.position.astype(int) + for i in ['ref', 'alt', 'post_allele']: + df[i] = df[i].str.upper() + # Add the beta + df['beta'] = df.post_freq - df.pre_freq + + # Add open and closed + print('Calculating open/closed') + df['open_allele'] = np.where(df.post_freq > df.pre_freq, df.post_allele, df.alt) + df['closed_allele'] = np.where(df.post_freq > df.pre_freq, df.alt, df.post_allele) + df['open_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.open_allele) + df['closed_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.closed_allele) + + # Sort columns + print('Adjusting columns') + df = df[final_headers] + + # Write outputs + if pandasfile: + print('Writing pickled panda to', pandasfile) + df.to_pickle(pandasfile) + if textfile: + print('Writing parsed dataframe to', textfile) + df.to_csv(textfile, sep='\t') + + return df + + +########################################################################### +# Helpers # +########################################################################### + + +def bed_to_series(bedfile): + """Convert a bed file to a series of chr.pos->name.""" + import pandas as pd + args = dict(sep='\t', usecols=[0, 2, 3]) + with open_zipped(bedfile) as fin: + if not fin.readline().startswith('#'): + args['header'] = None + print('Reading in bed file') + bed = pd.read_csv(bedfile, **args) + bed.columns = ['chrom', 'pos', 'name'] + bed['idx'] = bed.chrom.astype(str) + '.' + bed.pos.astype(str) + bed = bed.drop_duplicates('idx', keep=False) + bed = bed.set_index('idx', drop=True) + print('Parsed %s SNPs', len(bed)) + return bed.name + + +def show_mem(proc=None): + """Print memory usage.""" + if not psutil: + return None + if not proc: + proc = psutil.Process(os.getpid()) + print( + "Memory: {:.2f}GB" + .format(proc.memory_info().rss/1024/1024/1024) + ) + return proc + + +def open_zipped(infile, mode='r'): + """Return file handle of file regardless of compressed or not. + + Returns already opened files unchanged + Text mode automatic for compatibility with python2. 
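+
+    A minimal usage sketch (the file name is hypothetical):
+
+        # hypothetical gzipped genotype matrix produced by the prep step
+        with open_zipped('cis_var.genotypes.txt.gz') as fin:
+            for line in fin:
+                chrom, pos = line.split('\t')[:2]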
+ """ + # Return already open files + if hasattr(infile, 'write'): + return infile + + # Make text mode automatic + if len(mode) == 1: + mode = mode + 't' + + # Refuse to handle non-strings that aren't files. + if not isinstance(infile, str): + raise ValueError("i cannot open a filename that isn't a string.") + + # Treat '-' appropriately + if infile == '-': + if 'w' in mode: + return sys.stdout + return sys.stdin + + # If possible open zipped files + if infile.endswith('.gz'): + return gzip.open(infile, mode) + if infile.endswith('.bz2'): + if hasattr(bz2, 'open'): + return bz2.open(infile, mode) + return bz2.bz2file(infile, mode) + + # Fall back on regular open + return open(infile, mode) + + +# Simple location file handling + +def parse_loc_file(infile, mode='locations'): + """Return a dict of SNPs from a bed/vcf/text of chr postition (base-1).""" + if mode not in ['locations', 'alleles', 'names']: + raise ValueError( + 'mode must be one of locations, alleles, names. Is {}' + .format(mode) + ) + names = infile.split('.') + if 'vcf' in names: + fl_tp = 'vcf' + elif 'bed' in names: + fl_tp = 'bed' + else: + fl_tp = 'txt' + splt_char = '\t ' if fl_tp == 'txt' else '\t' + with open_zipped(infile) as fl: + for line in fl: + loc = fl.tell() + # Comment lines + if line.startswith('#'): + loc = fl.tell() + # First non-comment + else: + header = line.strip().split(splt_char) + if line[1].isdigit(): + # Not a header + fl.seek(loc) + break + # All non-head lines + if mode == 'locations': + res = _get_locs(fl, fl_tp, splt_char) + elif mode == 'alleles': + res = _get_alleles(fl, fl_tp, splt_char) + elif mode == 'names': + res = _get_names(fl, fl_tp, splt_char) + else: + raise ValueError('Invalid mode') + # Generalize chromosome name + for k, v in res.items(): + res[flip_chrom(k)] = v + return res + + +def _get_locs(fl, tp, sp): + result = defaultdict(set) + for line in fl: + f = line.rstrip().split(sp) + if tp == 'bed': + loc = int(f[2]) + else: + loc = int(f[1]) + result[f[0]].add(loc) + return result + + +def _get_alleles(fl, tp, sp): + if tp == 'bed': + raise ValueError("Can't get alleles from a bed file") + result = defaultdict(dict) + for line in fl: + f = line.rstrip().split(sp) + loc = int(f[1]) + if tp == 'vcf': + ref = f[3] + alt = f[4] + else: + ref = f[2] + alt = f[3] + result[f[0]][loc] = (ref, alt) + return result + + +def _get_names(fl, tp, sp): + result = defaultdict(dict) + for line in fl: + f = line.rstrip().split(sp) + if tp == 'bed': + name = f[3] + loc = int(f[2]) + else: + name = f[2] + loc = int(f[1]) + result[f[0]][loc] = name + return result + + +def flip_chrom(chrom): + """Return opposite of chr# or # for chromosome name.""" + if isinstance(chrom, int): + chrom = str(chrom) + if chrom.startswith('chr'): + return chrom[3:] + return 'chr' + chrom + + +# Genotype file handling + + +def geno_file(infile, check_header=None, ind_list=None, + get_header=False, get_inds=False, log=sys.stderr): + """Wrapper for genotype files, currently only vcf + + Parameters + ---------- + infile : str + Path to a vcf file, can be zipped. + check_header : list, optional + If a list is provided, assert that header matches before iterating. + ind_list : list of str, optional + If list provided, filter output to only include these individuals. 
+ get_header : bool, optional + Return a header instead of yielding a row + get_inds : bool, optional + Return a list of individuals instead of yielding a row + + Yields + ------ + chr : str + pos : str (base 1) + ref : str + alt : str + name : str + inds : list of str + """ + name = infile.split('.') + if 'vcf' in name: + return vcf_file_gt( + infile, check_header, ind_list, get_header, get_inds, log + ) + else: + raise NotImplementedError( + 'Currently only VCFs supported, filetype required in name' + ) + + +def vcf_file_gt(vcf, check_header=None, ind_list=None, + get_header=False, get_inds=False, log=sys.stderr): + """VCF file iterator with goodies + + Parameters + ---------- + vcf : str + Path to a vcf file, can be gzipped + check_header : list, optional + If a list is provided, assert that header matches before iterating. + ind_list : list of str, optional + If list provided, filter output to only include these individuals. + get_header : bool, optional + Return a header instead of yielding a row + get_inds : bool, optional + Return a list of individuals instead of yielding a row + bad : file + File or handle to write errors + + Yields + ------ + chr : str + pos : str (base 1) + ref : str + alt : str + name : str + inds : list of str + """ + # Get header at genotype loc + with open_zipped(vcf) as fin: + line = fin.readline() + while True: + if not line.startswith('#'): + break + header = line + pos = fin.tell() + line = fin.readline() + + # Get genotype location + try: + gti = line.split('\t')[8].split(':').index('GT') + except ValueError: + raise ValueError( + 'GT column missing from VCF format string. In need ' + 'the genotype info location to know where to look ' + 'for the genotype (looks like 0|1)' + ) + + headers = header.strip().split('\t') + inds = headers[9:] + if check_header and header != check_header: + sys.stderr.write('{} bad header\n'.format(vcf)) + sys.stderr.write('Should be:\n{}\nInd:\n{}\n'.format(check_header, header)) + raise Exception('{} file had an invalid header'.format(vcf)) + if get_header: + return header + if get_inds: + return inds + + return _vcf_file(vcf, inds, ind_list, gti, pos, log) + + +def _vcf_file(vcf, inds, ind_list, gti, pos, log): + """Iterator for VCF.""" + + # Get individual locations for lookup/filter + ind_pos = [] + total_inds = len(inds) + if ind_list: + ind_count = len(ind_list) + if sorted(ind_list) != sorted(inds): + for ind in ind_list: + ind_pos.append(inds.index(ind)) + else: + ind_count = len(inds) + + short_lines = 0 + bad_gts = 0 + count = 0 + + # Actually parse file + with open_zipped(vcf) as fin: + fin.seek(pos) + count = 0 + for line in fin: + count += 1 + f = line.strip().split('\t') + out = [f[0], int(f[1]), f[3], f[4], f[2]] + gti = f[8].split(':').index('GT') + ind_data = f[9:] + if not len(ind_data) == total_inds: + short_lines += 1 + continue + if ind_pos: + these_inds = [] + for loc in ind_pos: + these_inds.append(ind_data[loc]) + if len(these_inds) != ind_count: + short_lines += 1 + continue + else: + these_inds = ind_data + for ind in these_inds: + gt = parse_gt(ind.split(':')[gti]) + if not gt: + break + out.append(gt) + if (len(out) - 5) != ind_count: + # sys.stderr.write( + # 'Bad line: {}\t{}\t{}\nLen: {}\tExpected: {}\n'.format( + # count, f[0], f[1], len(out)-4, ind_count + # ) + # ) + # raise Exception('Bugger') + bad_gts += 1 + continue + yield out + log.write( + 'Total: {}\nBad Genotypes: {}\nShort Lines (not enough inds): {}\n' + .format(count, bad_gts, short_lines) + ) + print( + '{}: Total: {}, Bad 
Genotypes: {}, Short Lines (not enough inds): {}' + .format(vcf, count, bad_gts, short_lines) + ) + return + + +def parse_gt(gt): + """Convert common genotypes into 0,1,2.""" + if '|' in gt: + if gt == '0|0': + gt = '0' + elif gt in ['0|1', '1|0']: + gt = '1' + elif gt == '1|1': + gt = '2' + else: + gt = None + elif '/' in gt: + if gt == '0/0': + gt = '0' + elif gt in ['0/1', '1/0']: + gt = '1' + elif gt == '1/1': + gt = '2' + else: + gt = None + elif gt.isdigit(): + gt = gt + else: + gt = None + return gt + + + +########################################################################### +# Command Line Parsing # +########################################################################### + + +def main(argv=None): + """Run as a script, parse command line args.""" + if not argv: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + # Subcommands + modes = parser.add_subparsers( + dest='inputCommand', + metavar='{prep,mpileup,post,geno,qtls,tidy,get_snake}' + ) + + # Shared commands + shared_cmds = argparse.ArgumentParser(add_help=False) + run_opts = shared_cmds.add_argument_group("Run Options") + run_opts.add_argument( + '-F', '--SampleName', dest='prefix_name', default='cis_var', + help='sample/population name (default: cis_var)' + ) + run_opts.add_argument( + '-r', '--readDepth', type=int, dest='trial_depths', default=20, + help='minimum read depth per variant (default: 20)' + ) + + # prep + prep_cmd_grp = modes.add_parser( + 'prep', description="Prepare genotype files", + parents=[shared_cmds], help="Prepare genotype files", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + prep_cmd = prep_cmd_grp.add_argument_group( + "Prep Options" + ) + prep_cmd.add_argument( + '-i', '--all-inds', + help='File of individuals in all groups, one per line' + ) + prep_cmd.add_argument( + '-l', '--limit-file', + help='BED/VCF/txt file of SNPs to consider' + ) + prep_cmd.add_argument( + '-a', '--allele-file', + help='BED/VCF/txt file of alleles to override VCF allels (subset of vcf)' + ) + prep_cmd.add_argument( + '--chrom-format', choices={'chr', 'num', 'ignore'}, default='ignore', + help='chr: make format "chr#", num: make format "#", ' + + 'ignore: do nothing (default: ignore)' + ) + prep_cmd.add_argument( + '--include-indels', action='store_false', + help='Do not skip indels' + ) + prep_cmd.add_argument( + '-c', '--cores', type=int, default=CORES, + help='Number of cores to use (default: all)' + ) + prep_cmd.add_argument( + 'vcf_files', nargs='+', help='VCF files with genotypes' + ) + + + # mpileup + mpileup_cmd_grp = modes.add_parser( + 'mpileup', description="Run mpileup", + parents=[shared_cmds], help="Run mpileup", + aliases=['m'], + formatter_class=argparse.RawDescriptionHelpFormatter + ) + mpileup_cmd = mpileup_cmd_grp.add_argument_group( + "mpileup Options" + ) + mpileup_cmd.add_argument( + '-f', '--fasta', required=True, dest='allCHRfasta', + help='fasta file with all chromosomes (Required)' + ) + mpileup_cmd.add_argument( + '-B', '--BAMfile', required=True, dest='sortedBam', + help='sorted BAM file (Required)' + ) + mpileup_cmd.add_argument( + '-p', '--mpileupBEDfile', required=False, dest='mpileupBEDfile', + help='BED to use instead of the BED generated in the prep phase ' + + '(Do not use if possible, use prep with limit instead)' + ) + + # POST + post_cmd_grp = modes.add_parser( + 'post', description="Run POST frequency calculation", + parents=[shared_cmds], help="Run POST frequency 
calculation", + aliases=['p'], + formatter_class=argparse.RawDescriptionHelpFormatter + ) + post_cmd = post_cmd_grp.add_argument_group("POST Options (Deprecated)") + post_cmd.add_argument( + '-a', '--allelesFile', required=False, dest='genosFile', + help='The genotypes file, ' + + '(Optional, default is file created in prep)' + ) + + # geno + geno_cmd_grp = modes.add_parser( + 'geno', description="Munge genotypes to prepare for regression", + parents=[shared_cmds], + help="Munge genotypes to prepare for regression", aliases=['g'], + formatter_class=argparse.RawDescriptionHelpFormatter + ) + geno_cmd = geno_cmd_grp.add_argument_group("Genotype Options") + geno_cmd.add_argument( + '-g', '--genoFile', required=False, dest='genosFile', + help='The genotypes file, ' + + '(Optional, default is file created in prep)' + ) + geno_cmd.add_argument( + '-i', '--individualsFile', required=False, dest='individualslist', + help='list of individuals matching genotype matrix; one indv per line' + ) + + # Regression + qtls_cmd_grp = modes.add_parser( + 'qtls', description="Run the regression", + parents=[shared_cmds], aliases=['q', 'regression', 'r'], + help="Run the regression", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + qtls_cmd = qtls_cmd_grp.add_argument_group("Regression Options") + qtls_cmd.add_argument( + '-n', '--numberIndividuals', dest='numIndv', + help='The number of individuals in the pool ' + + '(if omitted, calculated from genotype file length)' + ) + + # Tidy + tidy_cmd = modes.add_parser( + 'tidy', description="Tidy up regression, call open/closed", + parents=[shared_cmds], + help="Tidy up regression, call open/clsoed", aliases=['t'], + formatter_class=argparse.RawDescriptionHelpFormatter + ) + inputs = tidy_cmd.add_argument_group('inputs') + outputs = tidy_cmd.add_argument_group('outputs') + inputs.add_argument('-b', '--bedfile', + help="BED file to extract rsIDs from (optional)") + outputs.add_argument('-t', '--textfile', help="Parsed output") + outputs.add_argument('-p', '--pandasfile', help="Parsed dataframe") + + # Download pipeline + snk = modes.add_parser( + 'get_snake', + description="Download Snakefile and config to current dir", + help="Download Snakefile and config", aliases=['download'], + formatter_class=argparse.RawDescriptionHelpFormatter + ) + snk.add_argument('-x', '--extra', action='store_true', + help='Get additional sample and cluster configs') + + args = parser.parse_args(argv) + + if args.inputCommand in ['prep']: + if args.chrom_format == 'ignore': + chrom_format = 0 + elif args.chrom_format == 'chr': + chrom_format = 1 + elif args.chrom_format == 'num': + chrom_format = 2 + prep_files( + args.vcf_files, args.prefix_name, inds=args.all_inds, + limit_file=args.limit_file, alleles_file=args.allele_file, + chrom_format=chrom_format, skip_indels=args.include_indels, + cores=args.cores + ) + + elif args.inputCommand in ['mpileup', 'm']: + mpileup( + args.allCHRfasta, args.sortedBam, args.prefix_name, + mpileupBEDfile=args.mpileupBEDfile, + ) + + elif args.inputCommand in ['post', 'p']: + alleles_file = args.genosFile + if not alleles_file or not os.path.isfile(alleles_file): + alleles_file = '{}.genotypes.txt.gz'.format(args.prefix_name) + postcalc(args.prefix_name, alleles_file, args.trial_depths) + postTrim(args.prefix_name, args.trial_depths) + + elif args.inputCommand in ['geno', 'g']: + if args.genosFile: + genosFile = args.genosFile + else: + genosFile = '{}.genotypes.txt.gz'.format(args.prefix_name) + genoExtract( + args.prefix_name, 
args.trial_depths, + genosFile, args.individualslist + ) + + elif args.inputCommand in ['qtls', 'q', 'regression', 'r']: + regPVAL(args.prefix_name, args.trial_depths, args.numIndv) + + elif args.inputCommand in ['tidy', 't']: + prefix = args.prefix_name + '.' + str(args.trial_depths) + ifl = prefix + '.total.txt' + ofl = args.textfile if args.textfile else '{}.final.txt'.format(prefix) + pfl = args.pandasfile if args.pandasfile else '{}.final.pd'.format(prefix) + bedfile = args.bedfile + if not args.bedfile: + bfl = '{}.locations.bed.gz'.format(prefix) + if os.path.isfile(bfl): + bedfile = bfl + parse_regression( + ifl, textfile=ofl, pandasfile=pfl, bed=bedfile + ) + + elif args.inputCommand in ['get_snake', 'download']: + try: + from wget import download + except ImportError: + sys.stderr.write( + 'Cannot get files with wget installed, ' + 'try pip install --user wget\n' + ) + return 1 + url = 'https://raw.githubusercontent.com/TheFraserLab/cisVar/master/pipeline/{0}' + files = ['Snakefile', 'cisvar_config.json'] + if args.extra: + files += ['cisvar_samples.json', 'cluster.json'] + for fl in files: + download(url.format(fl)) + print( + 'Done, edit cisvar_config.json for your needs, use README on ' + 'github repo for help' + ) + + else: + sys.stderr.write('Unrecognized command {}'.format(args.inputCommand)) + parser.print_help(file=sys.stderr) + return 2 + + +if __name__ == '__main__' and '__file__' in globals(): + sys.exit(main()) diff --git a/pipeline/Snakefile b/pipeline/Snakefile new file mode 100644 index 0000000..fc7df38 --- /dev/null +++ b/pipeline/Snakefile @@ -0,0 +1,513 @@ +# -*- coding: utf-8 -*- +""" +Run cisVar + +Make sure input BAMs have been filtered by Hornet. +""" +import os +import json +from glob import glob +from textwrap import dedent +from subprocess import check_output + +import wget + +############################################################################### +# Pre-Defined Scripts # +############################################################################### + +# For use with modules +module = dedent( + """\ + module () { + eval $($LMOD_CMD bash "$@") + [ $? 
= 0 ] && eval $(${LMOD_SETTARG_CMD:-:} -s sh) + }""" +) + +# At the start of all scripts +script_prep = dedent( + """\ + {module} + module load fraserconda + """ +).format(module=module) + +# After script prep for R based rules +R_prep = dedent( + """\ + module load R + """ +) + +############################################################################### +# Editable Config # +############################################################################### + + +# Pick a config for your environment +configfile: "cisvar_config.json" + +# Name for this run +name = config['name'] + +# Read depth +read_depth = config['read_depth'] + +# Software locations +cisvar_folder = os.path.expandvars(config['cisvar']) # https://github.com/TheFraserLab/cisVar +if os.path.isdir(cisvar_folder): + cisvar = cisvar_folder + scripts = os.path.join(cisvar_folder, 'scripts') +else: + res = check_output(['command', '-V', 'cisVar']) + if 'not found' in res: + raise ValueError('Cannot find cisVar code') + cisvar = os.path.dirname(res.split(' ')[-1]) + scripts = cisvar + +# +############################################################################### +# Should Not Need To Edit Below Here # +############################################################################### + + +# Samples, either a list or a dictionary, dictionary allows sample: group, +# where group is used for genotype and individuals files +# Can be a separate json file +samples = config['samples'] +if isinstance(samples, str): + if os.path.isfile(config['samples']): + with open(config['samples']) as fin: + samples = json.load(fin) + else: + raise OSError('Could not find samples file: {}'.format(samples)) +# Allow samples to map to the same individual and genotype files +# in a group. +if isinstance(samples, dict): + groups = samples + samples = list(samples.keys()) +elif isinstance(samples, list): + groups = {i: i for i in samples} +else: + print(samples) + raise ValueError('samples must be a list or dict') +group_set = set(groups.values()) + +# Note, none of the below files are required, if they don't match anything, +# they are ignored. They can include {sample} inside the string. +alleles = config['alleles'] # Specify alleles in VCF or BED format to override those in vcfs +locs = config['locs'] # Only include these locations +inds = config['inds'] # Limit to these individuals, in this order + +# VCFs +vcfs = config['vcfs'] +if isinstance(vcfs, str): + vcfs = sorted(glob(os.path.expandvars(config['vcfs']))) + +# VCF sorting, only use if your vcf(s) are separated by chromosome and the +# chromosomes are in chr# format. Controlled by sort_vcfs in config file. +chrom = re.compile(r'chr([0-9mMxXyYtT]+)') +def k(s): + "Sort by chrom." 
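+    # Worked example (hypothetical file names): k("geno.chr2.vcf.gz") -> 2,
+    # k("geno.chrX.vcf.gz") -> 30, k("geno.chrM.vcf.gz") -> 32, so autosomes
+    # sort numerically and X/Y/MT sort after them.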
+ if chrom.search(s): + s = chrom.findall(s)[0] + else: + print(s) + raise ValueError("Couldn't find 'chr' string in vcf {}".format(s)) + if s.isdigit(): + s = int(s) + elif s.upper() == 'X': + s = 30 + elif s.upper() == 'Y': + s = 31 + elif 'M' in s.upper(): + s = 32 + else: + s = 99 + return s + +if config['sort_vcfs'] and len(vcfs) > 1: + vcfs = list(sorted([i for i in vcfs if 'chr' in i], key=k)) + +# Only the prep step is inherently parallel +cores = len(vcfs) +max_cores = config['max_cores'] +cores = cores if cores <= max_cores else max_cores + + +############################################################################### +# Targets # +############################################################################### + +wildcard_constraints: + name="[^\.]+", + sample="[^\.]+", + depth="[^\.]+", + pop="[^\.]+" + +localrules: all, combine, genotypes + + +# Make various inputs optional, only for prep step +ruleorder: prep_full > prep_no_alleles > prep_no_locs + + +# Run everything except combination, default pipeline +rule all: + input: + expand( + '{name}.{sample}.{depth}.regression.txt', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.total.txt.prepost.png', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.regression.pd', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.genotypes.txt', + name=name, sample=samples, depth=read_depth + ) + threads: + 1 + resources: + mem_mb=500 + params: + time="48:00:00" + + +# Also combine samples at the end +rule combine: + input: + expand( + '{name}.{sample}.{depth}.regression.txt', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.total.txt.prepost.png', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.regression.pd', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.{sample}.{depth}.genotypes.txt', + name=name, sample=samples, depth=read_depth + ), + expand( + '{name}.combined.{depth}.regression.pd', + name=name, depth=read_depth + ), + expand( + '{name}.merged.{depth}.regression.pd', + name=name, depth=read_depth + ) + threads: + 1 + resources: + mem_mb=500 + params: + time="48:00:00" + + +# Do preparation only +rule prep: + input: + expand( + '{name}.{group}.genotypes.txt.gz', + name=name, group=group_set + ), + expand( + '{name}.{group}.individuals.txt.gz', + name=name, group=group_set + ), + expand( + '{name}.{group}.genotypes.txt.gz', + name=name, group=group_set + ) + threads: + 1 + resources: + mem_mb=500 + params: + time="48:00:00" + + +############################################################################### +# cisVar # +############################################################################### + + +# Locs, inds, and alleles are optional and can be removed +rule prep_full: + input: + vcfs=ancient(vcfs), + alls=ancient(alleles), + locs=ancient(locs), + inds=ancient(inds) + output: + '{name}.{group}.genotypes.txt.gz', + '{name}.{group}.individuals.txt.gz', + '{name}.{group}.locations.bed.gz' + threads: + cores + resources: + mem_mb=20000 + params: + time="30:00:00", + chrom_fmt=config['chrom_format'] + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py prep -F {name}.{wildcards.group} \\ + -a {input.alls} -i {input.inds} -l {input.locs} \\ + --chrom-format {params.chrom_fmt} -c {threads} {input.vcfs} + """ + ) + +rule prep_no_alleles: + input: + vcfs=ancient(vcfs), + locs=ancient(locs), + 
inds=ancient(inds) + output: + '{name}.{group}.genotypes.txt.gz', + '{name}.{group}.individuals.txt.gz', + '{name}.{group}.locations.bed.gz' + threads: + cores + resources: + mem_mb=20000 + params: + time="30:00:00", + chrom_fmt=config['chrom_format'] + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py prep -F {name}.{wildcards.group} \\ + -i {input.inds} -l {input.locs} \\ + --chrom-format {params.chrom_fmt} -c {threads} {input.vcfs} + """ + ) + +rule prep_no_locs: + input: + vcfs=ancient(vcfs), + inds=ancient(inds) + output: + '{name}.{group}.genotypes.txt.gz', + '{name}.{group}.individuals.txt.gz', + '{name}.{group}.locations.bed.gz' + threads: + cores + resources: + mem_mb=20000 + params: + time="30:00:00", + chrom_fmt=config['chrom_format'] + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py prep -F {name}.{wildcards.group} \\ + -i {input.inds} \\ + --chrom-format {params.chrom_fmt} -c {threads} {input.vcfs} + """ + ) + + +rule mpileup: + input: + bam=ancient(config['bams']), # Must have {sample} in the json + bed=lambda x: '{name}.{group}.locations.bed.gz'.format( + name=name, group=groups[x.sample] + ), + fasta=ancient(os.path.expandvars(config['genome_fa'])) + output: + '{name}.{sample}.mpileup.txt.gz' + threads: + 1 + resources: + mem_mb=20000 + params: + time="30:00:00" + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py mpileup -F {name}.{wildcards.sample} \\ + -f {input.fasta} -p {input.bed} -B {input.bam} + """ + ) + + +rule post: + input: + mpileup='{name}.{sample}.mpileup.txt.gz', + geno=lambda x: '{name}.{group}.genotypes.txt.gz'.format( + name=name, group=groups[x.sample] + ) + output: + '{name}.{sample}.{depth}.POSTth.txt', + '{name}.{sample}.{depth}.POST.txt', + '{name}.{sample}.{depth}.bed' + threads: + 1 + resources: + mem_mb=32000 + params: + time="24:00:00" + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py post \\ + -F {name}.{wildcards.sample} -r {read_depth} \\ + -a {input.geno} + """ + ) + + +rule genos: + input: + post='{name}.{sample}.{depth}.POSTth.txt', + geno=lambda x: "{name}.{group}.genotypes.txt.gz".format( + name=name, group=groups[x.sample] + ), + inds=lambda x: '{name}.{group}.individuals.txt.gz'.format( + name=name, group=groups[x.sample] + ) + output: + '{name}.{sample}.{depth}.genotypes.txt' + threads: + 1 + resources: + mem_mb=26000 + params: + time="24:00:00" + shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py geno \\ + -F {name}.{wildcards.sample} -r {read_depth} -i {input.inds} \\ + -g {input.geno} + """ + ) + + +rule regression: + input: + post='{name}.{sample}.{depth}.POSTth.txt', + geno='{name}.{sample}.{depth}.genotypes.txt', + inds=lambda x: '{name}.{group}.individuals.txt.gz'.format( + name=name, group=groups[x.sample] + ) + output: + '{name}.{sample}.{depth}.total.txt', + '{name}.{sample}.{depth}.nonsigLD', + '{name}.{sample}.{depth}.sigLD' + threads: + 1 + resources: + mem_mb=26000 + params: + time="20:00:00" + shell: + dedent( + """\ + {script_prep} + {R_prep} + python3 {cisvar}/cisVar.py qtls \\ + -F {name}.{wildcards.sample} -r {read_depth} \\ + -n `zcat {input.inds} | wc -l | awk '{{print $1}}'` + """ + ) + + +rule tidy: + input: + reg='{name}.{sample}.{depth}.total.txt', + bed=lambda x: '{name}.{group}.locations.bed.gz'.format( + name=name, group=groups[x.sample] + ) + output: + txt='{name}.{sample}.{depth}.regression.txt', + pd='{name}.{sample}.{depth}.regression.pd' + threads: + 1 + resources: + mem_mb=30000 + params: + time="06:00:00" 
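+    # Calls `cisVar.py tidy`, which runs parse_regression() on the
+    # {name}.{sample}.{depth}.total.txt regression output and writes the
+    # tab-separated table and pickled DataFrame named in `output`.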
+ shell: + dedent( + """\ + {script_prep} + python3 {cisvar}/cisVar.py tidy \\ + -F {name}.{wildcards.sample} -r {read_depth} \\ + -b {input.bed} -t {output.txt} -p {output.pd} + """ + ) + + +rule plot: + input: + reg='{name}.{sample}.{depth}.total.txt' + output: + '{name}.{sample}.{depth}.total.txt.prepost.png' + threads: + 1 + resources: + mem_mb=5000 + params: + time="10:00:00" + shell: + dedent( + """\ + {script_prep} + {R_prep} + Rscript {scripts}/plot_fun.R {input.reg} + """ + ) + +rule combine_dfs: + input: + expand( + '{name}.{sample}.{depth}.regression.pd', + name=name, sample=samples, depth=read_depth + ) + output: + expand( + '{name}.combined.{depth}.regression.pd', + name=name, depth=read_depth + ), + expand( + '{name}.merged.{depth}.regression.pd', + name=name, depth=read_depth + ) + threads: + 1 + resources: + mem_mb=15000 + params: + time="10:00:00", + col_name=config['sample_name'], + search_str=expand( + '{name}.{{sample}}.{depth}.regression.pd', + name=name, depth=read_depth + ) + shell: + dedent( + """\ + {script_prep} + python3 {scripts}/combine.py -c {params.col_name} {params.search_str} + """ + ) diff --git a/pipeline/cisvar_config.json b/pipeline/cisvar_config.json new file mode 100644 index 0000000..74634e8 --- /dev/null +++ b/pipeline/cisvar_config.json @@ -0,0 +1,18 @@ +{ + "name": "cis_var", + "sample_name": "population", + "samples": [""], + "read_depth": 20, + "max_cores": 16, + "sort_vcfs": 1, + "chrom_format": "chr", + "bams": "", + "cisvar": "", + "vcfs": "", + "genome_fa": "/data/genomes/human/hg19/hg19.fa", + "_comment": "The following three are file descriptors and must contain {group} (same as sample if samples is a list), if they don't exist they are ignored.", + "_comment2": "It's a really good idea to include at least the individuals file.", + "alleles": "none..{group}.alleles.txt.gz ", + "locs": "{group}.locations.bed.gz", + "inds": "{group}.individuals.txt.gz" +} diff --git a/pipeline/cisvar_samples.json b/pipeline/cisvar_samples.json new file mode 100644 index 0000000..eb78bc4 --- /dev/null +++ b/pipeline/cisvar_samples.json @@ -0,0 +1 @@ +["YRI", "IBS", "ASW", "CEU", "CHB", "ESN", "FIN", "GWD", "IBS", "LWK", "TSI"] diff --git a/pipeline/cluster.json b/pipeline/cluster.json new file mode 100644 index 0000000..8714067 --- /dev/null +++ b/pipeline/cluster.json @@ -0,0 +1,9 @@ +{ + "__default__" : + { + "queue" : "", + "name" : "cisvar.{rule}.{wildcards}", + "out" : "logs/{rule}.{wildcards}.%j.out", + "err" : "logs/{rule}.{wildcards}.%j.err" + } +} diff --git a/regression_qtls.R b/regression_qtls.R index 3e2e2fd..2bd2fa6 100644 --- a/regression_qtls.R +++ b/regression_qtls.R @@ -1,12 +1,35 @@ #!/usr/bin/R - -#==================================================================================== -# # AUTHOR: Ashley Tehranchi, tehranchi576@gmail.com # ORGANIZATION: Stanford University -# -#==================================================================================== + +ldCalculation <- function(sig, outfile) { + + # Max distance between two SNPs to use for FDR calculation + fdr.window = 200000 + + r2.sig = vector("numeric", length=nrow(sig)) + holdnum = nrow(sig)-1 + maxcol = ncol(sig) + + # If two SNPs are within FDR window (default 200 kb) do correlation between + # their genotypes Otherwise just set the r-squared to 0 + for (i in 1:holdnum) { + if (sig[i,2] - sig[i+1,2] < fdr.window) { + r2.sig[i] <- cor(t(sig[i,20:maxcol]), t(sig[i+1,20:maxcol]), method="pearson") + } + else{ r2.sig[i] <- 0} + } + + # Add r-squared + cat("Adding r-squared\n"); 
flush.console() + sig$r2 <- (r2.sig)^2 + ldfileOUTsig = sig[, c("Chr", "position", "r2")] + ldfileOUTsigFinal = ldfileOUTsig[complete.cases(ldfileOUTsig),] + + cat("Writing out LD data\n"); flush.console() + write.table(ldfileOUTsigFinal, file=outfile, sep="\t", quote=F, row.names=F, col.names=T) +} linearRegression <- function(pd, gt, tmppfx, indiv) { @@ -18,24 +41,32 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { MAF.low = 0.02 MAF.high = 1-MAF.low - # P-Value + # P-Value, used to split sig/nonsig for LD calculation P = 0.05 # Force postfreq to be between eps and 1-eps (e.g 0.0001 and 0.9999) eps = 0.0001 + # Figure sizing/format defautls + fig.width = 1200 + fig.height = 1200 + fig.font = 40 + ################## # Read in Data # ################## - cat("Regression... "); flush.console() + cat("..cisVAR Regression..\n"); flush.console() options("scipen"=100) n.indi = as.numeric(indiv) print(pd) + cat("Loading POST..\n"); flush.console() postTotal = read.table(pd, sep="\t", header=T) - print(dim(pd)) + cat("Loading Genotypes\n"); flush.console() genofile = matrix(scan(file = gt, sep = "\t", what = double(), nlines = n.indi), byrow = TRUE, nrow = n.indi) + cat("Genotype Dimensions: "); flush.console() print(dim(genofile)) + genoTotal = 1 - (0.5 * genofile) ############################ @@ -54,13 +85,16 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { geno.end=15 + n.indi # Bind the matrices + cat("Combining\n"); flush.console() a = t(genoTotal) b = cbind(postTotal,a) # MAF Filter + cat(sprintf("Filtering by MAF %f\n", MAF.low)); flush.console() mafsub = b[apply(b[,geno.start:geno.end], MARGIN = 1, function(x) mean(as.numeric(x)) >= MAF.low & mean(as.numeric(x)) <= MAF.high), ] # Split the matrices again + cat("Splitting\n"); flush.console() post = mafsub[,post.start:post.end] genotypes = as.matrix(mafsub[,geno.start:geno.end]) # postTemp = mafsub @@ -81,11 +115,15 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { Y = postProp - genotypes[,n.indi] # Force postfreq to be between eps and 1-eps (e.g 0.0001 and 0.9999) + cat(sprintf("Filtering POST frequencies by eps of %f\n", eps)); flush.console() props.for.weights = pmin(1-eps,pmax(estimated.props,eps)) # weight = depth / (adjusted.post * (1-adjusted.post)) + cat("Calculating Weights\n"); flush.console() regression.weights = depths / (props.for.weights * (1-props.for.weights) ) good = which( (postProp>0.1) & (postProp < 0.9)) + + # Regression is here: m = lm(Y[good] ~ G[good,]-1,weights=regression.weights[good]) ## run without intercept coefs = m$coef s = summary(m) @@ -94,17 +132,20 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { big.cov.mat[-n.indi,-n.indi] = cov.mat big.cov.mat[n.indi,n.indi] = sum(cov.mat) big.cov.mat[n.indi,-n.indi] = big.cov.mat[-n.indi,n.indi] = -rowSums(cov.mat) + cat("Regression...\n"); flush.console() vars = sapply(1:n.snps, function(i) genotypes[i,] %*% big.cov.mat %*% genotypes[i,]) vars[vars < 0] <- 0 - all.coefs = c(coefs, 1-sum(coefs)) - preProps = genotypes %*% as.matrix(all.coefs) + all.coeffs = c(coefs, 1-sum(coefs)) + preProps = genotypes %*% as.matrix(all.coeffs) preProps[preProps > 1] <- 1 preVars = vars postProps = estimated.props postProps[postProps > 1] <- 1 postVars = 1/regression.weights postVars[postVars > 1] <- 1 - Zs = (postProps - preProps)/sqrt(preVars +postVars) + cat("Calculating Z values\n"); flush.console() + Zs = (postProps - preProps)/sqrt(preVars + postVars) + cat("Calculating P values\n"); flush.console() p.values = 2*(1-pnorm(abs(Zs))) analytic.pv = 
function(preProp,preVar,postProp,depth){ @@ -121,59 +162,69 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { } an.pvs<-sapply(1:n.snps, function(i) analytic.pv(preProps[i],preVars[i],postProps[i],depths[i])) + + cat("Adding Statistics to Matrix\n"); flush.console() post$prechipfreq <- preProps post$pvalue <- an.pvs + post$zvalue <- Zs + post$prevar <- preVars + post$postvar <- postVars post$SNPpostfreq <- 1 - post$POSTfreq post$SNPprefreq <- 1 - post$prechipfreq - combined = cbind(post, genotypes) - - # Split data based on significance - - sig = subset(combined, pvalue <= P) - nonsig = subset(combined, pvalue > P) - ############################################## # LD Calculation for Significance Estimate # ############################################## - r2.sig = vector("numeric", length=nrow(sig)) - holdnum = nrow(sig)-1 - for (i in 1:holdnum){ - if (sig[i,2] - sig[i+1,2] < 200000) { - r2.sig[i] <- cor(t(sig[i,20:79]), t(sig[i+1,20:79]), method="pearson") - } - else{ r2.sig[i] <- 0} - } - sig$r2 <- (r2.sig)^2 - ldfileOUTsig = sig[, c("Chr", "position", "r2")] - ldfileOUTsigFinal = ldfileOUTsig[complete.cases(ldfileOUTsig),] + cat("LD Calculation\n"); flush.console() + combined = cbind(post, genotypes) + + sig = subset(combined, pvalue <= P) totaloutsig = paste(tmppfx, ".sigLD", sep="") - write.table(ldfileOUTsigFinal, file=totaloutsig, sep="\t", quote=FALSE, row.names=FALSE, col.names=T) + ldCalculation(sig, totaloutsig) + rm(sig) ######################################## # LD Calculation for Non-Significant # ######################################## - r2.nonsig = vector("numeric",length=nrow(nonsig)) - holdnum = nrow(nonsig)-1 - for (i in 1:holdnum){ - if (nonsig[i,2] - nonsig[i+1,2] < 200000) { - r2.nonsig[i] <- cor(t(nonsig[i,20:79]), t(nonsig[i+1,20:79]), method="pearson") - } - else{ r2.nonsig[i] <- 0} - } - nonsig$r2 <- (r2.nonsig)^2 - ldfileOUTnonsig = nonsig[, c("Chr", "position", "r2")] - ldfileOUTnonsigFinal = ldfileOUTnonsig[complete.cases(ldfileOUTnonsig),] + nonsig = subset(combined, pvalue > P) totaloutnonsig = paste(tmppfx, ".nonsigLD", sep="") - write.table(ldfileOUTnonsigFinal, file= totaloutnonsig, sep="\t", quote=FALSE, row.names=FALSE, col.names=T) + ldCalculation(nonsig, totaloutnonsig) + rm(nonsig) + rm(combined) + + ################### + # Write Outputs # + ################### + + cat("Filtering pre-freq between 0 and 1 only\n"); flush.console() + ## remove SNPs with pre = 0 or 1 + # print(dim(post)) + temp = subset(post, prechipfreq>0 & prechipfreq<1) + ## must be sorted by pvalue + x = temp[with(temp, order(pvalue)), ] + # print(dim(x)) + xs = x + options("scipen"=100) + xs$start <- xs$position - 1 + + cat("Writing outputs\n"); flush.console() + bed = xs[,c("Chr", "start", "position")] ## reorder by column numbers + sortedBed = bed[with(bed,order(Chr, start, position)),] + finaloutFile = paste(tmppfx, ".total.txt", sep="") + finalcoeffs = paste(tmppfx, ".coefficients.txt", sep="") + finalbed = paste(tmppfx, ".final.bed", sep="") + write.table(x, file=finaloutFile, sep="\t", quote=F, row.names=F, col.names=T) + write.table(sortedBed, file=finalbed, sep="\t", quote=F, row.names=F, col.names=F) + write.table(all.coeffs, file=finalcoeffs, sep="\t", quote=F, row.names=F, col.names=T) ################### # Summary Plots # ################### - png(paste(tmppfx, ".summaryPlots.png", sep=""), width=1200, height=1200, pointsize=40) + cat("Making summary plots\n"); flush.console() + png(paste(tmppfx, ".summaryPlots.png", sep=""), width=fig.width, height=fig.height, 
pointsize=fig.font) par(mfrow=c(2,2)) plot(table(round(p.values,d=2))) qqplot(p.values,runif(n.snps), )#,xlim=c(0,0.1),ylim=c(0,0.1)) @@ -183,31 +234,28 @@ linearRegression <- function(pd, gt, tmppfx, indiv) { abline(0,1) dev.off() - png(paste(tmppfx, ".qqplot.png", sep=""), width=1200, height=1200, pointsize=40) + png(paste(tmppfx, ".qqplot.png", sep=""), width=fig.width, height=fig.height, pointsize=fig.font) + qqplot(runif(n.snps), an.pvs, xlab="Expected", ylab="Observed")#,xlim=c(0,0.1),ylim=c(0,0.1)) + abline(0,1) + dev.off() + png(paste(tmppfx, ".qqplot.log.png", sep=""), width=fig.width, height=fig.height, pointsize=fig.font) qqplot(runif(n.snps), an.pvs, xlab="Expected", ylab="Observed")#,xlim=c(0,0.1),ylim=c(0,0.1)) abline(0,1) dev.off() - ################### - # Write Outputs # - ################### + cat("Done!\n"); flush.console() - ## remove SNPs with pre = 0 or 1 - print(dim(post)) - temp = subset(post, prechipfreq>0 & prechipfreq<1) - ## must be sorted by pvalue - x = temp[with(temp, order(pvalue)), ] - print(dim(x)) - xs = x - options("scipen"=100) - xs$start <- xs$position - 1 - bed = xs[,c("Chr", "start", "position")] ## reorder by column numbers - sortedBed = bed[with(bed,order(Chr, start, position)),] - finaloutFile = paste(tmppfx, ".total.txt", sep="") - finalbed = paste(tmppfx, ".bed", sep="") - write.table(x, file= finaloutFile, sep="\t", quote=FALSE, row.names=FALSE, col.names=T) - write.table(sortedBed, file=finalbed, sep="\t", quote=FALSE, row.names=FALSE, col.names=F) } -linearRegression("postdataIN", "genosoutName", "prefix", "numIndiv") +args <- commandArgs(trailingOnly = TRUE) + +if (length(args) != 4) { + stop("Rscript regression_qtls.R .", call.=FALSE) +} +post = args[1] +genos = args[2] +prefix = args[3] +indiv = as.numeric(args[4]) + +linearRegression(post, genos, prefix, indiv) diff --git a/scripts/combine.py b/scripts/combine.py new file mode 100755 index 0000000..969f547 --- /dev/null +++ b/scripts/combine.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Combine a bunch of cisVar pandas files by sample (e.g. population). + +Requires a search string such as prefix.{sample}.regression.pd. + +Writes +------ +prefix.combined.regression.pd + A simple merger of all DataFrames +prefix.merged.regression.pd + A per-snp merger based on p-value +""" +import os as _os +import re as _re +import sys as _sys +import argparse as _argparse +from glob import glob as _glob + +import numpy as np +import pandas as pd +from scipy import stats as sts + + +def find_dfs(search_str): + """Use search_str to find files to merge. + + Params + ------ + search_str : str + String in format 'prefix.{sample}.suffix', {sample} required + + Returns + ------- + dfs : dict + Dictionary of {name: path} for all dataframes + """ + query = '{sample}' + if query not in search_str: + raise ValueError('Search string must contain {sample}') + ss = search_str.replace(query, '*') + q = _re.compile(search_str.replace(query, '(.*)')) + c = _re.compile(search_str.replace(query, 'combined')) + return { + q.findall(i)[0]: i for i in _glob(_os.path.expandvars(ss)) + if not c.search(i) + } + + +def combine_dfs(dfs, column_name='population'): + """Combine pandas dfs. 
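+
+    A small illustrative input (sample names follow cisvar_samples.json,
+    paths are hypothetical):
+
+        dfs = {'YRI': 'cis_var.YRI.20.regression.pd',
+               'CEU': 'cis_var.CEU.20.regression.pd'}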
+ + Params + ------ + dfs : dict + {name: path} + column_name : str, optional + A name for the column to merge on + + Returns + ------- + DataFrame + """ + cdfs = [] + for name, path in dfs.items(): + print('Working on {}'.format(path)) + df = pd.read_pickle(path) + df[column_name] = name + df['snp'] = df.index.to_series() + df['idx'] = df[column_name].astype(str) + '.' + df['snp'].astype(str) + cdfs.append(df) + print('Combining') + return pd.concat(cdfs) + + +def combine_and_write(search_str, column_name='population'): + """Combine pandas dfs by search string. + + Params + ------ + search_str : str + String in format 'prefix.{sample}.suffix', {sample} required + column_name : str, optional + A name for the column to merge on + + Writes + ------ + DataFrame + Combined DataFrame at search_string.replace('{sample}', 'combined') + + Returns + ------- + DataFrame + """ + comb = combine_dfs(find_dfs(search_str), column_name=column_name) + ofl = search_str.replace('{sample}', 'combined') + print('Writing to {}'.format(ofl)) + comb.to_pickle(ofl) + return comb + + +def merge_dataframe(df, column_name='population'): + """Merge the result of combine_dfs into per-SNP dataframe.""" + df['snp'] = df.chrom.astype(str) + '.' + df.position.astype(str) + print('Getting open and closed combined data, this might take a while.') + # Takes about 20 minutes + open_closed = get_open_closed_counts(df) + keep_cols = [ + 'snp', 'chrom', 'position', 'rsid', 'pre_freq', 'post_freq', + 'pre_variance', 'post_variance', 'beta', 'p_value', 'z_value', 'ref', + 'alt', 'depth', 'ref_depth', 'alt_depth', 'snp_postfreq', 'snp_prefreq', + 'population' + ] + comb_functions = { + 'chrom': first, 'position': first, 'rsid': first, + 'ref': first, 'alt': first, + 'pre_freq': np.median, 'post_freq': np.median, + 'pre_variance': np.median, 'post_variance': np.median, + 'beta': np.median, 'z_value': np.median, 'depth': np.median, + 'ref_depth': np.median, 'alt_depth': np.median, + 'snp_postfreq': np.median, 'snp_prefreq': np.median, + column_name: ','.join, 'p_value': fishers + } + print('Collapsing dataframe, this also takes a while.') + # Takes about 10 minutes + df = df[keep_cols].groupby('snp').agg(comb_functions) + # Split the fishers statistics + df['fishers_value'], df['fishers_p'] = df.p_value.str + print('Merging in open/closed data') + df = pd.merge(df, open_closed, left_index=True, right_index=True) + print('Organizing') + final_cols = [ + 'chrom', 'position', 'rsid', 'fishers_p', 'ref', 'alt', 'open_best', + 'closed_best', 'pop_count', column_name, 'pops_agree', + 'most_pops_agree', 'all_open_closed', 'open_closed_freqs', + 'fishers_value', 'pre_freq', 'post_freq', 'pre_variance', + 'post_variance', 'beta', 'z_value', 'depth', 'ref_depth', 'alt_depth', + 'snp_postfreq', 'snp_prefreq' + ] + final_names = [ + 'chrom', 'position', 'rsid', 'fishers_p', 'ref', 'alt', 'open_best', + 'closed_best', column_name + '_count', column_name + 's', column_name + + 's_agree', 'most_{}s_agree'.format(column_name), 'all_open_closed', + 'open_closed_freqs', 'fishers_value', 'pre_freq_median', + 'post_freq_median', 'pre_variance_median', 'post_variance_median', + 'beta_median', 'z_value_median', 'depth_median', 'ref_depth_median', + 'alt_depth_median', 'snp_postfreq_median', 'snp_prefreq_median' + ] + df = df[final_cols] + df.columns = final_names + print('Done') + return df + + +def merge_and_write(df, ofl, column_name='population'): + """Merge the dataframe and write.""" + df = merge_dataframe(df, column_name=column_name) + 
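+    # Note: merge_dataframe() collapses per-population p-values with
+    # Fisher's method (scipy.stats.combine_pvalues, method='fisher'),
+    # i.e. X^2 = -2 * sum(ln p_i) compared to a chi-square with 2k
+    # degrees of freedom for k populations; the statistic and combined
+    # p-value land in the 'fishers_value' and 'fishers_p' columns.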
df.to_pickle(ofl) + return df + + +########################################################################### +# Combination Functions # +########################################################################### + +def first(x): + """Return the first member of a list.""" + try: + x = x.to_series() + except AttributeError: + pass + return list(x)[0] + + +def fishers(x): + """Return fishers, fishers_p for pvalues.""" + return sts.combine_pvalues(x.astype(np.float), method='fisher') + + + +def summarize(x): + """Create a set of open, closed allele pairs. + + Args: + DataFrame: Should be provided by apply, expects only one SNP + + Returns: + tuple: tuple, int, tuple: open_closed_alleles, total populations, open_close_frequencies + """ + pop_count = len(x.open_allele) + # Get the counts for every open, closed combination + oc = pd.Series([(i, j) for i, j in zip(x.open_allele.tolist(), x.closed_allele.tolist())]).value_counts() + total = oc.sum() + ocs = [] + freqs = [] + for oc, cnt in zip(oc.index.to_series().tolist(), oc.tolist()): + ocs.append(oc) + freqs.append(round(cnt/total, 3)) + return tuple(ocs), total, tuple(freqs) + + +def pops_agree(x): + """True only if there is only one open-closed pair in the row, use with apply, expects a row of a DF.""" + return len(x.all_open_closed) == 1 + + +def pops_agree_50(x): + """True if more than 50% of populations agree on open-closed pair, False if 50% or less agree.""" + return x.open_closed_freqs[0] > .5 + + +def get_open_closed_counts(x): + """Take a dataframe (from apply), tally the open/closed SNPs, and return some stats. + + Returns: + DataFrame: open and closed _best represent the most common open/closed alleles, pop count + is all populations with this SNP, all_open_closed is a tuple of tuples of all open-closed pairs, + open_closed_freqs tells you how many populations had each of the open-closed pairs. Other + two columns do what they say on the tin. + """ + open_closed_data = pd.concat(x[['snp', 'open_allele', 'closed_allele']].groupby('snp').apply(summarize).str, axis=1) + open_closed_data.columns = ['all_open_closed', 'pop_count', 'open_closed_freqs'] + open_closed_data.index.name = None + # Explicitly set the best alleles, assuming sorted by frequency (which is done by summarize()) + open_closed_data['open_best'], open_closed_data['closed_best'] = open_closed_data['all_open_closed'].str[0].str + open_closed_data['pops_agree'] = open_closed_data.apply(pops_agree, axis=1) + open_closed_data['most_pops_agree'] = open_closed_data.apply(pops_agree_50, axis=1) + # Set best alleles to NA if the top allele is not better than next best allele + top_freq = open_closed_data.open_closed_freqs.str[0] + nxt_freq = np.where(open_closed_data.pops_agree, 0, open_closed_data.open_closed_freqs.str[1]) + for i in ['open_best', 'closed_best']: + open_closed_data[i] = np.where(top_freq > nxt_freq, open_closed_data[i], 'NA') + # Return organized DF + return open_closed_data[['open_best', 'closed_best', 'pop_count', 'all_open_closed', 'pops_agree', 'most_pops_agree', 'open_closed_freqs']] + + +def main(argv=None): + """Run as a script.""" + if not argv: + argv = _sys.argv[1:] + + parser = _argparse.ArgumentParser( + description=__doc__, + formatter_class=_argparse.RawDescriptionHelpFormatter + ) + + # Positional arguments + parser.add_argument( + 'search_str', + help='e.g. name.{sample}.regression.pd, used to find files' + ) + parser.add_argument( + '-c', '--column-name', default='population', + help='Name for combine column, e.g. 
population' + ) + parser.add_argument( + '--no-merge', action='store_false', dest='merge', + help='Produce only a combined dataframe, not a merged dataframe' + + '. merging can add half an hour over combination, which takes seconds' + ) + + args = parser.parse_args(argv) + + comb = combine_and_write(args.search_str, column_name=args.column_name) + if len(comb) < 10: + return 1 + if args.merge: + ofl = args.search_str.replace('{sample}', 'merged') + merged = merge_and_write(comb, ofl, column_name=args.column_name) + return 0 + + +if __name__ == '__main__' and '__file__' in globals(): + _sys.exit(main()) diff --git a/scripts/convert_vcf2bed.sh b/scripts/convert_vcf2bed.sh deleted file mode 100755 index 7ce99f9..0000000 --- a/scripts/convert_vcf2bed.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -# Convert vcf to bed, only including SNVs - -module load bedops -in=$1 -out=$(echo "${1}" | sed 's/vcf.gz/bed.gz/') -# Convert to bed and drop indels -pigz -dc ${in} | vcf2bed --snvs | pigz > ${out} diff --git a/scripts/make_alleles.py b/scripts/make_alleles.py deleted file mode 100755 index 7ed56a2..0000000 --- a/scripts/make_alleles.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Convert a bed file into an alleles file. -""" -import sys as _sys -import bz2 as _bz2 -import gzip as _gzip -import argparse as _argparse - - -def bed_to_alleles(mpileup_file, bed_file, allele_file): - """Convert a bed file into an alleles file. - - Arguments should be writable files or file names - """ - all_loci = {} - with open_zipped(bed_file) as fin: - for line in fin: - fields = line.strip().split('\t') - if fields[0] not in all_loci: - all_loci[fields[0]] = {} - all_loci[fields[0]][fields[2]] = (fields[5], fields[6]) - - with open_zipped(mpileup_file) as fin, open_zipped(allele_file, 'w') as fout: - for line in fin: - fields = line.strip().split('\t') - fout.write( - '{}\t{}\t{}\n'.format( - fields[0], - fields[1], - '\t'.join( - all_loci[fields[0]][fields[1]] - ) - ) - ) - return 0 - - -def open_zipped(infile, mode='r'): - """Return file handle of file regardless of compressed or not. - - Also returns already opened files unchanged, text mode automatic for - compatibility with python2. 
- """ - # Return already open files - if hasattr(infile, 'write'): - return infile - # Make text mode automatic - if len(mode) == 1: - mode = mode + 't' - if not isinstance(infile, str): - raise ValueError("I cannot open a filename that isn't a string.") - if infile.endswith('.gz'): - return _gzip.open(infile, mode) - if infile.endswith('.bz2'): - if hasattr(_bz2, 'open'): - return _bz2.open(infile, mode) - else: - return _bz2.BZ2File(infile, mode) - return open(infile, mode) - - -def main(argv=None): - """Run as a script.""" - if not argv: - argv = _sys.argv[1:] - - parser = _argparse.ArgumentParser( - description=__doc__, - formatter_class=_argparse.RawDescriptionHelpFormatter - ) - - # Optional arguments - parser.add_argument('mpileup', - help='The M-pileup file to match') - - # Optional files - parser.add_argument('bedfile', nargs='?', - help="Input file (Default: STDIN)") - parser.add_argument('allele_file', nargs='?', - help="Output file (Default: STDOUT)") - - args = parser.parse_args(argv) - - ifl = args.bedfile if args.bedfile else _sys.stdin - ofl = args.allele_file if args.allele_file else _sys.stdout - - return bed_to_alleles(args.mpileup, ifl, ofl) - -if __name__ == '__main__' and '__file__' in globals(): - _sys.exit(main()) diff --git a/scripts/plot_fun.R b/scripts/plot_fun.R new file mode 100644 index 0000000..c1b613f --- /dev/null +++ b/scripts/plot_fun.R @@ -0,0 +1,34 @@ +#!/usr/bin/env Rscript +# input is *.total.txt file from the regression + +require(gplots) +options("scipen"=100) + +plotreg <- function(fl, verbose=FALSE) { + print(fl) + x=read.table(fl, head=T) + #sub = subset(x, Chr=="chr7" & position=="6065804") + ## p-value = 1.76362e-15 + corr = round(cor(x$prechipfreq, x$POSTfreq, method="pearson"), 2) + + y <- densCols(x$prechipfreq, x$POSTfreq, colramp=colorRampPalette(c("black", "white"))) + x$dens <- col2rgb(y)[1,] + 1L + + ## Map densities to colors + cols <- colorRampPalette(c("#054A91", "#00FEFF", "#45FE4F","#FCFF00", "#FF9400", "#FF3100"))(256) + x$col <- cols[x$dens] + + ## Plot it, reordering rows so that densest points are plotted on top + png(paste(fl, ".prepost.png", sep=""), width=1200, height=1200, pointsize=40) + plot(x$POSTfreq~x$prechipfreq, data=x[order(x$dens),], pch=20, col=col, cex=0.5, xlim=c(0,1), ylim=c(0,1),las=1, cex.axis=1.5, main=paste(fl, corr, sep="\t"), xlab="Pre-ATAC frequency", ylab="Post-ATAC frequency") + #par(new=T) + #plot(sub$prechipfreq, sub$POSTfreq, xlim=c(0,1), ylim=c(0,1), xaxt='n', yaxt='n' ,xlab="", ylab="", col="red", pch=16, cex=1.5) + dev.off() +} + +args <- commandArgs(trailingOnly = TRUE) + +for (i in args) { + plotreg(i) +} + diff --git a/scripts/post_process_regression.py b/scripts/post_process_regression.py deleted file mode 100755 index 4dd8443..0000000 --- a/scripts/post_process_regression.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Take a completed regression from cisVar.py and create a dataframe, call -open/closed alleles, sort, and tidy. 
-""" -import sys as _sys -import bz2 as _bz2 -import gzip as _gzip -import argparse as _argparse -import logging as _log - -import numpy as np -import pandas as pd - -# Formatted logging -_log_formatter = _log.Formatter( - "%(asctime)s [%(name)s] %(levelname)s – %(message)s" -) -_log_formatter_simp = _log.Formatter( - "%(asctime)s %(levelname)s – %(message)s" -) -log = _log.getLogger('process_regression') -_log_out = _log.StreamHandler() -_log_out.setFormatter(_log_formatter_simp) -_log_out.setLevel(_log.INFO) -log.addHandler(_log_out) -log.setLevel(_log.INFO) - -############### -# Constants # -############### - -orig_headers = [ - 'Chr', 'position', 'REFallele', 'Depth', 'Acount', 'Ccount', 'Gcount', - 'Tcount', 'ALTdepth', 'REFDepth', 'ALTallele', 'REFfreq', 'ALTfreq', - 'POSTallele', 'POSTfreq', 'prechipfreq', 'pvalue', 'SNPpostfreq', - 'SNPprefreq' -] -new_headers = [ - 'chrom', 'position', 'ref', 'depth', 'a_count', 'c_count', 'g_count', - 't_count', 'alt_depth', 'ref_depth', 'alt', 'ref_freq', 'alt_freq', - 'post_allele', 'post_freq', 'pre_freq', 'p_value', 'snp_postfreq', - 'snp_prefreq' -] -final_headers = [ - 'chrom', 'position', 'rsid', 'open_allele', 'closed_allele', - 'pre_freq', 'post_freq', 'beta', 'p_value', 'ref', 'alt', 'depth', - 'ref_depth', 'alt_depth', 'ref_freq', 'alt_freq', 'snp_postfreq', - 'snp_prefreq', 'a_count', 'c_count', 'g_count', 't_count' -] -float_dtypes = { - 'position': 'object', - 'REFfreq': 'float128', - 'ALTfreq': 'float128', - 'POSTfreq': 'float128', - 'prechipfreq': 'float128', - 'pvalue': 'float128', - 'SNPpostfreq': 'float128', - 'SNPprefreq': 'float128', -} - - -#################### -# Core Functions # -#################### - - -def parse_regression(regfile, textfile=None, pandasfile=None, bed=None): - """Parse a regression file into a pandas datafram. - - Optionally output either pickled panda or textfile. - - For all files, gzipped files or open file handle are permissable. - - Parameters - ---------- - regfile : file name - Tab delimited output from regression, header strictly enforced - textfile : file name, optional - Path to write tab delimited formatted output - pandasfile : file name, optional - Path to write pickled pandas dataframe - bed : file name, optional - Path to a bed file that contains rsids - - Returns - ------- - pandas.DataFrame - Parsed DataFrame - """ - log.info('Loading regression file') - df = pd.read_csv(regfile, sep='\t', low_memory=False, dtype=float_dtypes, - float_precision=128) - assert df.columns.tolist() == orig_headers - log.info('%s regression records', len(df)) - log.debug('Changing headers') - df.columns = new_headers - # Set the index to chr.position - log.debug('Indexing') - df['idx'] = df.chrom.astype(str) + '.' 
+ df.position.astype(str) - df = df.set_index('idx', drop=True) - - # Add rsID if possible - if bed: - log.info('Loading rsids') - df['rsid'] = bed_to_series(bed) - else: - log.warn('No bed file, all rsids will be "Unknown"') - df['rsid'] = 'Unknown' - - # Sort the data by position (default is by best pvalue) - log.info('Sorting') - df = df.sort_values(['chrom', 'position']) - - # Force datatypes and uppercase - df['position'] = df.position.astype(int) - for i in ['ref', 'alt', 'post_allele']: - df[i] = df[i].str.upper() - # Add the beta - df['beta'] = df.post_freq - df.pre_freq - - # Add open and closed - log.info('Calculating open/closed') - df['open_allele'] = np.where(df.post_freq > df.pre_freq, df.post_allele, df.alt) - df['closed_allele'] = np.where(df.post_freq > df.pre_freq, df.alt, df.post_allele) - df['open_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.open_allele) - df['closed_allele'] = np.where(df.post_freq == df.pre_freq, 'NA', df.closed_allele) - - # Sort columns - log.debug('Adjusting columns') - df = df[final_headers] - - # Write outputs - if pandasfile: - log.info('Writing pickled panda to %s', pandasfile) - df.to_pickle(pandasfile) - if textfile: - log.debug('Writing parsed dataframe to %s', textfile) - df.to_csv(textfile, sep='\t') - - return df - - -def bed_to_series(bedfile): - """Convert a bed file to a series of chr.pos->name.""" - args = dict(sep='\t', usecols=[0, 2, 3]) - log.debug('Checking for bed head') - with _open_zipped(bedfile) as fin: - if not fin.readline().startswith('#'): - args['header'] = None - log.debug('Reading in bed file') - bed = pd.read_csv(bedfile, **args) - bed.columns = ['chrom', 'pos', 'name'] - log.debug('Indexing bed') - bed['idx'] = bed.chrom.astype(str) + '.' + bed.pos.astype(str) - bed = bed.drop_duplicates('idx', keep=False) - bed = bed.set_index('idx', drop=True) - log.debug('Returning rsid series') - log.debug('Parsed %s SNPs', len(bed)) - return bed.name - - -####################### -# Support Functions # -####################### - - -def _open_zipped(infile, mode='r'): - """Return file handle of file regardless of compressed or not. - - Also returns already opened files unchanged, text mode automatic for - compatibility with python2. 
- """ - # Return already open files - if hasattr(infile, 'write'): - return infile - # Make text mode automatic - if len(mode) == 1: - mode = mode + 't' - if not isinstance(infile, str): - raise ValueError("I cannot open a filename that isn't a string.") - if infile.endswith('.gz'): - return _gzip.open(infile, mode) - if infile.endswith('.bz2'): - if hasattr(_bz2, 'open'): - return _bz2.open(infile, mode) - else: - return _bz2.BZ2File(infile, mode) - return open(infile, mode) - - -def main(argv=None): - """Run as a script.""" - if not argv: - argv = _sys.argv[1:] - - parser = _argparse.ArgumentParser( - description=__doc__, - formatter_class=_argparse.RawDescriptionHelpFormatter - ) - - inputs = parser.add_argument_group('inputs') - outputs = parser.add_argument_group('outputs') - inputs.add_argument('regfile', - help="Regression matrix (Default: STDIN)") - inputs.add_argument('-b', '--bedfile', - help="BED file to extract rsIDs from") - outputs.add_argument('-t', '--textfile', - help="Parsed output (Default: STDOUT)") - outputs.add_argument('-p', '--pandasfile', - help="Parsed dataframe") - outputs.add_argument('-l', '--logfile', - help="Log file (Default: STDERR)") - - parser.add_argument('-v', '--verbose', action='store_true', - help="Verbose logging") - parser.add_argument('-n', '--notext', action='store_true', - help="Suppress text output, output only dataframe") - - args = parser.parse_args(argv) - - ifl = args.regfile if args.regfile else _sys.stdin - ofl = args.textfile if args.textfile else _sys.stdout - - # Adjust logging - if args.verbose: - log.setLevel(_log.DEBUG) - loghandle = None - if args.logfile: - # Setup file - loghandle = _open_zipped(args.logfile, 'a') - handler = _log.StreamHandler(loghandle) - handler.setFormatter(_log_formatter) - if args.verbose: - handler.setLevel(_log.DEBUG) - log.addHandler(handler) - log.info('%s Starting Run', __file__) - elif args.verbose: - # Set STDERR logger to debug if no logfile - log.handlers = [] - _log_out.setLevel(_log.DEBUG) - log.addHandler(_log_out) - - if args.notext: - if not args.pandasfile: - _sys.stderr.write('Pandas file required in notext mode.\n') - _sys.stderr.write(parser.format_help() + '\n') - return 1 - parse_regression(ifl, pandasfile=args.pandasfile) - else: - parse_regression(ifl, textfile=ofl, pandasfile=args.pandasfile, - bed=args.bedfile) - - if loghandle: - loghandle.close() - -if __name__ == '__main__' and '__file__' in globals(): - _sys.exit(main()) diff --git a/scripts/rmdup.py b/scripts/rmdup.py index e09a437..affabf4 100644 --- a/scripts/rmdup.py +++ b/scripts/rmdup.py @@ -1,37 +1,44 @@ -#!/usr/bin/python3 +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +script removes duplicates and filters out unmapped reads -################################################################################## -# ADAPTED FROM EILON SHARON, https://github.com/eilon-s/bioinfo_scripts/rmdup.py -# script removes duplicates and filters out unmapped reads -################################################################################## +Written by Eilon Sharon: + `https://github.com/eilon-s/bioinfo_scripts/blob/master/rmdup.py`_ +""" -import sys, os, subprocess +import os +import sys +import random import argparse import pysam -import gzip -import array -import random - import psutil + class SlimRead: - def __init__(self, read, line_num): - self.line_num = line_num - self.tid = read.tid - self.reference_start = read.reference_start - self.reference_end = read.reference_end + + """ + Simple class to hold read information. 
+ """ + + def __init__(self, read, line_num): + self.line_num = line_num + self.tid = read.tid + self.reference_start = read.reference_start + self.reference_end = read.reference_end + def memory_usage_psutil(): - # return the memory usage in MB - + """Return the memory usage in MB.""" process = psutil.Process(os.getpid()) - mem = process.get_memory_info()[0] / float(2 ** 20) + mem = process.memory_info()[0]/float(2 ** 20) return mem - -#@profile -def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): - + + +def rmdup_pe(in_bam_file_name, out_bam_file_name): + """Paired end duplicate removal.""" + sys.stderr.write("Running in paired-ends mode\n") in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb") @@ -61,45 +68,48 @@ def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): if len(reads_dict[cur_read_key]) == 1: # saving the line of the read for memory efficiency reads_dict[cur_read_key].append(SlimRead(read,r)) - + # found a pair -> add id to the uniq set cur_first_read = reads_dict[cur_read_key][0] cur_second_read = reads_dict[cur_read_key][1] - + #print("found a pair\n%s\n%s\n" % (cur_first_read, cur_second_read) ) #sys.stderr.write("found a pair: %s" % read.qname) - + if (cur_first_read.tid != cur_first_read.tid): - sys.stderr.write("Removing pairs that are mapped to two different reference sequences (chromosomes)") + sys.stderr.write( + "Removing pairs that are mapped to two " + "different reference sequences (chromosomes)" + ) continue # converting reference id to chr for the output to be sorted cur_pair_key_str = (str(chr(65+cur_first_read.tid)) + "_" + str(chr(65+cur_second_read.tid)) + "_" + str(min(cur_first_read.reference_start,cur_first_read.reference_end, cur_second_read.reference_start,cur_second_read.reference_end)) + "_" + str(max(cur_first_read.reference_start,cur_first_read.reference_end, cur_second_read.reference_start,cur_second_read.reference_end))) - + cur_pair_key = cur_pair_key_str.__hash__() - + #sys.stderr.write("%s\n" % cur_pair_key) - + # replacing the dictionary entry by just the line numbers reads_dict[cur_read_key] = [cur_first_read.line_num, cur_second_read.line_num] - + #sys.stderr.write("Read pair position key: %s" % cur_pair_key) - + #if (cur_pair_key == "0_0_1472172_1472342"): # print("found a pair\n%s\n%s\n" % (cur_first_read, cur_second_read) ) - + # adding to the dictionary of unique pairs if cur_pair_key in uniq_reads_corr_dict: #sys.stderr.write("-----in dict: %s, %s" % (cur_pair_key,read.qname) ) uniq_reads_corr_dict[cur_pair_key].append(cur_read_key) - + else: #sys.stderr.write("not in dict: %s, %s" % (cur_pair_key,read.qname)) uniq_reads_corr_dict[cur_pair_key] = [cur_read_key] - - # + + # #uniq_reads_corr_dict[] else: # more than 2 rreads with the same id? error! 
raise Exception("More than two reads with the same id: %s\n" % read) @@ -107,21 +117,22 @@ def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): #sys.stderr.write("Adding first read: %s\n" % read.qname) # saving the line of the read for memory efficiency reads_dict[cur_read_key] = [SlimRead(read,r)] - + else: continue else: sys.stderr.write("Found a single end read: %s\n" % read) - #print read - #print("YYYYYYYYYYYYYYYYY") - - + in_bamfile.close() - - - # implemented for pair ends - sys.stderr.write("Found %d uniq reads pairs (%d reads), memory: %dMB\n" % (len(uniq_reads_corr_dict),len(uniq_reads_corr_dict)*2, memory_usage_psutil())) + + # implemented for pair ends + sys.stderr.write( + "Found {} uniq reads pairs ({} reads), memory: {}MB\n".format( + len(uniq_reads_corr_dict), len(uniq_reads_corr_dict)*2, + memory_usage_psutil() + ) + ) sys.stderr.write("Looking for duplicates...\n") max_num_of_dupliactes = 0 @@ -129,7 +140,7 @@ def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): out_reads_line_numbers = [] - for uniq_pos,pairs_ids in sorted(uniq_reads_corr_dict.items()): + for uniq_pos, pairs_ids in sorted(uniq_reads_corr_dict.items()): cur_num_pairs = len(pairs_ids) if (cur_num_pairs>1): cnt_dup_pos += 1 @@ -150,15 +161,19 @@ def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): # sorting the reads out_reads_line_numbers.sort() - sys.stderr.write("Found %d duplicated segments in %d uniq segments (%f), max #duplictaes: %d" % (cnt_dup_pos,len(uniq_reads_corr_dict),(1.0*cnt_dup_pos)/len(uniq_reads_corr_dict),max_num_of_dupliactes) ) + sys.stderr.write( + "Found {} duplicated segments in {} uniq segments ({:.2f}), max #duplictaes: {}" + .format(cnt_dup_pos, len(uniq_reads_corr_dict), + (1.0*cnt_dup_pos)/len(uniq_reads_corr_dict), + max_num_of_dupliactes) + ) # printing the output bam - in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb") + in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb") out_bamfile = pysam.AlignmentFile(out_bam_file_name, "wb", template=in_bamfile) sys.stderr.write("Writing output bam file...\n") - out_ind=0 r=0 for read in in_bamfile: @@ -171,11 +186,20 @@ def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality): if (out_ind >= len(out_reads_line_numbers)): break else: - sys.stderr.write("Error bug in printing the out file %d,%d,%d\n" % (r,out_ind,out_reads_line_numbers[out_ind])) - exit() + sys.stderr.write( + "Error bug in printing the out file {},{},{}\n".format( + r, out_ind, out_reads_line_numbers[out_ind] + ) + ) + sys.exit(1) if (r % 1000000 == 0): - sys.stderr.write("writing. going over read %d , printed %d reads memroy: %dMB\n" % (r,out_ind,memory_usage_psutil())) + sys.stderr.write( + "writing. 
going over read {}, printed {} reads memory: {}MB\n" + .format( + r, out_ind, memory_usage_psutil() + ) + ) out_bamfile.close() in_bamfile.close() @@ -183,10 +207,12 @@ # writing a single random pair from each unique position sys.stderr.write("Finish writing to output file: %s" % (out_bam_file_name) ) -def rmdup_se(in_bam_file_name,out_bam_file_name,do_rand_quality): + +def rmdup_se(in_bam_file_name, out_bam_file_name): + """Single end duplicate removal.""" sys.stderr.write("Running in single end mode\n") - in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb") + in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb") out_bamfile = pysam.AlignmentFile(out_bam_file_name, "wb", template=in_bamfile) # build hash of read pairs and uniq position reads pairs @@ -204,7 +230,6 @@ # converting reference id to chr for the ourput to be sorted cur_pair_key = str(chr(65+read.tid)) + "_" + str(read.tid) + "_" + str(min(read.reference_start,read.reference_end)) + "_" + str(max(read.reference_start,read.reference_end)) - # adding to the dictionary of unique pairs if cur_pair_key in uniq_reads_corr_dict: #sys.stderr.write("-----in dict: %s, %s" % (cur_pair_key,read.qname) ) @@ -232,40 +257,52 @@ out_bamfile.write(cur_read) # writing a single random pair from each unique position - sys.stderr.write("Finished! found %d duplicated segments in %d uniq segments (%f), max #duplictaes: %d" % (cnt_dup_pos,len(uniq_reads_corr_dict),(1.0*cnt_dup_pos)/len(uniq_reads_corr_dict),max_num_of_dupliactes) ) + sys.stderr.write( + "Finished! Found {} duplicated segments in {} unique segments ({:.2f}), max #duplicates: {}" + .format( + cnt_dup_pos, len(uniq_reads_corr_dict), + (1.0*cnt_dup_pos)/len(uniq_reads_corr_dict), + max_num_of_dupliactes + ) + ) out_bamfile.close() in_bamfile.close() -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("-r", action='store_true', dest='do_rand_quality', default=False) - parser.add_argument("-p", action='store_true', dest='is_paired_ends', default=False) - parser.add_argument("in_bam_file", action='store') - parser.add_argument("out_bam_file", action='store') +def main(): + """Run as a script.""" + parser = argparse.ArgumentParser( + description="Identifies duplicated reads and selects one (pair) randomly" + ) + parser.add_argument("-p", action='store_true', dest='is_paired_ends', + help="Treat reads as paired") + parser.add_argument("in_bam_file") + parser.add_argument("out_bam_file") options = parser.parse_args() - in_bam_file_name = options.in_bam_file + in_bam_file_name = options.in_bam_file out_bam_file_name = options.out_bam_file + # If we randomized quality scores so that samtools selected a random pair + # rather than the highest-scoring one, note that this would make the quality + # scores in downstream analyses random. - # if randomize quality scores such that samtools will select a random pair and not the highest score - # notice that this will make the quality scores in downstream analysis random + # sys.stderr.write( + # "Warning: the -r flag is not implemented, the selection is " + # "random - i.e. quality independent\n" + # ) - sys.stderr.write("Warnning: the -r flag is not implemented, the selection is random - i.e.
quality independent\n") - - - if (options.is_paired_ends): - for chrom in - rmdup_pe(in_bam_file_name,out_bam_file_name,options.do_rand_quality) + if options.is_paired_ends: + rmdup_pe( + in_bam_file_name, out_bam_file_name + ) else: - # TODO - implement memory efficient version - rmdup_se(in_bam_file_name,out_bam_file_name,options.do_rand_quality) - #sys.stderr.write('TODO - not implemented\n') - #raise Exception('Not implemented') - sys.stderr.write("\nDone!\n") + rmdup_se( + in_bam_file_name, out_bam_file_name + ) + sys.stderr.write("\nDone!\n") if __name__ == '__main__': main() diff --git a/scripts/vcf_to_indi_and_geno.py b/scripts/vcf_to_indi_and_geno.py deleted file mode 100755 index 20575ed..0000000 --- a/scripts/vcf_to_indi_and_geno.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -""" -Parse VCF files into genotypes and individuals files. - -EX: - chr position ref alt NA18486 NA18489 ... - chr1 10583 g a 0 1 ... - chr1 10611 C G 0 0 ... - chr1 13302 c t 1 0 … - -and: - NA18486 - NA18489 -""" -import sys as _sys -import bz2 as _bz2 -import gzip as _gzip -import argparse as _argparse - -from tqdm import tqdm - - -EXPECTED_HEADER = [ - "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT" -] - - -def parse_file(vcfs, geno, indi, logfile=_sys.stderr): - """File parser.""" - if not isinstance(vcfs, list): - raise ValueError('vcfs must be list') - # Tracking variables - header = None - idone = False # Individual order parsing - _sys.stdout.write('Looping through {} VCF files\n'.format(len(vcfs))) - with _open_zipped(geno, 'w') as gn: - for vcf in tqdm(vcfs, unit='file'): - with _open_zipped(vcf) as fin: - hdone = False # Header parsing - fdone = False # Format string parsing - for line in fin: - if line.startswith('#'): - header = line - continue - if not hdone: - header = header.strip().split('\t') - assert header[:9] == EXPECTED_HEADER - inds = header[9:] - if not idone: - orig_inds = inds - # Write individual file - with _open_zipped(indi, 'w') as ind: - ind.write('\n'.join(inds) + '\n') - # Write header - gn.write('chr\tposition\tref\talt\t{}\n'.format( - '\t'.join(inds) - )) - idone = True - if not inds == orig_inds: - raise ValueError( - ("Individual order is different in {fl}!\n" - "Orig: {orig}\nCurr: {cur}").format( - fl=vcf, orig=orig_inds, cur=inds - ) - ) - hdone = True - f = line.strip().split('\t') - if not fdone: - # Get the genotype info location - try: - gti = f[8].split(':').index('GT') - except ValueError: - raise ValueError( - 'GT column missing from VCF format string. In need ' - 'the genotype info location to know where to look ' - 'for the genotype (looks like 0|1)' - ) - # Write core records - outstr = ( - '{chr}\t{pos}\t{ref}\t{alt}'.format( - chr=f[0], pos=f[1], ref=f[3], alt=f[4] - ) - ) - # Write genotypes - badgt = False - for ind in f[9:]: - gt = ind.split(':')[gti] - if not '|' in gt: - raise ValueError( - 'Expected genotype to be like 0|0, is {}'.format(gt) + - '\n' + line - ) - if gt == '0|0': - gt = '0' - elif gt in ['0|1', '1|0']: - gt = '1' - elif gt == '1|1': - gt = '2' - else: - logfile.write( - 'Invalid genotype {}: {}'.format(gt, line) - ) - gt = 'NA' - badgt = True - break - outstr += ('\t{}'.format(gt)) - if badgt: - continue - gn.write(outstr + '\n') - return - - -def _open_zipped(infile, mode='r'): - """Return file handle of file regardless of compressed or not. - - Also returns already opened files unchanged, text mode automatic for - compatibility with python2. 
- """ - # Return already open files - if hasattr(infile, 'write'): - return infile - # Make text mode automatic - if len(mode) == 1: - mode = mode + 't' - if not isinstance(infile, str): - raise ValueError("I cannot open a filename that isn't a string.") - if infile.endswith('.gz'): - return _gzip.open(infile, mode) - if infile.endswith('.bz2'): - if hasattr(_bz2, 'open'): - return _bz2.open(infile, mode) - else: - return _bz2.BZ2File(infile, mode) - return open(infile, mode) - - -def main(argv=None): - """Run as a script.""" - if not argv: - argv = _sys.argv[1:] - - parser = _argparse.ArgumentParser( - description=__doc__, - formatter_class=_argparse.RawDescriptionHelpFormatter - ) - - parser.add_argument('genotype_file', help="Genotype file") - parser.add_argument('individual_file', help="Genotype file") - parser.add_argument('vcf_files', nargs='+', help="VCF Files") - - parser.add_argument('-l', '--logfile', default=_sys.stderr, - type=_argparse.FileType('a'), - help="Log file (Default: STDERR)") - - args = parser.parse_args(argv) - - parse_file( - args.vcf_files, args.genotype_file, - args.individual_file, args.logfile - ) - - -if __name__ == '__main__' and '__file__' in globals(): - _sys.exit(main()) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9536b3e --- /dev/null +++ b/setup.py @@ -0,0 +1,72 @@ +"""Installation instructions for cisVar.""" +import os +from setuptools import setup + +import cisVar # For version + +VERSION=cisVar.__version__ +GITHUB='https://github.com/TheFraserLab/cisVar' + +REQUIREMENTS = ['pandas', 'numpy', 'scipy', 'psutil', 'snakemake', 'wget'] + + +def read(fname): + """Read the contents of a file in this dir.""" + with open(os.path.join(os.path.dirname(__file__), fname)) as fin: + return fin.read() + + +# Actual setup instructions +setup( + name = 'cisVar', + version = VERSION, + author = 'Ashley Tehranchi', + author_email = 'mike.dacre@gmail.com', + description = ( + "Calculate both pre and post frequencies for ChIP or ATAC style data" + ), + keywords = ( + "ATACseq ChIPseq regression bioinformatics" + ), + long_description = read('README.md'), + license = 'MIT', + + # URLs + url = GITHUB, + download_url='{0}/archive/v{1}.tar.gz'.format(GITHUB, VERSION), + + py_modules=['cisVar'], + + # Entry points and scripts + # Entry points are functions that can use sys.argv (e.g. main()) + # Scripts are independent pieces of code intended to be executed as is + entry_points = { + 'console_scripts': [ + 'cisVar = cisVar:main', + ], + }, + scripts = ['cisVar.py', 'regression_qtls.R', + 'scripts/combine.py', 'scripts/plot_fun.R'], + + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + 'Development Status :: 4 - Beta', + # 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: End Users/Desktop', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: POSIX', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Topic :: Utilities', + ], + + # Requirements + requires=REQUIREMENTS, + install_requires=REQUIREMENTS, +)