From 0b1d5587835651e38311dacd0cf7cecd0f097dbb Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sun, 7 Nov 2021 15:53:59 -0500 Subject: [PATCH 01/10] init latentcor --- R/correlations.R | 6 ++++++ tests/testthat/test_correlations.R | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 R/correlations.R create mode 100644 tests/testthat/test_correlations.R diff --git a/R/correlations.R b/R/correlations.R new file mode 100644 index 0000000..0f480f1 --- /dev/null +++ b/R/correlations.R @@ -0,0 +1,6 @@ +#' @export +latentcor <- function(data, types = NULL) { + suppressMessages( + latentcor::latentcor(X = data, types = types, use.nearPD=FALSE, nu = 0, showplot = FALSE)$Rpointwise + ) +} diff --git a/tests/testthat/test_correlations.R b/tests/testthat/test_correlations.R new file mode 100644 index 0000000..027b64c --- /dev/null +++ b/tests/testthat/test_correlations.R @@ -0,0 +1,29 @@ +context('setup') + +p <- 20 +e <- p +n <- 100 +set.seed(10010) +g <- make_graph('erdos_renyi', p, e) +S <- cov2cor(prec2cov(graph2prec(g))) +X <- rmvnegbin(n, rep(10,p), S, ks=.3) + +pargs <- list(seed=10010, rep.num=10) + +context("Correlation fit") +# lmx <- 1 +lmr <- 5e-2 +nlam <- 10 +## Default ## +out_default <- spiec.easi(X, method='mb', cov.method='default', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) +out_cor <- spiec.easi(X, method='mb', cov.method='cor', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) +out_cov <- spiec.easi(X, method='mb', cov.method='cor', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) +out_lcor <- spiec.easi(X, method='mb', cov.method='latentcor', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) + +## TODO: test some outputs here + +## test normalizations: TODO; break this out +out_mlcor <- spiec.easi(X, method='mb', cov.method='latentcor', + norm.params=list(method='mclr'), + verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, + pulsar.select=FALSE) From ee1adaef0d0644b55146095fc21f7152a845149e Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sun, 7 Nov 2021 15:55:42 -0500 Subject: [PATCH 02/10] add cov.fun option to Sparse[LowRank]ICov --- R/SparseICov.R | 10 ++++++++-- R/SparseLowRankICov.R | 14 ++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/R/SparseICov.R b/R/SparseICov.R index a07950d..0bf0bb8 100644 --- a/R/SparseICov.R +++ b/R/SparseICov.R @@ -42,15 +42,20 @@ #' image(as.matrix(est.log$path[[3]][1:5,1:5])) #' image(as.matrix(est.clr$path[[3]][1:5,1:5])) #' image(as.matrix(est.f$path[[3]][1:5,1:5])) -sparseiCov <- function(data, method, npn=FALSE, verbose=FALSE, cov.output = TRUE, ...) { +sparseiCov <- function(data, method, cov.fun='cor', npn=FALSE, verbose=FALSE, cov.output = TRUE, ...) { if (npn) data <- huge::huge.npn(data, verbose=verbose) + if (isSymmetric(data)) SigmaO <- data + else { + stopifnot(cov.fun %in% c('cor', 'cov', 'latentcor')) + SigmaO <- match.fun(cov.fun)(data) + } args <- list(...) method <- switch(method, glasso = "glasso", mb = "mb", stop("Method not supported")) if (is.null(args$lambda.min.ratio)) args$lambda.min.ratio <- 1e-3 - est <- do.call(huge::huge, c(args, list(x=data, + est <- do.call(huge::huge, c(args, list(x=SigmaO, method=method, verbose=verbose, cov.output = cov.output))) @@ -63,6 +68,7 @@ sparseiCov <- function(data, method, npn=FALSE, verbose=FALSE, cov.output = TRUE # est$data <- data # est$sym <- ifelse(!is.null(args$sym), args$sym, 'or') # } + est$data <- data return(est) } diff --git a/R/SparseLowRankICov.R b/R/SparseLowRankICov.R index 2839bb3..75c1d1b 100644 --- a/R/SparseLowRankICov.R +++ b/R/SparseLowRankICov.R @@ -5,7 +5,8 @@ #' @param data the n x p data matrix #' @param npn flag to first fit nonparametric normal transform to the data #' @param verbose flag to turn on verbose output -#' @param cor flag to use correlation matrix as the input (default: false - uses covariance) +# DEPRECATED +## #' @param cor flag to use correlation matrix as the input (default: false - uses covariance) #' @param ... arguments to override default algorithm settings (see details) #' @export #' @details @@ -19,7 +20,7 @@ #' The argument \code{nlambda} determines the number of penalties - somewhere between 10-100 is usually good, depending on how the values of empirical correlation are distributed.#' @export #' #' One of \code{beta} (penalty for the nuclear norm) or \code{r} (number of ranks) should be supplied or \code{r=2} is chosen by default. -sparseLowRankiCov <- function(data, npn=FALSE, verbose=FALSE, cor=FALSE, ...) { +sparseLowRankiCov <- function(data, cov.fun='cor', npn=FALSE, verbose=FALSE, cor=FALSE, ...) { ## TODO: make args to admm2 explicit args <- list(...) if (length(args$r) > 1 || length(args$beta) > 1) @@ -27,8 +28,13 @@ sparseLowRankiCov <- function(data, npn=FALSE, verbose=FALSE, cor=FALSE, ...) { if (npn) data <- huge::huge.npn(data, verbose=verbose) if (isSymmetric(data)) SigmaO <- data - else SigmaO <- cov(data) - if (cor) SigmaO <- cov2cor(SigmaO) + else { + stopifnot(cov.fun %in% c('cor', 'cov', 'latentcor')) + fcor <- match.fun(cov.fun) + SigmaO <- fcor(data) + } + # else SigmaO <- cov(data) + # if (cor) SigmaO <- cov2cor(SigmaO) if (!is.null(args[[ "lambda.max" ]])) maxlam <- args$lambda.max else maxlam <- 1 From 0779f27c40ce946532ad2a0657d09cbe174ad548 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sat, 4 Dec 2021 16:22:57 -0500 Subject: [PATCH 03/10] add latentcor, multi-data types, mclr --- DESCRIPTION | 7 +- NAMESPACE | 6 ++ R/SparseICov.R | 6 +- R/correlations.R | 10 ++ R/data_types.R | 26 +++++ R/normalization.R | 65 ++++++++++++ R/spiec-easi.R | 142 ++++++++++++++++++++------- man/clr.Rd | 9 ++ man/mclr.Rd | 25 +++++ man/sparseLowRankiCov.Rd | 11 ++- man/sparseiCov.Rd | 13 ++- man/spiec.easi.Rd | 17 +++- tests/testthat/test_correlations.R | 11 +-- tests/testthat/test_normalizations.R | 15 +++ tests/testthat/test_sparcc.R | 11 ++- tests/testthat/test_typedetect.R | 26 +++++ 16 files changed, 336 insertions(+), 64 deletions(-) create mode 100644 R/data_types.R create mode 100644 man/mclr.Rd create mode 100644 tests/testthat/test_normalizations.R create mode 100644 tests/testthat/test_typedetect.R diff --git a/DESCRIPTION b/DESCRIPTION index 8334796..32c2c42 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: SpiecEasi Title: Sparse Inverse Covariance for Ecological Statistical Inference -Version: 1.1.3 +Version: 2.0.0 Authors@R: c( person("Zachary", "Kurtz", role = c("aut", "cre"), email="zdkurtz@gmail.com"), person("Christian", "Mueller", role = "aut"), @@ -10,7 +10,7 @@ Authors@R: c( Description: Estimate networks from the precision matrix of compositional microbial abundance data. Encoding: UTF-8 Depends: - R (>= 3.6.0), + R (>= 4.0.0), Imports: stats, methods, @@ -21,7 +21,8 @@ Imports: MASS, VGAM, Matrix (>= 1.5), - glmnet + glmnet, + latentcor (>= 1.2.0), Suggests: parallel, boot, diff --git a/NAMESPACE b/NAMESPACE index 1a044f0..ca43249 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,13 +34,19 @@ export(getOptiCov) export(getRefit) export(getStability) export(get_comm_params) +export(get_types) export(graph2prec) +export(is.count) +export(is.normalized) +export(latentcor) export(make_graph) +export(mclr) export(multi.spiec.easi) export(neff) export(norm_pseudo) export(norm_rdiric) export(norm_to_total) +export(pclr) export(prec2cov) export(pval.sparccboot) export(qqdplot_comm) diff --git a/R/SparseICov.R b/R/SparseICov.R index 0bf0bb8..ed828ef 100644 --- a/R/SparseICov.R +++ b/R/SparseICov.R @@ -9,6 +9,7 @@ #' @param npn perform Nonparanormal (npn) transformation before estimation? #' @param verbose print progress to standard out #' @param cov.output return the covariance matrix as well. +#' @param types if cov.method is latentcor, the column types parameter can be supplied as a character vector or (if NULL) inferred #' @param ... further arguments to huge/estimation functions. See details. #' @details #' This is a wrapper function for sparse iCov estimations performed by glasso in the huge package. @@ -42,13 +43,12 @@ #' image(as.matrix(est.log$path[[3]][1:5,1:5])) #' image(as.matrix(est.clr$path[[3]][1:5,1:5])) #' image(as.matrix(est.f$path[[3]][1:5,1:5])) -sparseiCov <- function(data, method, cov.fun='cor', npn=FALSE, verbose=FALSE, cov.output = TRUE, ...) { +sparseiCov <- function(data, method, cov.fun='cor', npn=FALSE, verbose=FALSE, cov.output = TRUE, types=NULL, ...) { if (npn) data <- huge::huge.npn(data, verbose=verbose) if (isSymmetric(data)) SigmaO <- data else { - stopifnot(cov.fun %in% c('cor', 'cov', 'latentcor')) - SigmaO <- match.fun(cov.fun)(data) + SigmaO <- .match.cov(cov.fun, data, types) } args <- list(...) diff --git a/R/correlations.R b/R/correlations.R index 0f480f1..daae42e 100644 --- a/R/correlations.R +++ b/R/correlations.R @@ -1,6 +1,16 @@ #' @export latentcor <- function(data, types = NULL) { + ## remove custom 'count' types ## TODO: should this be done here?? + types <- gsub("_count", "", types) suppressMessages( latentcor::latentcor(X = data, types = types, use.nearPD=FALSE, nu = 0, showplot = FALSE)$Rpointwise ) } + +#' @keywords internal +.match.cov <- function(cov.fun, data, types) { + switch(cov.fun, + cor=cor(data), + cov=cov(data), + latentcor=latentcor(data, types=types)) +} diff --git a/R/data_types.R b/R/data_types.R new file mode 100644 index 0000000..3465768 --- /dev/null +++ b/R/data_types.R @@ -0,0 +1,26 @@ +#' @export +is.count <- function(x) { + stopifnot(is.vector(x)) + if (!is.numeric(x) && !is.integer(x)) return(FALSE) + if (isFALSE(all.equal(x, as.integer(x)))) return(FALSE) + if (any(x<0)) return(FALSE) + return(TRUE) +} + +#' @export +is.normalized <- function(X) { + # assuming n x p matrix + stopifnot(is.matrix(X) | is.data.frame(X)) + isTRUE(all.equal(rowSums(X), rep(1L, nrow(X)))) +} + + +#' @export +get_types <- function(X, ...) { + types <- latentcor::get_types(X) + ## determine count and compositional types as a subset of truncated or continuous + tc_ind <- (types=="tru") | (types=="con") + is_count <- apply(X[,tc_ind], 2, is.count) + types[tc_ind][is_count] <- paste(types[tc_ind][is_count], "count", sep="_") + types +} diff --git a/R/normalization.R b/R/normalization.R index 9f0d23a..de629e6 100644 --- a/R/normalization.R +++ b/R/normalization.R @@ -85,6 +85,71 @@ clr.data.frame <- function(x.f, mar=2, ...) { clr(as.matrix(x.f), mar, ...) } +#' Centered log-ratio functions +#' @param x input count data +#' @param pseudo value of the pseudocount +#' @param ... pass through arguments +#' @rdname clr +#' @export +pclr <- function(x, mar=2, pseudo=1, ...) { + clr(x+pseudo, mar=mar, ...) +} + + +#' Modified central log ratio (mclr) transformation +#' +#' Introduced by GraceYoon/SPRING +#' +#' @param data raw count data or compositional data (n by p) does not matter. +#' @param base exp(1) for natural log +#' @param tol tolerance for checking zeros + +# For eps and atleast, users do not have to specify any values. Default should be enough. +#' @param eps epsilon in eq (2) of the paper "Yoon, Gaynanova, M\"{u}ller (2019), Frontiers in Genetics". positive shifts to all non-zero compositions. Refer to the paper for more details. eps = absolute value of minimum of log ratio counts plus c. +#' @param atleast default value is 1. Constant c which ensures all nonzero values to be strictly positive. default is 1. +#' +#' +#' @return \code{mclr} returns a data matrix of the same dimension with input data matrix. +#' @export +## from GraceYoon/SPRING/src/R/helpers.R +mclr <- function(data, base = exp(1), tol = 1e-16, eps = NULL, atleast = 1){ + data <- as.matrix(data) + nzero <- (data >= tol) # index for nonzero part + LOG <- ifelse(nzero, log(data, base), 0.0) # take log for only nonzero values. zeros stay as zeros. + + # centralize by the log of "geometric mean of only nonzero part" # it should be calculated by each row. + if (nrow(data) > 1){ + clrdata <- ifelse(nzero, LOG - rowMeans(LOG)/rowMeans(nzero), 0.0) + } else if (nrow(data) == 1){ + clrdata <- ifelse(nzero, LOG - mean(LOG)/mean(nzero), 0.0) + } + + if (is.null(eps)){ + if(atleast < 0){ + warning("atleast should be positive. The functions uses default value 1 instead.") + atleast = 1 + } + if( min(clrdata) < 0 ){ # to find the smallest negative value and add 1 to shift all dataa larger than zero. + positivecst <- abs(min(clrdata)) + atleast # "atleast" has default 1. + }else{ + positivecst <- 0 + } + # positive shift + ADDpos <- ifelse(nzero, clrdata + positivecst, 0.0) ## make all non-zero values strictly positive. + return(ADDpos) + } else if(eps == 0){ + ## no shift. clr transform applied to non-zero proportions only. without pseudo count. + return(clrdata) + } else if(eps > 0){ + ## use user-defined eps for additional positive shift. + ADDpos <- ifelse(nzero, clrdata + eps, 0.0) + return(ADDpos) + } else { + stop("check your eps value for additional positive shift. Otherwise, leave it as NULL.") + } +} + + #' Additive log-ratio functions #' @param x.f input data #' @param ... pass through arguments diff --git a/R/spiec-easi.R b/R/spiec-easi.R index 203af15..7de9542 100644 --- a/R/spiec-easi.R +++ b/R/spiec-easi.R @@ -18,21 +18,22 @@ spiec.easi <- function(data, ...) { return(OTU) } -.data.checks <- function(data) { - ## data checks ## - if (inherits(data, 'list')) { - sink <- lapply(data, .data.checks) - return(NULL) - } - if (isTRUE(all.equal(rowSums(data), rep(1L, nrow(data))))) { - warning('Data is normalized, but raw counts are expected') - } - - if (any(data<0)) { - warning('Negative values detected, but raw counts are expected') - } - return(NULL) -} +# DEPRECATED IN FAVOR OF TYPE DETECTION +# .data.checks <- function(data) { +# ## data checks ## +# if (inherits(data, 'list')) { +# sink <- lapply(data, .data.checks) +# return(NULL) +# } +# if (isTRUE(all.equal(rowSums(data), rep(1L, nrow(data))))) { +# warning('Data is normalized, but raw counts are expected') +# } +# +# if (any(data<0)) { +# warning('Negative values detected, but raw counts are expected') +# } +# return(NULL) +# } #' @method spiec.easi phyloseq #' @rdname spiec.easi @@ -57,14 +58,50 @@ spiec.easi.otu_table <- function(data, ...) { #' @noRd -.spiec.easi.norm <- function(data) { +#' @keywords internal +.spiec.easi.norm <- function(data, method='pclr', types=NULL, ...) { # internal function to normalize a data matrix if (inherits(data, 'matrix')) { + ## TODO: check that types argument and data attr are not both supplied + types <- attr(data, 'types') + if (is.null(types)) types <- get_types(data) + ## all columns are counts or comp - normalize ## + # TODO: create message if dataset is partially normalized + comp <- is.normalized(data) + if (comp || all(grepl("count", types))) { + if (comp & (method == "plcr")) { + message("input data is already total-sum-normalized, Is pseudocount needed?") + } + if (method=='pclr') { + data <- t(pclr(data, mar=1, ...)) + types[types == "tru_count"] <- 'con' + } else if (method == 'clr') { + data <- t(clr(data, mar=1, ...)) + types[types == "tru_count"] <- 'con' + } else if (method == 'alr') { + data <- t(alr(data, mar=1, ...)) + types[types == "tru_count"] <- 'con' + } else if (method == "mclr") { + data <- mclr(data, ...) + } else { + stop(sprintf("method '%s' not recognized", method)) + } + ## log ratio transforms won't be counts any longer + types <- gsub("_count", "", types) + } + attr(data, 'types') <- types + return(data) ## standard data pipeline - return(t(clr(data+1, 1))) } else if (inherits(data, 'list')) { ## multi domain spiec.easi, data must be list of numeric matrices - return(do.call('cbind', lapply(data, .spiec.easi.norm))) + ## types must be a list of types + data <- lapply(seq_along(data), function(i) { + .spiec.easi.norm(data[[i]], method=method, types=types[[i]]) + }) + types <- lapply(data, attr, which='types') + data <- do.call('cbind', data) + attr(data, 'types') <- unlist(types) + return(data) } else { stop('input data must be a numeric matrix') } @@ -125,46 +162,65 @@ NULL #' @param data For a matrix, non-normalized count OTU/data table with samples on rows and features/OTUs in columns. Can also by phyloseq or otu_table object. -#' @param method estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection) +#' @param types if data is a list, specify count, compositional or environmental covariates types. #TODO: flesh out this argument +#' @param method inverse covariance estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection) #' @param sel.criterion character string specifying criterion/method for model selection. Accepts 'stars' [default], 'bstars' (Bounded StARS) #' @param verbose flag to show progress messages #' @param pulsar.select flag to perform model selection. Choices are TRUE/FALSE/'batch' #' @param pulsar.params list of further arguments to \code{\link{pulsar}} or \code{\link{batch.pulsar}}. See the documentation for \code{\link{pulsar.params}}. +#' @param lambda.log should values of lambda be distributed logarithmically (\code{TRUE}) or linearly ()\code{FALSE}) between \code{lamba.min} and \code{lambda.max}? +#' @param norm.params method and other named arguments for the function which should normalize compositional (count) data [method can be 'clr', 'pclr', 'mclr' or 'alr']. +#' @param cov.method method for inferring the empirical covariance matrix. 'Default' choice depends on the method and input data but can be `cov`, `cor` or `latentcor`. #' @param icov.select deprecated. #' @param icov.select.params deprecated. -#' @param lambda.log should values of lambda be distributed logarithmically (\code{TRUE}) or linearly ()\code{FALSE}) between \code{lamba.min} and \code{lambda.max}? #' @param ... further arguments to \code{\link{sparseiCov}} / \code{huge} #' @method spiec.easi default #' @rdname spiec.easi #' @seealso multi.spiec.easi #' @export -spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', - verbose=TRUE, pulsar.select=TRUE, pulsar.params=list(), - icov.select=pulsar.select, - icov.select.params=pulsar.params, - lambda.log=TRUE, ...) { +spiec.easi.default <- function(data, types=NULL, + method='glasso', + sel.criterion='stars', + verbose=TRUE, + pulsar.select=TRUE, pulsar.params=list(), + lambda.log=TRUE, + norm.params=list(method='pclr', pseudo=1), + cov.method='default', + icov.select=pulsar.select, + icov.select.params=pulsar.params, ...) { args <- list(...) if (verbose) msg <- .makeMessage("Applying data transformations...") else msg <- .makeMessage('') + # TODO: check that norm.params is a list with valid method + norm.params[['data']] <- data + # TODO: check that cov.params is a list with valid method + stopifnot(cov.method %in% c('default', 'cor', 'cov', 'latentcor')) + switch(method, - glasso = { + glasso = { message(msg, appendLF=verbose) estFun <- "sparseiCov" args$method <- method - X <- .spiec.easi.norm(data) + X <- do.call(.spiec.easi.norm, norm.params) + args$types <- attr(X, 'types') + if (cov.method=='default') args$cov.fun <- 'cor' + else args$cov.fun <- cov.method if (is.null(args[['lambda.max']])) - args$lambda.max <- getMaxCov(cor(X)) + args$lambda.max <- getMaxCov(.match.cov(args$cov.fun, X, args$types)) }, mb = { message(msg, appendLF=verbose) estFun <- "sparseiCov" args$method <- method - X <- .spiec.easi.norm(data) + X <- do.call(.spiec.easi.norm, norm.params) + args$types <- attr(X, 'types') + if (cov.method=='default') args$cov.fun <- 'cor' + else args$cov.fun <- cov.method if (is.null(args[['lambda.max']])) - args$lambda.max <- getMaxCov(cor(X)) + args$lambda.max <- getMaxCov(.match.cov(args$cov.fun, X, args$types)) }, slr = { @@ -186,15 +242,20 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', } message(msg, appendLF=verbose) estFun <- "sparseLowRankiCov" - X <- .spiec.easi.norm(data) + X <- do.call(.spiec.easi.norm, norm.params) + args$types <- attr(X, 'types') + if (cov.method=='default') args$cov.fun <- 'cov' + else args$cov.fun <- cov.method if (is.null(args[['lambda.max']])) - args$lambda.max <- getMaxCov(cov(X)) + args$lambda.max <- getMaxCov(.match.cov(args$cov.fun, X, args$types)) }, coat = { message(msg, appendLF=verbose) estFun <- "coat" - X <- .spiec.easi.norm(data) + X <- do.call(.spiec.easi.norm, norm.params) + types <- attr(X, 'types') + ## TODO: implement cov/cor/latentcor for COAT if (is.null(args[['lambda.max']])) args$lambda.max <- getMaxCov(X) }, @@ -212,8 +273,8 @@ spiec.easi.default <- function(data, method='glasso', sel.criterion='stars', }, poisson= { - if (inherits(data, 'list')) - stop('method "poisson" does not support list data') + if (inherits(data, 'list')) + stop('method "poisson" does not support list data') message(msg, appendLF=verbose) estFun <- "neighborhood.net" @@ -318,6 +379,7 @@ multi.spiec.easi <- function(datalist, method='glasso', sel.criterion='stars', verbose=TRUE, pulsar.select=TRUE, pulsar.params=list(), ...) { ## functional wrapper for spiec.easi.list +## check that datalist is all compositional types spiec.easi.list(datalist, method=method, sel.criterion=sel.criterion, verbose=verbose, pulsar.select=pulsar.select, pulsar.params=pulsar.params, ...) @@ -328,14 +390,18 @@ multi.spiec.easi <- function(datalist, method='glasso', sel.criterion='stars', #' @rdname multi.spiec.easi #' @export spiec.easi.list <- function(data, ...) { - classes <- sapply(data, class) - if (length(unique(classes)) != 1) - warning('input list contains data of mixed classes.') + args <- list(...) + # TODO: move this check to to after types detection + # classes <- sapply(data, function(x) class(x)[1]) + # if ((length(unique(classes)) != 1) & (args$cov.method!='latentcor')) + # message("input list contains data of mixed classes. 'latentcor' is the recommended `cov.method`") ## convert phyloseq objects to matrices if (any('phyloseq' %in% classes) || any('otu_table' %in% classes)) data <- lapply(data, .phy2mat) + ## TODO: allow phyloseq sample_data for env metadata + ## Finally, check the number of rows (samples) are equal ## and sample names are identical (sample names can be NULL) ssizes <- lapply(data, nrow) diff --git a/man/clr.Rd b/man/clr.Rd index 13e5e1b..3d9f058 100644 --- a/man/clr.Rd +++ b/man/clr.Rd @@ -5,6 +5,7 @@ \alias{clr.default} \alias{clr.matrix} \alias{clr.data.frame} +\alias{pclr} \title{Centered log-ratio functions} \usage{ clr(x.f, ...) @@ -14,6 +15,8 @@ clr(x.f, ...) \method{clr}{matrix}(x.f, mar = 2, ...) \method{clr}{data.frame}(x.f, mar = 2, ...) + +pclr(x, mar = 2, pseudo = 1, ...) } \arguments{ \item{x.f}{input data} @@ -25,7 +28,13 @@ clr(x.f, ...) \item{tol}{tolerance for a numerical zero} \item{mar}{margin to apply the transformation (rows: 1 or cols: 2)} + +\item{x}{input count data} + +\item{pseudo}{value of the pseudocount} } \description{ +Centered log-ratio functions + Centered log-ratio functions } diff --git a/man/mclr.Rd b/man/mclr.Rd new file mode 100644 index 0000000..5981bc5 --- /dev/null +++ b/man/mclr.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/normalization.R +\name{mclr} +\alias{mclr} +\title{Modified central log ratio (mclr) transformation} +\usage{ +mclr(data, base = exp(1), tol = 1e-16, eps = NULL, atleast = 1) +} +\arguments{ +\item{data}{raw count data or compositional data (n by p) does not matter.} + +\item{base}{exp(1) for natural log} + +\item{tol}{tolerance for checking zeros} + +\item{eps}{epsilon in eq (2) of the paper "Yoon, Gaynanova, M\"{u}ller (2019), Frontiers in Genetics". positive shifts to all non-zero compositions. Refer to the paper for more details. eps = absolute value of minimum of log ratio counts plus c.} + +\item{atleast}{default value is 1. Constant c which ensures all nonzero values to be strictly positive. default is 1.} +} +\value{ +\code{mclr} returns a data matrix of the same dimension with input data matrix. +} +\description{ +Introduced by GraceYoon/SPRING +} diff --git a/man/sparseLowRankiCov.Rd b/man/sparseLowRankiCov.Rd index b0045a1..7a081f9 100644 --- a/man/sparseLowRankiCov.Rd +++ b/man/sparseLowRankiCov.Rd @@ -4,7 +4,14 @@ \alias{sparseLowRankiCov} \title{Sparse plus Low Rank inverse covariance} \usage{ -sparseLowRankiCov(data, npn = FALSE, verbose = FALSE, cor = FALSE, ...) +sparseLowRankiCov( + data, + cov.fun = "cor", + npn = FALSE, + verbose = FALSE, + cor = FALSE, + ... +) } \arguments{ \item{data}{the n x p data matrix} @@ -13,8 +20,6 @@ sparseLowRankiCov(data, npn = FALSE, verbose = FALSE, cor = FALSE, ...) \item{verbose}{flag to turn on verbose output} -\item{cor}{flag to use correlation matrix as the input (default: false - uses covariance)} - \item{...}{arguments to override default algorithm settings (see details)} } \description{ diff --git a/man/sparseiCov.Rd b/man/sparseiCov.Rd index 5d9e6cf..0e6596e 100644 --- a/man/sparseiCov.Rd +++ b/man/sparseiCov.Rd @@ -4,7 +4,16 @@ \alias{sparseiCov} \title{Sparse/penalized estimators of covariance matrices} \usage{ -sparseiCov(data, method, npn = FALSE, verbose = FALSE, cov.output = TRUE, ...) +sparseiCov( + data, + method, + cov.fun = "cor", + npn = FALSE, + verbose = FALSE, + cov.output = TRUE, + types = NULL, + ... +) } \arguments{ \item{data}{data matrix with features/OTUs in the columns and samples in the rows. Should be transformed by clr for meaningful results, if the data is compositional} @@ -17,6 +26,8 @@ sparseiCov(data, method, npn = FALSE, verbose = FALSE, cov.output = TRUE, ...) \item{cov.output}{return the covariance matrix as well.} +\item{types}{if cov.method is latentcor, the column types parameter can be supplied as a character vector or (if NULL) inferred} + \item{...}{further arguments to huge/estimation functions. See details.} } \description{ diff --git a/man/spiec.easi.Rd b/man/spiec.easi.Rd index e68884c..ea7172a 100644 --- a/man/spiec.easi.Rd +++ b/man/spiec.easi.Rd @@ -15,14 +15,17 @@ spiec.easi(data, ...) \method{spiec.easi}{default}( data, + types = NULL, method = "glasso", sel.criterion = "stars", verbose = TRUE, pulsar.select = TRUE, pulsar.params = list(), + lambda.log = TRUE, + norm.params = list(method = "pclr", pseudo = 1), + cov.method = "default", icov.select = pulsar.select, icov.select.params = pulsar.params, - lambda.log = TRUE, ... ) } @@ -31,7 +34,9 @@ spiec.easi(data, ...) \item{...}{further arguments to \code{\link{sparseiCov}} / \code{huge}} -\item{method}{estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection)} +\item{types}{if data is a list, specify count, compositional or environmental covariates types. #TODO: flesh out this argument} + +\item{method}{inverse covariance estimation method to use as a character string. Currently either 'glasso' or 'mb' (meinshausen-buhlmann's neighborhood selection)} \item{sel.criterion}{character string specifying criterion/method for model selection. Accepts 'stars' [default], 'bstars' (Bounded StARS)} @@ -41,11 +46,15 @@ spiec.easi(data, ...) \item{pulsar.params}{list of further arguments to \code{\link{pulsar}} or \code{\link{batch.pulsar}}. See the documentation for \code{\link{pulsar.params}}.} +\item{lambda.log}{should values of lambda be distributed logarithmically (\code{TRUE}) or linearly ()\code{FALSE}) between \code{lamba.min} and \code{lambda.max}?} + +\item{norm.params}{method and other named arguments for the function which should normalize compositional (count) data [method can be 'clr', 'pclr', 'mclr' or 'alr'].} + +\item{cov.method}{method for inferring the empirical covariance matrix. 'Default' choice depends on the method and input data but can be `cov`, `cor` or `latentcor`.} + \item{icov.select}{deprecated.} \item{icov.select.params}{deprecated.} - -\item{lambda.log}{should values of lambda be distributed logarithmically (\code{TRUE}) or linearly ()\code{FALSE}) between \code{lamba.min} and \code{lambda.max}?} } \description{ Run the whole SPIEC-EASI pipeline, from data transformation, sparse inverse covariance estimation and model selection. diff --git a/tests/testthat/test_correlations.R b/tests/testthat/test_correlations.R index 027b64c..415e295 100644 --- a/tests/testthat/test_correlations.R +++ b/tests/testthat/test_correlations.R @@ -1,5 +1,5 @@ context('setup') - +## Test various correlation methods p <- 20 e <- p n <- 100 @@ -14,6 +14,8 @@ context("Correlation fit") # lmx <- 1 lmr <- 5e-2 nlam <- 10 + +test_that("execution succeeds", { ## Default ## out_default <- spiec.easi(X, method='mb', cov.method='default', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) out_cor <- spiec.easi(X, method='mb', cov.method='cor', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) @@ -21,9 +23,4 @@ out_cov <- spiec.easi(X, method='mb', cov.method='cor', verbose=FALSE, lambda.mi out_lcor <- spiec.easi(X, method='mb', cov.method='latentcor', verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, pulsar.select=FALSE) ## TODO: test some outputs here - -## test normalizations: TODO; break this out -out_mlcor <- spiec.easi(X, method='mb', cov.method='latentcor', - norm.params=list(method='mclr'), - verbose=FALSE, lambda.min.ratio=lmr, nlambda=nlam, - pulsar.select=FALSE) +}) diff --git a/tests/testthat/test_normalizations.R b/tests/testthat/test_normalizations.R new file mode 100644 index 0000000..9429ebc --- /dev/null +++ b/tests/testthat/test_normalizations.R @@ -0,0 +1,15 @@ +context('setup') +## Test various correlation methods +p <- 20 +e <- p +n <- 100 +set.seed(10010) +g <- make_graph('erdos_renyi', p, e) +S <- cov2cor(prec2cov(graph2prec(g))) +X <- rmvnegbin(n, rep(10,p), S, ks=.3) + +test_that("execution succeeds", { + Xmclr <- .spiec.easi.norm(X, method='mclr') + Xpclr <- .spiec.easi.norm(X, method='pclr') + Xalr <- .spiec.easi.norm(X, method='alr') +}) diff --git a/tests/testthat/test_sparcc.R b/tests/testthat/test_sparcc.R index 3bcad01..7190570 100644 --- a/tests/testthat/test_sparcc.R +++ b/tests/testthat/test_sparcc.R @@ -10,11 +10,12 @@ X <- exp(rmvnorm(n, rep(0,p), S)) X.f <- t(apply(X, 1, norm_to_total)) context("Input data") -test_that("data is counts", { - expect_silent(.data.checks(X)) - expect_warning(.data.checks(X.f)) - expect_warning(.data.checks(scale(X))) -}) +# TODO: redo with data types +# test_that("data is counts", { +# expect_silent(.data.checks(X)) +# expect_warning(.data.checks(X.f)) +# expect_warning(.data.checks(scale(X))) +# }) context("SparCC") test_that("sparcc gives expected output", { diff --git a/tests/testthat/test_typedetect.R b/tests/testthat/test_typedetect.R new file mode 100644 index 0000000..49cb749 --- /dev/null +++ b/tests/testthat/test_typedetect.R @@ -0,0 +1,26 @@ +context('setup') +library(latentcor) + + +set.seed(10010) +X1 <- gen_data(n = 100, types = c("tru", "con", "con", "bin", "ter"), + XP = list(.1, 1e-4, NA, .5, c(.6,.3,.1)))$X +## create count types +X1[X1[,1]!=0,1] <- round(exp(X1[X1[,1]!=0,1])) +X1[ ,2] <- round(exp(1.5*X1[,2]+3)) + + +p <- 20 +e <- p +n <- 100 +set.seed(10010) +g <- make_graph('erdos_renyi', p, e) +S <- cov2cor(prec2cov(graph2prec(g))) +X2 <- rmvnegbin(n, rep(10,p), S, ks=.3) + +datanorm <- .spiec.easi.norm(list(X1, X2)) +types <- attr(datanorm, 'types') + +test_that("execution succeeds", { + est <- spiec.easi(list(X1, X2), method='mb', cov.fun='latentcor') +}) From 98ab0567ffea4d33fb1885ecefa407cc0996b6b9 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sat, 4 Dec 2021 16:25:42 -0500 Subject: [PATCH 04/10] move dclr functions --- R/SparseICov.R | 3 --- R/normalization.R | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/SparseICov.R b/R/SparseICov.R index ed828ef..0a00000 100644 --- a/R/SparseICov.R +++ b/R/SparseICov.R @@ -173,6 +173,3 @@ glm.neighborhood <- function(X, Y, lambda, link='binomial', ...) { # } # return(Bmat) # } - -dclr <- function(x) t(clr(apply(x, 1, norm_diric),2)) -dclrNPN <- function(x) huge::huge.npn(t(clr(apply(x, 1, norm_diric),2)), verbose=FALSE) diff --git a/R/normalization.R b/R/normalization.R index de629e6..1f9d4ce 100644 --- a/R/normalization.R +++ b/R/normalization.R @@ -196,6 +196,14 @@ alr.data.frame <- function(x.f, mar=2, ...) { alr(as.matrix(x.f), mar, ...) } +# TODO: should these be exported +#' @keywords internal +dclr <- function(x) t(clr(apply(x, 1, norm_diric),2)) + +#' @keywords internal +dclrNPN <- function(x) huge::huge.npn(t(clr(apply(x, 1, norm_diric),2)), verbose=FALSE) + + #' @keywords internal triu <- function(x) x[upper.tri(x)] #' @keywords internal From 4066f6a34d102d1cc8e91685a08780e56337861f Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sat, 4 Dec 2021 16:49:42 -0500 Subject: [PATCH 05/10] add types and latentcor to slr method --- R/SparseICov.R | 1 + R/SparseLowRankICov.R | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/SparseICov.R b/R/SparseICov.R index 0a00000..bf4b9a7 100644 --- a/R/SparseICov.R +++ b/R/SparseICov.R @@ -69,6 +69,7 @@ sparseiCov <- function(data, method, cov.fun='cor', npn=FALSE, verbose=FALSE, co # est$sym <- ifelse(!is.null(args$sym), args$sym, 'or') # } est$data <- data + est$types <- types return(est) } diff --git a/R/SparseLowRankICov.R b/R/SparseLowRankICov.R index 75c1d1b..f03b021 100644 --- a/R/SparseLowRankICov.R +++ b/R/SparseLowRankICov.R @@ -20,7 +20,7 @@ #' The argument \code{nlambda} determines the number of penalties - somewhere between 10-100 is usually good, depending on how the values of empirical correlation are distributed.#' @export #' #' One of \code{beta} (penalty for the nuclear norm) or \code{r} (number of ranks) should be supplied or \code{r=2} is chosen by default. -sparseLowRankiCov <- function(data, cov.fun='cor', npn=FALSE, verbose=FALSE, cor=FALSE, ...) { +sparseLowRankiCov <- function(data, cov.fun='cor', npn=FALSE, verbose=FALSE, cor=FALSE, types=NULL, ...) { ## TODO: make args to admm2 explicit args <- list(...) if (length(args$r) > 1 || length(args$beta) > 1) @@ -29,9 +29,7 @@ sparseLowRankiCov <- function(data, cov.fun='cor', npn=FALSE, verbose=FALSE, cor if (npn) data <- huge::huge.npn(data, verbose=verbose) if (isSymmetric(data)) SigmaO <- data else { - stopifnot(cov.fun %in% c('cor', 'cov', 'latentcor')) - fcor <- match.fun(cov.fun) - SigmaO <- fcor(data) + SigmaO <- .match.cov(cov.fun, data, types) } # else SigmaO <- cov(data) # if (cor) SigmaO <- cov2cor(SigmaO) @@ -81,7 +79,7 @@ sparseLowRankiCov <- function(data, cov.fun='cor', npn=FALSE, verbose=FALSE, cor loglik[[i]] <- log(Matrix::det(R[z,z])) - sum(Matrix::diag(R[z,z] %*% SigmaO[z,z])) - (p-q) # args$opts$eta <- max(.5, (p-(n-i))/p) } - list(icov=icov, path=path, resid=resid, lambda=lambda, loglik=loglik, data=data) + list(icov=icov, path=path, resid=resid, lambda=lambda, loglik=loglik, data=data, types=types) } #' @useDynLib SpiecEasi From 8b55857abe7b3a51e2a469f737fc62e2fb68e809 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Sat, 4 Dec 2021 16:50:41 -0500 Subject: [PATCH 06/10] add normalization status message --- R/spiec-easi.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/spiec-easi.R b/R/spiec-easi.R index 7de9542..a473e3a 100644 --- a/R/spiec-easi.R +++ b/R/spiec-easi.R @@ -66,9 +66,9 @@ spiec.easi.otu_table <- function(data, ...) { types <- attr(data, 'types') if (is.null(types)) types <- get_types(data) ## all columns are counts or comp - normalize ## - # TODO: create message if dataset is partially normalized comp <- is.normalized(data) if (comp || all(grepl("count", types))) { + message(sprintf(" Compositional or count dataset detected... %s normalizing", method)) if (comp & (method == "plcr")) { message("input data is already total-sum-normalized, Is pseudocount needed?") } From f3f14207c667da5869324e8c7c4881c45acdc5c4 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Wed, 15 Dec 2021 17:22:25 -0500 Subject: [PATCH 07/10] uncomment class detection --- DESCRIPTION | 2 +- R/spiec-easi.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 32c2c42..97dc526 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: SpiecEasi Title: Sparse Inverse Covariance for Ecological Statistical Inference -Version: 2.0.0 +Version: 2.0.1 Authors@R: c( person("Zachary", "Kurtz", role = c("aut", "cre"), email="zdkurtz@gmail.com"), person("Christian", "Mueller", role = "aut"), diff --git a/R/spiec-easi.R b/R/spiec-easi.R index a473e3a..bb75b47 100644 --- a/R/spiec-easi.R +++ b/R/spiec-easi.R @@ -392,7 +392,7 @@ multi.spiec.easi <- function(datalist, method='glasso', sel.criterion='stars', spiec.easi.list <- function(data, ...) { args <- list(...) # TODO: move this check to to after types detection - # classes <- sapply(data, function(x) class(x)[1]) + classes <- sapply(data, function(x) class(x)[1]) # if ((length(unique(classes)) != 1) & (args$cov.method!='latentcor')) # message("input list contains data of mixed classes. 'latentcor' is the recommended `cov.method`") From 011b6d08d43df32ff9605e0d12c7ff7b0c3849c8 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Wed, 18 Oct 2023 20:56:30 -0400 Subject: [PATCH 08/10] match string or function name in match.cov --- R/correlations.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/correlations.R b/R/correlations.R index daae42e..4a28a9e 100644 --- a/R/correlations.R +++ b/R/correlations.R @@ -9,8 +9,11 @@ latentcor <- function(data, types = NULL) { #' @keywords internal .match.cov <- function(cov.fun, data, types) { + # match string or function's name + cov.fun <- rev(as.character(substitute(base::cov)))[1] switch(cov.fun, - cor=cor(data), - cov=cov(data), - latentcor=latentcor(data, types=types)) + cor=cor(data), + cov=cov(data), + latentcor=SpiecEasi::latentcor(data, types=types), + stop("covariance function not recognized")) } From 6e505af1f795ba38e0533562536b9689b3fd4ab7 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Wed, 18 Oct 2023 20:57:06 -0400 Subject: [PATCH 09/10] better counts detection in data typing --- R/data_types.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data_types.R b/R/data_types.R index 3465768..27f1953 100644 --- a/R/data_types.R +++ b/R/data_types.R @@ -2,7 +2,7 @@ is.count <- function(x) { stopifnot(is.vector(x)) if (!is.numeric(x) && !is.integer(x)) return(FALSE) - if (isFALSE(all.equal(x, as.integer(x)))) return(FALSE) + if (!isTRUE(all.equal(as.numeric(x), as.integer(x)))) return(FALSE) if (any(x<0)) return(FALSE) return(TRUE) } From 81fc814b62676c1e371aa91a2202fe9da1834e80 Mon Sep 17 00:00:00 2001 From: Zachary Kurtz Date: Wed, 18 Oct 2023 20:58:28 -0400 Subject: [PATCH 10/10] more type arg handling --- R/spiec-easi.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/spiec-easi.R b/R/spiec-easi.R index bb75b47..a0a813c 100644 --- a/R/spiec-easi.R +++ b/R/spiec-easi.R @@ -63,7 +63,9 @@ spiec.easi.otu_table <- function(data, ...) { # internal function to normalize a data matrix if (inherits(data, 'matrix')) { ## TODO: check that types argument and data attr are not both supplied - types <- attr(data, 'types') + if (!is.null(types) && !is.null(attr(data, 'types'))) { + stop('supply types as an argument or data attribute but not both.') + } if (is.null(types)) types <- get_types(data) ## all columns are counts or comp - normalize ## comp <- is.normalized(data)