Skip to content

Commit

Permalink
Add two distance metrics, three-way comparison and bootstrapping (#608)
Browse files Browse the repository at this point in the history
* add two distance metrics

* add obsm_key param to distance test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add agg fct

* speed up tests

* add type

* add description

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <[email protected]>

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Lukas Heumos <[email protected]>

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <[email protected]>

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <[email protected]>

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <[email protected]>

* Update pertpy/tools/_distances/_distances.py

Co-authored-by: Eljas Roellin <[email protected]>

* update code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix drug

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add bootstrapping and metrics_3g

* speed up tests

* remove test classes

* drop test classes

* update compare_de

* correct the comments

* speed up tests

* speed up tests

* split metrics_3g

* fix pre-commit

* pin numpy <2

* unpin numpy

* speed up mahalanobis distance

* use scipy to calculate mahalanobis distance

* rename DGE to DGEEVAL

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Lukas Heumos <[email protected]>
Co-authored-by: Eljas Roellin <[email protected]>
  • Loading branch information
4 people authored Jun 24, 2024
1 parent 74914a9 commit a22aaab
Show file tree
Hide file tree
Showing 10 changed files with 880 additions and 144 deletions.
17 changes: 15 additions & 2 deletions pertpy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,31 @@
from pertpy.tools._coda._sccoda import Sccoda
from pertpy.tools._coda._tasccoda import Tasccoda
from pertpy.tools._dialogue import Dialogue
from pertpy.tools._differential_gene_expression import EdgeR, PyDESeq2, Statsmodels, TTest, WilcoxonTest
from pertpy.tools._differential_gene_expression import (
DGEEVAL,
EdgeR,
PyDESeq2,
Statsmodels,
TTest,
WilcoxonTest,
)
from pertpy.tools._distances._distance_tests import DistanceTest
from pertpy.tools._distances._distances import Distance
from pertpy.tools._enrichment import Enrichment
from pertpy.tools._milo import Milo
from pertpy.tools._mixscape import Mixscape
from pertpy.tools._perturbation_space._clustering import ClusteringSpace
from pertpy.tools._perturbation_space._comparison import PerturbationComparison
from pertpy.tools._perturbation_space._discriminator_classifiers import (
LRClassifierSpace,
MLPClassifierSpace,
)
from pertpy.tools._perturbation_space._simple import CentroidSpace, DBSCANSpace, KMeansSpace, PseudobulkSpace
from pertpy.tools._perturbation_space._simple import (
CentroidSpace,
DBSCANSpace,
KMeansSpace,
PseudobulkSpace,
)
from pertpy.tools._scgen import Scgen

__all__ = [
Expand Down
1 change: 1 addition & 0 deletions pertpy/tools/_differential_gene_expression/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ._base import ContrastType, LinearModelBase, MethodBase
from ._dge_comparison import DGEEVAL
from ._edger import EdgeR
from ._pydeseq2 import PyDESeq2
from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest
Expand Down
86 changes: 86 additions & 0 deletions pertpy/tools/_differential_gene_expression/_dge_comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import numpy as np
import pandas as pd
from anndata import AnnData


class DGEEVAL:
    def compare(
        self,
        adata: AnnData | None = None,
        de_key1: str | None = None,
        de_key2: str | None = None,
        de_df1: pd.DataFrame | None = None,
        de_df2: pd.DataFrame | None = None,
        shared_top: int = 100,
    ) -> dict[str, float]:
        """Compare two differential expression analyses.

        Compare two sets of DE results and evaluate the similarity by the
        overlap of top DEG and the correlation of their scores and adjusted
        p-values. Results are aligned gene-wise by sorting on gene names before
        the correlations are computed.

        Args:
            adata: AnnData object containing DE results in `uns`. Required if `de_key1` and `de_key2` are used.
            de_key1: Key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_key2: Another key for DE results in `adata.uns`, e.g., output of `tl.rank_genes_groups`.
            de_df1: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
            de_df2: DataFrame containing DE results, e.g. output from pertpy differential gene expression interface.
            shared_top: The number of top DEG to compute the proportion of their intersection.

        Returns:
            Dictionary with the metrics `shared_top_genes` (fraction of shared
            top DEG), `scores_corr` and `pvals_adj_corr` (Pearson) and
            `scores_ranks_corr` (Spearman on gene ranks).

        Raises:
            ValueError: If keys and DataFrames are mixed, if one of a required
                pair is missing, if `adata` is missing or lacks the keys, if a
                DataFrame lacks required columns, or if the gene sets differ.
        """
        if (de_key1 or de_key2) and (de_df1 is not None or de_df2 is not None):
            raise ValueError(
                "Please provide either both `de_key1` and `de_key2` with `adata`, or `de_df1` and `de_df2`, but not both."
            )

        if de_df1 is None and de_df2 is None:  # use keys
            if not de_key1 or not de_key2:
                raise ValueError("Both `de_key1` and `de_key2` must be provided together if using `adata`.")

        else:  # use dfs
            if de_df1 is None or de_df2 is None:
                raise ValueError("Both `de_df1` and `de_df2` must be provided together if using DataFrames.")

        if de_key1:
            # `is None` rather than truthiness: an AnnData with zero observations
            # is falsy and would otherwise be rejected despite being provided.
            if adata is None:
                raise ValueError("`adata` should be provided with `de_key1` and `de_key2`. ")
            # Explicit raise (not assert) so validation survives `python -O`.
            if not all(k in adata.uns for k in [de_key1, de_key2]):
                raise ValueError("Provided `de_key1` and `de_key2` must exist in `adata.uns`.")
            variables = adata.var_names

        if de_df1 is not None:
            for df in (de_df1, de_df2):
                if not {"variable", "log_fc", "adj_p_value"}.issubset(df.columns):
                    raise ValueError("Each DataFrame must contain columns: 'variable', 'log_fc', and 'adj_p_value'.")

            if set(de_df1["variable"]) != set(de_df2["variable"]):
                raise ValueError("Variables in both dataframes must match.")
            variables = de_df1["variable"].sort_values()

        shared_top = min(shared_top, len(variables))
        vars_ranks = np.arange(1, len(variables) + 1)
        results = pd.DataFrame(index=variables)
        top_names = []

        if de_key1 and de_key2:
            for i, k in enumerate([de_key1, de_key2]):
                label = adata.uns[k]["names"].dtype.names[0]
                srt_idx = np.argsort(adata.uns[k]["names"][label])
                results[f"scores_{i}"] = adata.uns[k]["scores"][label][srt_idx]
                results[f"pvals_adj_{i}"] = adata.uns[k]["pvals_adj"][label][srt_idx]
                results[f"ranks_{i}"] = vars_ranks[srt_idx]
                top_names.append(adata.uns[k]["names"][label][:shared_top])
        else:
            for i, df in enumerate([de_df1, de_df2]):
                # Align rows by alphabetical gene order so per-gene values are
                # comparable across the two result sets.
                srt_idx = np.argsort(df["variable"])
                results[f"scores_{i}"] = df["log_fc"].values[srt_idx]
                results[f"pvals_adj_{i}"] = df["adj_p_value"].values[srt_idx]
                results[f"ranks_{i}"] = vars_ranks[srt_idx]
                # NOTE(review): the first rows are taken as "top" DEG — this
                # assumes the DataFrames arrive pre-sorted by significance.
                top_names.append(df["variable"][:shared_top])

        metrics = {}
        metrics["shared_top_genes"] = len(set(top_names[0]).intersection(top_names[1])) / shared_top
        metrics["scores_corr"] = results["scores_0"].corr(results["scores_1"], method="pearson")
        metrics["pvals_adj_corr"] = results["pvals_adj_0"].corr(results["pvals_adj_1"], method="pearson")
        metrics["scores_ranks_corr"] = results["ranks_0"].corr(results["ranks_1"], method="spearman")

        return metrics
13 changes: 9 additions & 4 deletions pertpy/tools/_distances/_distance_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,14 @@ def __init__(
self.alpha = alpha
self.correction = correction
self.cell_wise_metric = (
cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric
cell_wise_metric if cell_wise_metric else Distance(self.metric, obsm_key=self.obsm_key).cell_wise_metric
)

self.distance = Distance(
self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric
self.metric,
layer_key=self.layer_key,
obsm_key=self.obsm_key,
cell_wise_metric=self.cell_wise_metric,
)

def __call__(
Expand Down Expand Up @@ -176,7 +179,8 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar:
# Evaluate the test
# count times shuffling resulted in larger distance
comparison_results = np.array(
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0,
dtype=int,
)
n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
pvalues = n_failures / self.n_perms
Expand Down Expand Up @@ -284,7 +288,8 @@ def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose:
# Evaluate the test
# count times shuffling resulted in larger distance
comparison_results = np.array(
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0,
dtype=int,
)
n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
pvalues = n_failures / self.n_perms
Expand Down
Loading

0 comments on commit a22aaab

Please sign in to comment.