Skip to content

Commit

Permalink
Add PerturbationDataValidator (#672)
Browse files Browse the repository at this point in the history
* Augur scsim warnings

Signed-off-by: zethson <[email protected]>

* Submodules

Signed-off-by: zethson <[email protected]>

* Add super draft of pertpy validator

Signed-off-by: zethson <[email protected]>

* Polish

Signed-off-by: zethson <[email protected]>

* Polish

Signed-off-by: zethson <[email protected]>

* Nested try

Signed-off-by: zethson <[email protected]>

* validator in test

Signed-off-by: zethson <[email protected]>

* try uv for rtd

Signed-off-by: zethson <[email protected]>

* rtd uv

Signed-off-by: zethson <[email protected]>

* rtd uv

Signed-off-by: zethson <[email protected]>

* rtd uv fix

Signed-off-by: zethson <[email protected]>

* mb sphinx fix for validator

Signed-off-by: zethson <[email protected]>

* docs

Signed-off-by: zethson <[email protected]>

* remove PerturbationValidator from docs

Signed-off-by: zethson <[email protected]>

* remove PerturbationValidator from docs

Signed-off-by: zethson <[email protected]>

---------

Signed-off-by: zethson <[email protected]>
  • Loading branch information
Zethson authored Oct 26, 2024
1 parent 851007c commit 31a7c64
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 19 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python 3.11
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: "3.11"
python-version: "3.12"

- name: Install hatch
run: pip install hatch
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
- name: Install dependencies
run: |
uv pip install --system rpy2
uv pip install --system ${{ matrix.pip-flags }} ".[dev,test,coda,de]"
uv pip install --system ${{ matrix.pip-flags }} ".[dev,test,coda,de,validator]"
- name: Test
env:
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,7 @@ lightning_logs/*

node_modules

# lamindb
test.ipynb
test-perturbation
test-bug
19 changes: 7 additions & 12 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,16 @@ build:
os: ubuntu-22.04
tools:
python: "3.11"
jobs:
pre_build:
- python -c "import pertpy"
- pip freeze
commands:
- asdf plugin add uv
- asdf install uv latest
- asdf global uv latest
- uv venv
- uv pip install .[doc,coda,de]
- .venv/bin/python -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html
sphinx:
configuration: docs/conf.py
fail_on_warning: false
python:
install:
- method: pip
path: .
extra_requirements:
- doc
- coda
- de

submodules:
include: all
2 changes: 2 additions & 0 deletions pertpy/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@
zhang_2021,
zhao_2021,
)
from pertpy.data._perturbation_validator import PerturbationCurator

__all__ = [
"PerturbationCurator",
"adamson_2016_pilot",
"adamson_2016_upr_epistasis",
"adamson_2016_upr_perturb_seq",
Expand Down
105 changes: 105 additions & 0 deletions pertpy/data/_perturbation_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from typing import Literal

import anndata as ad


class _PerturbationValidatorUnavailable:
    """Placeholder bound to the name ``PerturbationCurator`` when the real curator is unavailable.

    The module assigns this class to ``PerturbationCurator`` when importing the
    optional curation dependencies raises ``ImportError`` or when the setup
    raises django's ``ImproperlyConfigured``. Instantiating it always fails
    with an explanatory error.
    """

    def __init__(self, *args, **kwargs):
        """Always raise, regardless of the arguments supplied.

        Args:
            *args: Ignored. Accepted so that calls written for the real curator
                (e.g. ``PerturbationCurator(adata)``) reach the ``RuntimeError``
                below instead of failing with a ``TypeError`` about argument count.
            **kwargs: Ignored, for the same reason.

        Raises:
            RuntimeError: Always.
        """
        raise RuntimeError(
            "PerturbationCurator can only be instantiated if connected to a lamindb instance. "
            "If the optional dependencies are missing, install them with `pip install 'pertpy[validator]'`."
        )


# Nested try because django might not be installed.
# Outer level: any ImportError (django itself, or one of the optional packages
# imported in the inner block) falls back to the placeholder class.
# Inner level: django's ImproperlyConfigured raised while importing or setting up
# the registries also falls back to the placeholder.
# NOTE(review): presumably ImproperlyConfigured is what is raised when no lamindb
# instance is connected - confirm against lamindb's behaviour.
try:
    from django.core.exceptions import ImproperlyConfigured

    try:
        import bionty as bt
        import wetlab as wl
        from cellxgene_lamin import CellxGeneFields, Curate
        from lamin_utils import logger
        from lamindb_setup.core.types import UPathStr
        from lnschema_core import Record
        from lnschema_core.types import FieldAttr

        # Default values for ``.obs`` columns: the CELLxGENE defaults extended by the
        # perturbation-specific columns.
        pt_defaults = CellxGeneFields.OBS_FIELD_DEFAULTS | {
            "cell_line": "unknown",
            "genetic_treatments": "",
            "compound_treatments": "",
            "environmental_treatments": "",
            "combination_treatments": "",
        }

        # Mapping of ``.obs`` columns to registry fields: the CELLxGENE fields extended
        # by the perturbation-specific registries.
        pt_categoricals = CellxGeneFields.OBS_FIELDS | {
            "cell_line": bt.CellLine.name,
            "genetic_treatments": wl.GeneticTreatment.name,
            "compound_treatments": wl.CompoundTreatment.name,
            "environmental_treatments": wl.EnvironmentalTreatment.name,
            "combination_treatments": wl.CombinationTreatment.name,
        }

        # Both columns use the same "depmap" source record; it is looked up once
        # (this runs at import time) and shared between the two keys.
        # "compound_treatments" is intentionally not listed here: the chebi source is
        # registered on ``wl.Compound`` in ``PerturbationCurator.__init__`` instead.
        pt_sources: dict[str, Record] = dict.fromkeys(
            ("depmap_id", "cell_line"),
            bt.Source.filter(name="depmap").one(),
        )

        class PerturbationCurator(Curate):
            """Curator flow for Perturbation data - see pertpy-datasets."""

            def __init__(
                self,
                adata: ad.AnnData | UPathStr,
                var_index: FieldAttr = bt.Gene.ensembl_gene_id,
                categoricals: dict[str, FieldAttr] = pt_categoricals,
                organism: Literal["human", "mouse"] = "human",
                *,
                defaults: dict[str, str] = pt_defaults,
                extra_sources: dict[str, Record] = pt_sources,
                verbosity: str = "hint",
                schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
                using_key: str = "laminlabs/pertpy-datasets",
            ):
                """Curator flow for Perturbation data.

                Args:
                    adata: Path to or AnnData object to curate against the CELLxGENE schema.
                    var_index: The registry field for mapping the ``.var`` index.
                    categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
                        The PerturbationCurator maps against the required CELLxGENE fields and perturbation fields by default.
                    organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse' and therefore so do we.
                    defaults: Default values that are set if columns or column values are missing.
                    extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
                    verbosity: The verbosity level.
                    schema_version: The CELLxGENE schema version to curate against.
                    using_key: A reference LaminDB instance.
                """
                self.organism = organism

                # Set the Compound source to chebi; we don't want output if the source has already been set
                with logger.mute():
                    chebi_source = bt.Source.filter(entity="Drug", name="chebi").first()
                    wl.Compound.add_source(chebi_source)

                super().__init__(
                    adata=adata,
                    var_index=var_index,
                    categoricals=categoricals,
                    using_key=using_key,
                    defaults=defaults,
                    verbosity=verbosity,
                    organism=self.organism,
                    extra_sources=extra_sources,
                    schema_version=schema_version,
                )

            def validate(self) -> bool:
                """Validates the AnnData object against cellxgene and pertpy's requirements."""
                # Pure delegation to the parent implementation; kept so the method
                # carries a pertpy-specific docstring.
                return super().validate()

    except ImproperlyConfigured:
        PerturbationCurator = _PerturbationValidatorUnavailable  # type: ignore

except ImportError:
    PerturbationCurator = _PerturbationValidatorUnavailable  # type: ignore
7 changes: 3 additions & 4 deletions pertpy/metadata/_compound.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def annotate_compounds(
adata = adata.copy()

if query_id not in adata.obs.columns:
raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n" f"Please check again. ")
raise ValueError(f"The requested query_id {query_id} is not in `adata.obs`.\n Please check again.")

query_dict = {}
not_matched_identifiers = []
Expand Down Expand Up @@ -84,7 +84,7 @@ def annotate_compounds(
query_df = pd.DataFrame.from_dict(query_dict, orient="index", columns=["pubchem_name", "pubchem_ID", "smiles"])
# Merge and remove duplicate columns
# Column is converted to float after merging due to unmatches
# Convert back to integers
# Convert back to integers afterwards
if query_id_type == "cid":
query_df.pubchem_ID = query_df.pubchem_ID.astype("Int64")
adata.obs = (
Expand Down Expand Up @@ -119,8 +119,7 @@ def lookup(self) -> LookUp:
The LookUp object provides an overview of the metadata to annotate.
Each annotate_{metadata} function has a corresponding lookup function in the LookUp object,
where users can search the reference_id in the metadata and
compare with the query_id in their own data.
where users can search the reference_id in the metadata and compare with the query_id in their own data.
Returns:
Returns a LookUp object specific for compound annotation.
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ de = [
dev = [
"pre-commit",
]
validator = [
"cellxgene-lamin",
"wetlab",
"findrefs"
]
doc = [
"docutils>=0.8,!=0.18.*,!=0.19.*",
"sphinx>=4",
Expand Down

0 comments on commit 31a7c64

Please sign in to comment.