Report on but do not error when patient has no RNA data #245

Open · wants to merge 15 commits into base: master · showing changes from 13 commits
128 changes: 86 additions & 42 deletions cohorts/cohort.py
@@ -60,6 +60,7 @@
from .variant_filters import no_filter
from .styling import set_styling
from . import variant_filters
from .errors import MissingBamFile, MissingTumorBamFile, MissingRNABamFile, MissingHLAType, MissingVariantFile

logger = get_logger(__name__, level=logging.INFO)

@@ -119,6 +120,19 @@ class Cohort(Collection):
What word to use for "benefit" when plotting.
merge_type : {"union", "intersection"}, optional
Use this method to merge multiple variant sets for a single patient, default "union"
fail_on_missing_bams : bool, optional
Whether to raise an error when a patient is missing a BAM file required for an analysis
(e.g. analyses using expressed variants generally require a tumor RNA BAM file to be present).
If False, the missing data is logged and the patient is skipped. Defaults to True.
fail_on_missing_variants : bool, optional
Whether to raise an error when a patient is missing the variant files required for an analysis
(e.g. analyses of SNVs or neoantigens require a VCF, MAF, or other file containing somatic variants).
If False, the missing data is logged and the patient is skipped. Defaults to False.
fail_on_missing_hla : bool, optional
Whether to raise an error when a patient is missing HLA types
(e.g. predicted neoantigen counts require that HLA types be provided).
If False, the missing data is logged and the patient is skipped. Defaults to False.
"""
def __init__(self,
patients,
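The three `fail_on_missing_*` flags documented above control whether missing data aborts an analysis or is merely logged. A minimal construction sketch, assuming `Cohort` is importable from the top-level `cohorts` package, that `patients` is a list of `Patient` objects built elsewhere, and that all other constructor arguments can be left at their defaults:

```python
from cohorts import Cohort  # assumes the package re-exports Cohort at the top level

# `patients` is a hypothetical list of Patient objects assembled elsewhere.
cohort = Cohort(
    patients=patients,
    merge_type="union",
    # Report on, but do not error when, a patient lacks tumor/RNA BAMs or HLA typing:
    fail_on_missing_bams=False,
    fail_on_missing_hla=False,
    # Still fail hard when somatic variant files are missing entirely:
    fail_on_missing_variants=True)
```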
@@ -144,7 +158,10 @@ def __init__(self,
pageant_coverage_path=None,
pageant_dir_fn=None,
benefit_plot_name="Benefit",
merge_type="union"):
merge_type="union",
fail_on_missing_bams=True,
fail_on_missing_variants=False,
fail_on_missing_hla=False):
Collection.__init__(
self,
elements=patients)
@@ -180,6 +197,9 @@ def __init__(self,
self.benefit_plot_name = benefit_plot_name
self.merge_type = merge_type
self._genome = None
self.fail_on_missing_bams = fail_on_missing_bams
self.fail_on_missing_variants = fail_on_missing_variants
self.fail_on_missing_hla = fail_on_missing_hla

self.verify_id_uniqueness()
self.verify_survival()
@@ -605,12 +625,17 @@ def _load_single_patient_variants(self, patient, filter_fn, use_cache=True, **kw

## get merged variants
logger.debug("... getting merged variants for: {}".format(patient.id))
merged_variants = self._load_single_patient_merged_variants(patient, use_cache=use_cache)
try:
merged_variants = self._load_single_patient_merged_variants(patient, use_cache=use_cache)
except MissingVariantFile as e:
if self.fail_on_missing_variants:
raise
else:
logger.info(str(e))
return None

# Note None here is different from 0. We want to preserve None
if merged_variants is None:
logger.info("Variants did not exist for patient %s" % patient.id)
return None
assert merged_variants is not None, "Bug found - merged_variants should not be None. Please file an issue"

logger.debug("... applying filters to variants for: {}".format(patient.id))
filtered_variants = filter_variants(variant_collection=merged_variants,
@@ -629,6 +654,8 @@ def _load_single_patient_merged_variants(self, patient, use_cache=True):
Use `_load_single_patient_variants` to get filtered variants
"""
logger.debug("loading merged variants for patient {}".format(patient.id))
if len(patient.variants_list) == 0:
raise MissingVariantFile(patient_id=patient.id)
no_variants = False
try:
# get merged-variants from cache
@@ -675,14 +702,14 @@ def _load_single_patient_merged_variants(self, patient, use_cache=True):
else:
merged_variants = self._merge_variant_collections(variant_collections, self.merge_type)
except IOError:
logger.warning("Error reading from variant files for patient {}".format(patient.id))
no_variants = True

# Note that this is the number of variant collections and not the number of
# variants. 0 variants will lead to 0 neoantigens, for example, but 0 variant
# collections will lead to NaN variants and neoantigens.
if no_variants:
print("Variants did not exist for patient %s" % patient.id)
merged_variants = None
raise MissingVariantFile(patient_id=patient.id)
Member
There might be some repeated logic here; so many checks. Maybe file an issue to clean up this function? Don't want to delay the PR.

Member Author
Agreed - this function does need to be cleaned up. But yep, that would belong in a different PR.

# save merged variants to file
if use_cache:
@@ -837,32 +864,38 @@ def _load_single_patient_effects(self, patient, only_nonsynonymous, all_effects,
else:
cached = self.load_from_cache(self.cache_names["effect"], patient.id, cached_file_name)
if cached is not None:
return filter_effects(effect_collection=cached,
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
filtered_effects = filter_effects(effect_collection=cached,
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
else:

effects = variants.effects()

effects = variants.effects()
self.save_to_cache(effects, self.cache_names["all_effect"], patient.id, cached_file_name)

self.save_to_cache(effects, self.cache_names["all_effect"], patient.id, cached_file_name)
effects = EffectCollection(list(effects.top_priority_effect_per_variant().values()))
self.save_to_cache(effects, self.cache_names["effect"], patient.id, cached_file_name)

effects = EffectCollection(list(effects.top_priority_effect_per_variant().values()))
self.save_to_cache(effects, self.cache_names["effect"], patient.id, cached_file_name)
# Always take the top priority effect per variant so we end up with a single
# effect per variant.
nonsynonymous_effects = EffectCollection(
list(effects.drop_silent_and_noncoding().top_priority_effect_per_variant().values()))
self.save_to_cache(nonsynonymous_effects, self.cache_names["nonsynonymous_effect"], patient.id, cached_file_name)

filtered_effects = filter_effects(
effect_collection=(
nonsynonymous_effects if only_nonsynonymous else effects),
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)

# Always take the top priority effect per variant so we end up with a single
# effect per variant.
nonsynonymous_effects = EffectCollection(
list(effects.drop_silent_and_noncoding().top_priority_effect_per_variant().values()))
self.save_to_cache(nonsynonymous_effects, self.cache_names["nonsynonymous_effect"], patient.id, cached_file_name)
if filtered_effects is None:
logger.info("Effects did not exist for patient {}".format(patient.id))

return filter_effects(
effect_collection=(
nonsynonymous_effects if only_nonsynonymous else effects),
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
return filtered_effects

def load_kallisto(self):
"""
@@ -967,16 +1000,28 @@ def load_neoantigens(self, patients=None, only_expressed=False,

dfs = {}
for patient in self.iter_patients(patients):
df_epitopes = self._load_single_patient_neoantigens(
patient=patient,
only_expressed=only_expressed,
epitope_lengths=epitope_lengths,
ic50_cutoff=ic50_cutoff,
process_limit=process_limit,
max_file_records=max_file_records,
filter_fn=filter_fn)
if df_epitopes is not None:
dfs[patient.id] = df_epitopes
try:
df_epitopes = self._load_single_patient_neoantigens(
patient=patient,
only_expressed=only_expressed,
epitope_lengths=epitope_lengths,
ic50_cutoff=ic50_cutoff,
process_limit=process_limit,
max_file_records=max_file_records,
filter_fn=filter_fn)
except MissingBamFile as e:
if self.fail_on_missing_bams:
raise
else:
logger.info(str(e))
except MissingHLAType as e:
if self.fail_on_missing_hla:
raise
else:
logger.info(str(e))
else:
if df_epitopes is not None:
dfs[patient.id] = df_epitopes
return dfs
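
On the calling side, the try/except above means `load_neoantigens` degrades gracefully: with a cohort configured as in the earlier sketch (both `fail_on_missing_bams` and `fail_on_missing_hla` set to False), patients raising `MissingBamFile` or `MissingHLAType` are logged and simply left out of the returned dict. An illustrative call (the `only_expressed` value is arbitrary):

```python
# Patients without a tumor RNA BAM or without HLA typing are reported via
# logger.info and omitted; only patients with usable data appear as keys.
neoantigen_dfs = cohort.load_neoantigens(only_expressed=True)
for patient_id, df_epitopes in neoantigen_dfs.items():
    print(patient_id, len(df_epitopes))
```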

def _load_single_patient_neoantigens(self, patient, only_expressed, epitope_lengths,
@@ -991,8 +1036,7 @@ def _load_single_patient_neoantigens(self, patient, only_expressed, epitope_leng
return None

if patient.hla_alleles is None:
print("HLA alleles did not exist for patient %s" % patient.id)
return None
raise MissingHLAType(patient_id=patient.id)

if only_expressed:
cached = self.load_from_cache(self.cache_names["expressed_neoantigen"], patient.id, cached_file_name)
@@ -1098,9 +1142,9 @@ def load_single_patient_isovar(self, patient, variants, epitope_lengths):
import logging
logging.disable(logging.INFO)
if patient.tumor_sample is None:
raise ValueError("Patient %s has no tumor sample" % patient.id)
raise MissingTumorBamFile(patient_id=patient.id)
if patient.tumor_sample.bam_path_rna is None:
raise ValueError("Patient %s has no tumor RNA BAM path" % patient.id)
raise MissingRNABamFile(patient_id=patient.id)
rna_bam_file = AlignmentFile(patient.tumor_sample.bam_path_rna)

# To ensure that e.g. 8-11mers overlap substitutions, we need at least this
43 changes: 43 additions & 0 deletions cohorts/errors.py
@@ -0,0 +1,43 @@

class MissingData(Exception):

def __init__(self, filepath=None, patient_id=None, filetype='Data'):
self.filepath = filepath
self.patient_id = patient_id
self.filetype = filetype

def __str__(self):
if self.patient_id is not None:
patient_str = "Patient {}".format(self.patient_id)
else:
patient_str = "Patient"

if self.filepath is not None:
print_str = "The {} ({}) for {} was not found.".format(self.filetype, repr(self.filepath), patient_str)
else:
print_str = "{} has no {}.".format(patient_str, self.filetype)
return print_str

class MissingBamFile(MissingData):
def __init__(self, filetype='Bamfile', *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingRNABamFile(MissingBamFile):
def __init__(self, filetype="tumor RNA Bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingTumorBamFile(MissingBamFile):
def __init__(self, filetype="tumor sample bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingNormalBamFile(MissingBamFile):
def __init__(self, filetype="normal sample bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingHLAType(MissingData):
def __init__(self, filetype="HLA alleles", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingVariantFile(MissingData):
def __init__(self, filetype="vcf/maf files", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)
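
The log messages these exceptions produce come entirely from `MissingData.__str__` above. A small sketch of what gets reported when a cohort is configured not to fail (the patient ID and path below are made up):

```python
from cohorts.errors import MissingRNABamFile, MissingVariantFile

# With no filepath, __str__ falls back to the "<patient> has no <filetype>" form.
print(MissingRNABamFile(patient_id="pt-01"))
# -> Patient pt-01 has no tumor RNA Bamfile.

# With a filepath, the message names the file that could not be found.
print(MissingVariantFile(filepath="/data/pt-01.vcf", patient_id="pt-01"))
# -> The vcf/maf files ('/data/pt-01.vcf') for Patient pt-01 was not found.
```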
62 changes: 41 additions & 21 deletions cohorts/varcode_utils.py
@@ -13,6 +13,11 @@
# limitations under the License.

from varcode import EffectCollection, Variant
from .errors import MissingBamFile
import logging

logger = logging.getLogger(__name__)


def genome(variant_collection):
return variant_collection[0].ensembl
@@ -92,15 +97,22 @@ def filter_variants(variant_collection, patient, filter_fn, **kwargs):
Filtered variant collection, with only the variants passing the filter
"""
if filter_fn:
return variant_collection.clone_with_new_elements([
variant
for variant in variant_collection
if filter_fn(FilterableVariant(
try:
return variant_collection.clone_with_new_elements([
variant
for variant in variant_collection
if filter_fn(FilterableVariant(
variant=variant,
variant_collection=variant_collection,
patient=patient,
), **kwargs)
])
])
except MissingBamFile as e:
if patient.cohort.fail_on_missing_bams:
raise
else:
logger.info(str(e))
return None
else:
return variant_collection

@@ -121,27 +133,35 @@ def filter_effects(effect_collection, variant_collection, patient, filter_fn, **
Filtered effect collection, with only the variants passing the filter
"""
if filter_fn:
return EffectCollection([
effect
for effect in effect_collection
if filter_fn(FilterableEffect(
effect=effect,
variant_collection=variant_collection,
patient=patient), **kwargs)])
try:
return EffectCollection([
effect
for effect in effect_collection
if filter_fn(FilterableEffect(
effect=effect,
variant_collection=variant_collection,
patient=patient), **kwargs)])
except MissingBamFile as e:
logger.warning(str(e))
return None
else:
return effect_collection

def filter_neoantigens(neoantigens_df, variant_collection, patient, filter_fn):
if filter_fn:
filter_mask = neoantigens_df.apply(
lambda row: filter_fn(
FilterableNeoantigen(neoantigen_row=row,
variant_collection=variant_collection,
patient=patient)),
axis=1,
# reduce ensures that an empty result is a Series vs. a DataFrame
reduce=True)
return neoantigens_df[filter_mask]
try:
filter_mask = neoantigens_df.apply(
lambda row: filter_fn(
FilterableNeoantigen(neoantigen_row=row,
variant_collection=variant_collection,
patient=patient)),
axis=1,
# reduce ensures that an empty result is a Series vs. a DataFrame
reduce=True)
return neoantigens_df[filter_mask]
except MissingBamFile as e:
logger.warning(str(e))
return None
else:
return neoantigens_df

22 changes: 17 additions & 5 deletions cohorts/variant_filters.py
@@ -19,6 +19,7 @@
from varcode.common import memoize
import pandas as pd
from os import path
from .errors import MissingBamFile

logger = get_logger(__name__)

@@ -60,10 +61,18 @@ def expressed_variant_set(cohort, patient, variant_collection):
# TODO: we're currently using the same isovar cache that we use for expressed
# neoantigen prediction; so we pass in the same epitope lengths.
# This is hacky and should be addressed.
df_isovar = patient.cohort.load_single_patient_isovar(
patient=patient,
variants=variant_collection,
epitope_lengths=[8, 9, 10, 11])
try:
df_isovar = patient.cohort.load_single_patient_isovar(
patient=patient,
variants=variant_collection,
epitope_lengths=[8, 9, 10, 11])
except MissingBamFile as e:
if cohort.fail_on_missing_bams:
raise
else:
logger.info(str(e))
return None

expressed_variant_set = set()
for _, row in df_isovar.iterrows():
expressed_variant = Variant(contig=row["chr"],
@@ -79,7 +88,10 @@ def variant_expressed_filter(filterable_variant, **kwargs):
cohort=filterable_variant.patient.cohort,
patient=filterable_variant.patient,
variant_collection=filterable_variant.variant_collection)
return filterable_variant.variant in expressed_variants
if expressed_variants is None:
return None
else:
return filterable_variant.variant in expressed_variants

def effect_expressed_filter(filterable_effect, **kwargs):
return variant_expressed_filter(filterable_effect, **kwargs)