Report on but do not error when patient has no RNA data #245

Open · wants to merge 15 commits into base: master · showing changes from 13 commits
128 changes: 86 additions & 42 deletions cohorts/cohort.py
@@ -60,6 +60,7 @@
from .variant_filters import no_filter
from .styling import set_styling
from . import variant_filters
from .errors import MissingBamFile, MissingTumorBamFile, MissingRNABamFile, MissingHLAType, MissingVariantFile

logger = get_logger(__name__, level=logging.INFO)

@@ -119,6 +120,19 @@ class Cohort(Collection):
What word to use for "benefit" when plotting.
merge_type : {"union", "intersection"}, optional
Use this method to merge multiple variant sets for a single patient, default "union"
fail_on_missing_bams : bool, optional
Whether to raise an error when a patient is missing a BAM file required for an analysis
(e.g. analyses using expressed variants generally require a tumor RNA BAM file to be present).
If False, the missing data is logged and the patient is skipped. Defaults to True.
fail_on_missing_variants : bool, optional
Whether to raise an error when a patient is missing the variant files required for an analysis
(e.g. analyses of SNVs or neoantigens require a VCF, MAF, or other file containing somatic variants).
If False, the missing data is logged and the patient is skipped. Defaults to False.
fail_on_missing_hla : bool, optional
Whether to raise an error when a patient is missing HLA types
(e.g. predicted neoantigen counts require that HLA types be provided).
If False, the missing data is logged and the patient is skipped. Defaults to False.
"""
def __init__(self,
patients,
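The three `fail_on_missing_*` flags documented above control whether missing data aborts an analysis or is merely logged. A minimal construction sketch, assuming `Cohort` is importable from the top-level `cohorts` package, that `patients` is a list of `Patient` objects built elsewhere, and that all other constructor arguments can be left at their defaults:

```python
from cohorts import Cohort  # assumes the package re-exports Cohort at the top level

# `patients` is a hypothetical list of Patient objects assembled elsewhere.
cohort = Cohort(
    patients=patients,
    merge_type="union",
    # Report on, but do not error when, a patient lacks tumor/RNA BAMs or HLA typing:
    fail_on_missing_bams=False,
    fail_on_missing_hla=False,
    # Still fail hard when somatic variant files are missing entirely:
    fail_on_missing_variants=True)
```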
@@ -144,7 +158,10 @@ def __init__(self,
pageant_coverage_path=None,
pageant_dir_fn=None,
benefit_plot_name="Benefit",
merge_type="union"):
merge_type="union",
fail_on_missing_bams=True,
fail_on_missing_variants=False,
fail_on_missing_hla=False):
Collection.__init__(
self,
elements=patients)
@@ -180,6 +197,9 @@ def __init__(self,
self.benefit_plot_name = benefit_plot_name
self.merge_type = merge_type
self._genome = None
self.fail_on_missing_bams = fail_on_missing_bams
self.fail_on_missing_variants = fail_on_missing_variants
self.fail_on_missing_hla = fail_on_missing_hla

self.verify_id_uniqueness()
self.verify_survival()
@@ -605,12 +625,17 @@ def _load_single_patient_variants(self, patient, filter_fn, use_cache=True, **kw

## get merged variants
logger.debug("... getting merged variants for: {}".format(patient.id))
merged_variants = self._load_single_patient_merged_variants(patient, use_cache=use_cache)
try:
merged_variants = self._load_single_patient_merged_variants(patient, use_cache=use_cache)
except MissingVariantFile as e:
if self.fail_on_missing_variants:
raise
else:
logger.info(str(e))
return None

# Note None here is different from 0. We want to preserve None
if merged_variants is None:
logger.info("Variants did not exist for patient %s" % patient.id)
return None
assert merged_variants is not None, "Bug found - merged_variants should not be None. Please file an issue"

logger.debug("... applying filters to variants for: {}".format(patient.id))
filtered_variants = filter_variants(variant_collection=merged_variants,
@@ -629,6 +654,8 @@ def _load_single_patient_merged_variants(self, patient, use_cache=True):
Use `_load_single_patient_variants` to get filtered variants
"""
logger.debug("loading merged variants for patient {}".format(patient.id))
if len(patient.variants_list) == 0:
raise MissingVariantFile(patient_id=patient.id)
no_variants = False
try:
# get merged-variants from cache
@@ -675,14 +702,14 @@ def _load_single_patient_merged_variants(self, patient, use_cache=True):
else:
merged_variants = self._merge_variant_collections(variant_collections, self.merge_type)
except IOError:
logger.warning("Error reading from variant files for patient {}".format(patient.id))
no_variants = True

# Note that this is the number of variant collections and not the number of
# variants. 0 variants will lead to 0 neoantigens, for example, but 0 variant
# collections will lead to NaN variants and neoantigens.
if no_variants:
print("Variants did not exist for patient %s" % patient.id)
merged_variants = None
raise MissingVariantFile(patient_id=patient.id)
Member
There might be some repeated logic here; so many checks. Maybe file an issue to clean up this function? Don't want to delay the PR.

Member Author
Agreed - this function does need to be cleaned up. But yep, that would belong in a different PR.

# save merged variants to file
if use_cache:
@@ -837,32 +864,38 @@ def _load_single_patient_effects(self, patient, only_nonsynonymous, all_effects,
else:
cached = self.load_from_cache(self.cache_names["effect"], patient.id, cached_file_name)
if cached is not None:
return filter_effects(effect_collection=cached,
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
filtered_effects = filter_effects(effect_collection=cached,
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
else:

effects = variants.effects()

effects = variants.effects()
self.save_to_cache(effects, self.cache_names["all_effect"], patient.id, cached_file_name)

self.save_to_cache(effects, self.cache_names["all_effect"], patient.id, cached_file_name)
effects = EffectCollection(list(effects.top_priority_effect_per_variant().values()))
self.save_to_cache(effects, self.cache_names["effect"], patient.id, cached_file_name)

effects = EffectCollection(list(effects.top_priority_effect_per_variant().values()))
self.save_to_cache(effects, self.cache_names["effect"], patient.id, cached_file_name)
# Always take the top priority effect per variant so we end up with a single
# effect per variant.
nonsynonymous_effects = EffectCollection(
list(effects.drop_silent_and_noncoding().top_priority_effect_per_variant().values()))
self.save_to_cache(nonsynonymous_effects, self.cache_names["nonsynonymous_effect"], patient.id, cached_file_name)

filtered_effects = filter_effects(
effect_collection=(
nonsynonymous_effects if only_nonsynonymous else effects),
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)

# Always take the top priority effect per variant so we end up with a single
# effect per variant.
nonsynonymous_effects = EffectCollection(
list(effects.drop_silent_and_noncoding().top_priority_effect_per_variant().values()))
self.save_to_cache(nonsynonymous_effects, self.cache_names["nonsynonymous_effect"], patient.id, cached_file_name)
if filtered_effects is None:
logger.info("Effects did not exist for patient {}".format(patient.id))

return filter_effects(
effect_collection=(
nonsynonymous_effects if only_nonsynonymous else effects),
variant_collection=variants,
patient=patient,
filter_fn=filter_fn,
**kwargs)
return filtered_effects

def load_kallisto(self):
"""
@@ -967,16 +1000,28 @@ def load_neoantigens(self, patients=None, only_expressed=False,

dfs = {}
for patient in self.iter_patients(patients):
df_epitopes = self._load_single_patient_neoantigens(
patient=patient,
only_expressed=only_expressed,
epitope_lengths=epitope_lengths,
ic50_cutoff=ic50_cutoff,
process_limit=process_limit,
max_file_records=max_file_records,
filter_fn=filter_fn)
if df_epitopes is not None:
dfs[patient.id] = df_epitopes
try:
df_epitopes = self._load_single_patient_neoantigens(
patient=patient,
only_expressed=only_expressed,
epitope_lengths=epitope_lengths,
ic50_cutoff=ic50_cutoff,
process_limit=process_limit,
max_file_records=max_file_records,
filter_fn=filter_fn)
except MissingBamFile as e:
if self.fail_on_missing_bams:
raise
else:
logger.info(str(e))
except MissingHLAType as e:
if self.fail_on_missing_hla:
raise
else:
logger.info(str(e))
else:
if df_epitopes is not None:
dfs[patient.id] = df_epitopes
return dfs
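
On the calling side, the try/except above means `load_neoantigens` degrades gracefully: with a cohort configured as in the earlier sketch (both `fail_on_missing_bams` and `fail_on_missing_hla` set to False), patients raising `MissingBamFile` or `MissingHLAType` are logged and simply left out of the returned dict. An illustrative call (the `only_expressed` value is arbitrary):

```python
# Patients without a tumor RNA BAM or without HLA typing are reported via
# logger.info and omitted; only patients with usable data appear as keys.
neoantigen_dfs = cohort.load_neoantigens(only_expressed=True)
for patient_id, df_epitopes in neoantigen_dfs.items():
    print(patient_id, len(df_epitopes))
```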

def _load_single_patient_neoantigens(self, patient, only_expressed, epitope_lengths,
@@ -991,8 +1036,7 @@ def _load_single_patient_neoantigens(self, patient, only_expressed, epitope_leng
return None

if patient.hla_alleles is None:
print("HLA alleles did not exist for patient %s" % patient.id)
return None
raise MissingHLAType(patient_id=patient.id)

if only_expressed:
cached = self.load_from_cache(self.cache_names["expressed_neoantigen"], patient.id, cached_file_name)
@@ -1098,9 +1142,9 @@ def load_single_patient_isovar(self, patient, variants, epitope_lengths):
import logging
logging.disable(logging.INFO)
if patient.tumor_sample is None:
raise ValueError("Patient %s has no tumor sample" % patient.id)
raise MissingTumorBamFile(patient_id=patient.id)
if patient.tumor_sample.bam_path_rna is None:
raise ValueError("Patient %s has no tumor RNA BAM path" % patient.id)
raise MissingRNABamFile(patient_id=patient.id)
rna_bam_file = AlignmentFile(patient.tumor_sample.bam_path_rna)

# To ensure that e.g. 8-11mers overlap substitutions, we need at least this
43 changes: 43 additions & 0 deletions cohorts/errors.py
@@ -0,0 +1,43 @@

class MissingData(Exception):

def __init__(self, filepath=None, patient_id=None, filetype='Data'):
self.filepath = filepath
self.patient_id = patient_id
self.filetype = filetype

def __str__(self):
if self.patient_id is not None:
patient_str = "Patient {}".format(self.patient_id)
else:
patient_str = "Patient"

if self.filepath is not None:
print_str = "The {} ({}) for {} was not found.".format(self.filetype, repr(self.filepath), patient_str)
else:
print_str = "{} has no {}.".format(patient_str, self.filetype)
return print_str

class MissingBamFile(MissingData):
def __init__(self, filetype='Bamfile', *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingRNABamFile(MissingBamFile):
def __init__(self, filetype="tumor RNA Bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingTumorBamFile(MissingBamFile):
def __init__(self, filetype="tumor sample bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingNormalBamFile(MissingBamFile):
def __init__(self, filetype="normal sample bamfile", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingHLAType(MissingData):
def __init__(self, filetype="HLA alleles", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)

class MissingVariantFile(MissingData):
def __init__(self, filetype="vcf/maf files", *args, **kwargs):
super().__init__(filetype=filetype, *args, **kwargs)
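
The log messages these exceptions produce come entirely from `MissingData.__str__` above. A small sketch of what gets reported when a cohort is configured not to fail (the patient ID and path below are made up):

```python
from cohorts.errors import MissingRNABamFile, MissingVariantFile

# With no filepath, __str__ falls back to the "<patient> has no <filetype>" form.
print(MissingRNABamFile(patient_id="pt-01"))
# -> Patient pt-01 has no tumor RNA Bamfile.

# With a filepath, the message names the file that could not be found.
print(MissingVariantFile(filepath="/data/pt-01.vcf", patient_id="pt-01"))
# -> The vcf/maf files ('/data/pt-01.vcf') for Patient pt-01 was not found.
```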
62 changes: 41 additions & 21 deletions cohorts/varcode_utils.py
@@ -13,6 +13,11 @@
# limitations under the License.

from varcode import EffectCollection, Variant
from .errors import MissingBamFile
import logging

logger = logging.getLogger(__name__)


def genome(variant_collection):
return variant_collection[0].ensembl
@@ -92,15 +97,22 @@ def filter_variants(variant_collection, patient, filter_fn, **kwargs):
Filtered variant collection, with only the variants passing the filter
"""
if filter_fn:
return variant_collection.clone_with_new_elements([
variant
for variant in variant_collection
if filter_fn(FilterableVariant(
try:
return variant_collection.clone_with_new_elements([
variant
for variant in variant_collection
if filter_fn(FilterableVariant(
variant=variant,
variant_collection=variant_collection,
patient=patient,
), **kwargs)
])
])
except MissingBamFile as e:
if patient.cohort.fail_on_missing_bams:
raise
else:
logger.info(str(e))
return None
else:
return variant_collection

@@ -121,27 +133,35 @@ def filter_effects(effect_collection, variant_collection, patient, filter_fn, **
Filtered effect collection, with only the variants passing the filter
"""
if filter_fn:
return EffectCollection([
effect
for effect in effect_collection
if filter_fn(FilterableEffect(
effect=effect,
variant_collection=variant_collection,
patient=patient), **kwargs)])
try:
return EffectCollection([
effect
for effect in effect_collection
if filter_fn(FilterableEffect(
effect=effect,
variant_collection=variant_collection,
patient=patient), **kwargs)])
except MissingBamFile as e:
logger.warning(str(e))
return None
else:
return effect_collection

def filter_neoantigens(neoantigens_df, variant_collection, patient, filter_fn):
if filter_fn:
filter_mask = neoantigens_df.apply(
lambda row: filter_fn(
FilterableNeoantigen(neoantigen_row=row,
variant_collection=variant_collection,
patient=patient)),
axis=1,
# reduce ensures that an empty result is a Series vs. a DataFrame
reduce=True)
return neoantigens_df[filter_mask]
try:
filter_mask = neoantigens_df.apply(
lambda row: filter_fn(
FilterableNeoantigen(neoantigen_row=row,
variant_collection=variant_collection,
patient=patient)),
axis=1,
# reduce ensures that an empty result is a Series vs. a DataFrame
reduce=True)
return neoantigens_df[filter_mask]
except MissingBamFile as e:
logger.warning(str(e))
return None
else:
return neoantigens_df

22 changes: 17 additions & 5 deletions cohorts/variant_filters.py
@@ -19,6 +19,7 @@
from varcode.common import memoize
import pandas as pd
from os import path
from .errors import MissingBamFile

logger = get_logger(__name__)

@@ -60,10 +61,18 @@ def expressed_variant_set(cohort, patient, variant_collection):
# TODO: we're currently using the same isovar cache that we use for expressed
# neoantigen prediction; so we pass in the same epitope lengths.
# This is hacky and should be addressed.
df_isovar = patient.cohort.load_single_patient_isovar(
patient=patient,
variants=variant_collection,
epitope_lengths=[8, 9, 10, 11])
try:
df_isovar = patient.cohort.load_single_patient_isovar(
patient=patient,
variants=variant_collection,
epitope_lengths=[8, 9, 10, 11])
except MissingBamFile as e:
if cohort.fail_on_missing_bams:
raise
else:
logger.info(str(e))
return None

expressed_variant_set = set()
for _, row in df_isovar.iterrows():
expressed_variant = Variant(contig=row["chr"],
@@ -79,7 +88,10 @@ def variant_expressed_filter(filterable_variant, **kwargs):
cohort=filterable_variant.patient.cohort,
patient=filterable_variant.patient,
variant_collection=filterable_variant.variant_collection)
return filterable_variant.variant in expressed_variants
if expressed_variants is None:
return None
else:
return filterable_variant.variant in expressed_variants

def effect_expressed_filter(filterable_effect, **kwargs):
return variant_expressed_filter(filterable_effect, **kwargs)