diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 89603237..ac6e40ae 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -38,7 +38,6 @@ from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException -from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) @@ -184,12 +183,10 @@ def add_features(self, psm_list: PSMList) -> None: f"Running MSĀ²PIP for PSMs from run ({current_run}/{total_runs}) `{run}`..." ) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") try: ms2pip_results = correlate( psms=psm_list_run, - spectrum_file=spectrum_filename, + spectrum_file=run, # Run has already been mapped to a path spectrum_id_pattern=self.spectrum_id_pattern, model=self.model, ms2_tolerance=self.ms2_tolerance, diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 3eb3d19f..25f7fc25 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,7 +1,10 @@ import logging import re -from typing import Dict, Union +from typing import Dict, Union, Optional +from pathlib import Path +from glob import glob +import numpy as np import psm_utils.io from psm_utils import PSMList @@ -52,6 +55,17 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids + # Add filename if all values are none + # if (psm_list["run"] == None).all(): # noqa: E711 + # # Map inferred spectrum paths + spectrum_path_mapping = { + run: infer_spectrum_path(configured_path=config["spectrum_path"], run_name=run) + for run in set(psm_list["run"]) + } + logger.debug(f"Mapped PSM list runs to spectrum file paths: 
def infer_spectrum_path(
    configured_path: Union[str, Path, None],
    run_name: Optional[str] = None,
) -> Union[str, Path]:
    """
    Infer spectrum path from passed path and expected filename (e.g. from PSM file).

    Parameters
    ----------
    configured_path: str, Path, None
        User-defined path to spectrum file or directory containing spectrum file
    run_name : str, optional
        MS run name (stem of spectrum filename), e.g., as expected from PSM file.

    Returns
    -------
    str
        Resolved spectrum file path, formatted with forward slashes
        (``Path.as_posix()``). A path that already ends in a supported extension
        is returned as is; otherwise the file system is searched for an existing
        ``.mgf`` or ``.mzML`` file (case-insensitive) starting with the resolved
        path.

    Raises
    ------
    MS2RescoreConfigurationError
        If no path can be resolved from the passed arguments, if
        ``configured_path`` is neither an existing file nor an existing
        directory, or if no file with a supported extension can be found.

    """
    # If no spectrum path configured, use expected run_name in default dir
    if not configured_path:
        if not run_name:
            raise MS2RescoreConfigurationError(
                "Could not resolve spectrum file name: No spectrum path configured "
                "and no run name in PSM file found."
            )
        resolved_path = Path(".").joinpath(run_name)

    else:
        configured_path = Path(configured_path)
        # If passed path is directory, join with run name
        if configured_path.is_dir():
            if not run_name:
                raise MS2RescoreConfigurationError(
                    "Could not resolve spectrum file name: Spectrum path is directory "
                    "but no run name in PSM file found."
                )
            resolved_path = configured_path.joinpath(run_name)

        # If passed path is file, use that, but warn if basename doesn't match expected
        elif configured_path.is_file():
            if run_name and configured_path.stem != Path(run_name).stem:
                logger.warning(
                    "Passed spectrum path (`%s`) does not match run name found in PSM "
                    "file (`%s`). Continuing with passed spectrum path.",
                    configured_path,
                    run_name,
                )
            resolved_path = configured_path
        else:
            raise MS2RescoreConfigurationError(
                "Configured `spectrum_path` must be `None` or a path to an existing file "
                "or directory. If `None` or path to directory, spectrum run information "
                "should be present in the PSM file."
            )

    # `re` and `glob` operate on strings, not on `Path` objects.
    resolved_str = str(resolved_path)

    # The extension must be checked at the END of the path, so use `search` with
    # a `$` anchor; `re.match` only ever looks at the start of the string.
    supported_extension = re.compile(r"\.(mgf|mzml)$", flags=re.IGNORECASE)

    # Match with file extension if not in resolved_path yet
    if not supported_extension.search(resolved_str):
        # Local import: only the `glob` function itself is imported at module level.
        from glob import escape as glob_escape

        # Escape the prefix so that run names containing glob metacharacters
        # (e.g. `[`, `]`, `?`) are matched literally. Sort, because `glob` returns
        # matches in arbitrary order.
        candidates = sorted(
            filename
            for filename in glob(glob_escape(resolved_str) + "*")
            if supported_extension.search(filename)
        )
        if not candidates:
            raise MS2RescoreConfigurationError(
                f"Resolved spectrum filename '{resolved_path}' does not contain a supported file "
                "extension (mgf or mzml) and could not find any matching existing "
                "files."
            )
        # Prefer an exact `<run>.<ext>` hit over longer names that merely share
        # the prefix (run `run1` must not resolve to `run10.mgf`).
        exact_matches = [
            filename
            for filename in candidates
            if supported_extension.sub("", filename) == resolved_str
        ]
        resolved_str = (exact_matches or candidates)[0]

    return Path(resolved_str).as_posix()
- - """ - # If no spectrum path configured, use expected run_name in default dir - if not configured_path: - if run_name: - resolved_path = os.path.join(".", run_name) - else: - raise MS2RescoreConfigurationError( - "Could not resolve spectrum file name: No spectrum path configured " - "and no run name in PSM file found." - ) - - # If passed path is directory, join with run name - elif os.path.isdir(configured_path): - if run_name: - resolved_path = os.path.join(configured_path, run_name) - else: - raise MS2RescoreConfigurationError( - "Could not resolve spectrum file name: Spectrum path is directory " - "but no run name in PSM file found." - ) - - # If passed path is file, use that, but warn if basename doesn't match expected - elif os.path.isfile(configured_path): - if run_name and Path(configured_path).stem != Path(run_name).stem: - logger.warning( - "Passed spectrum path (`%s`) does not match run name found in PSM " - "file (`%s`). Continuing with passed spectrum path.", - configured_path, - run_name, - ) - resolved_path = configured_path - else: - raise MS2RescoreConfigurationError( - "Configured `spectrum_path` must be `None` or a path to an existing file " - "or directory. If `None` or path to directory, spectrum run information " - "should be present in the PSM file." - ) - - # Match with file extension if not in resolved_path yet - if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE): - for filename in glob(resolved_path + "*"): - if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE): - resolved_path = filename - break - else: - raise MS2RescoreConfigurationError( - "Resolved spectrum filename does not contain a supported file " - "extension (mgf or mzml) and could not find any matching existing " - "files." - ) - - return resolved_path