diff --git a/ms2rescore/core.py b/ms2rescore/core.py index fed1f67..25eaf62 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -13,7 +13,11 @@ from ms2rescore.parse_spectra import add_precursor_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence +from ms2rescore.rescoring_engines.mokapot import ( + add_peptide_confidence, + add_psm_confidence, +) +from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -106,6 +110,12 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) psm_list = psm_list[psms_with_features] + if "mumble" in config["psm_generator"]: + # Remove PSMs where matched_ions_pct drops 25% below the original hit + psm_list = filter_mumble_psms(psm_list, threshold=0.75) + # Currently replace the score with the hyperscore for Mumble + # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -211,7 +221,10 @@ def _write_feature_names(feature_names, output_file_root): def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) + (psm_list["qvalue"] <= 0.01) + & (psm_list["rank"] <= max_rank) + & (~psm_list["is_decoy"]) + & ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]]) ).sum() logger.info( f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " @@ -277,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList: ) # Recalculate confidence - new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"]) + new_confidence = lin_psm_data.assign_confidence( + scores=list(psm_list["score"]) + ) # explicity make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot # Add new confidence estimations to PSMList add_psm_confidence(psm_list, new_confidence) diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index ba5492a..68b1b43 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError): """Error while rescoring PSMs.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 52440a5..9eff1b5 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -4,10 +4,11 @@ from ms2rescore.feature_generators.basic import BasicFeatureGenerator from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator +from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator +from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator -from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -16,4 +17,5 @@ "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "ms2": MS2FeatureGenerator, } diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index bc2222e..df00516 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None: charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] + peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list]) has_charge = None not in charge_states has_mz = None not in precursor_mzs and has_charge @@ -74,6 +75,14 @@ def add_features(self, psm_list: PSMList) -> None: if has_score: self._feature_names.append("search_engine_score") + if has_mz and has_charge: + experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) + theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) + mass_error = experimental_mass - theoretical_mass + self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) + + self._feature_names.append("pep_len") + for i, psm in enumerate(psm_list): psm.rescoring_features.update( dict( @@ -81,6 +90,10 @@ def add_features(self, psm_list: PSMList) -> None: **charge_one_hot[i] if has_charge else {}, **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {}, **{"search_engine_score": scores[i]} if has_score else {}, + **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {}, + **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {}, + **{"mass_error": mass_error[i]} if has_mz and has_charge else {}, + **{"pep_len": peptide_lengths[i]}, ) ) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index a0e617d..2ee6b06 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -43,6 +43,7 @@ def __init__( *args, lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, + calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -74,6 +75,7 @@ def __init__( self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size + self.calibration_set = calibration_set self.processes = processes self.deeplc_kwargs = kwargs or {} @@ -123,7 +125,6 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -141,13 +142,13 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - - with contextlib.redirect_stdout( - open(os.devnull, "w", encoding="utf-8") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( @@ -197,7 +198,10 @@ def add_features(self, psm_list: PSMList) -> None: def _get_calibration_psms(self, psm_list: PSMList): """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[~psm_list["is_decoy"]] + psm_list_targets = psm_list[ + ~psm_list["is_decoy"] + & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + ] if self.calibration_set_size: n_psms = self._get_number_of_calibration_psms(psm_list_targets) indices = np.argsort(psm_list_targets["score"]) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index fbe3abb..2b5cb7a 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -161,7 +161,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p identified_psms = psm_list_df[ (psm_list_df["qvalue"] < 0.01) & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep + & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep + & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) ] calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index d6b4a88..7154e6b 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -29,7 +29,11 @@ try: from ionmob import __file__ as ionmob_file from ionmob.preprocess.data import to_tf_dataset_inference - from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs + from ionmob.utilities.chemistry import ( + VARIANT_DICT, + calculate_mz, + reduced_mobility_to_ccs, + ) from ionmob.utilities.tokenization import tokenizer_from_json from ionmob.utilities.utility import get_ccs_shift except ImportError: diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py new file mode 100644 index 0000000..0b226a5 --- /dev/null +++ b/ms2rescore/feature_generators/ms2.py @@ -0,0 +1,302 @@ +""" +MS2-based feature generator. + +""" + +import logging +import re +from collections import defaultdict +from itertools import chain +from typing import List, Optional + +import numpy as np +from psm_utils import PSMList +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from ms2rescore_rs import get_ms2_spectra + +from ms2rescore.exceptions import ParseSpectrumError +from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.utils import infer_spectrum_path + +logger = logging.getLogger(__name__) + +FRAGMENTATION_MODELS = { + "cidhcd": FragmentationModel.CidHcd, + "etd": FragmentationModel.Etd, + "ethcd": FragmentationModel.Ethcd, + "all": FragmentationModel.All, +} +MASS_MODES = { + "average": MassMode.Average, + "monoisotopic": MassMode.Monoisotopic, +} + + +class MS2FeatureGenerator(FeatureGeneratorBase): + """DeepLC retention time-based feature generator.""" + + def __init__( + self, + *args, + spectrum_path: Optional[str] = None, + spectrum_id_pattern: str = "(.*)", + fragmentation_model: str = "All", + mass_mode: str = "Monoisotopic", + processes: int = 1, + calculate_hyperscore: bool = True, # Allow optional ? + **kwargs, + ) -> None: + """ + Generate MS2-based features for rescoring. + + Parameters + ---------- + + Attributes + ---------- + feature_names: list[str] + Names of the features that will be added to the PSMs. + + """ + super().__init__(*args, **kwargs) + + self.spectrum_path = spectrum_path + self.spectrum_id_pattern = spectrum_id_pattern + self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] + self.mass_mode = MASS_MODES[mass_mode.lower()] + self.processes = processes + self.calculate_hyperscore = calculate_hyperscore + + @property + def feature_names(self) -> List[str]: + return [ + "ln_explained_intensity", + "ln_total_intensity", + "ln_explained_intensity_ratio", + "ln_explained_b_ion_ratio", + "ln_explained_y_ion_ratio", + "longest_b_ion_sequence", + "longest_y_ion_sequence", + "matched_b_ions", + "matched_b_ions_pct", + "matched_y_ions", + "matched_y_ions_pct", + "matched_ions_pct", + "hyperscore", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add MS2-derived features to PSMs.""" + + logger.info("Adding MS2-derived features to PSMs.") + psm_dict = psm_list.get_psm_dict() + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + for run, psms in runs.items(): + logger.info( + f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." + ) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + spectrum_filename = infer_spectrum_path(self.spectrum_path, run) + + self._calculate_features(psm_list_run, spectrum_filename) + current_run += 1 + + def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: + """Retrieve annotated spectra for all psms.""" + + spectra = get_ms2_spectra(str(spectrum_file)) + spectrum_id_pattern = re.compile( + self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" + ) + try: + spectra_dict = { + spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum + for spectrum in spectra + } + except AttributeError: + raise ParseSpectrumError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." + ) + + for psm in psm_list: + annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id]) + psm.rescoring_features.update( + self._calculate_spectrum_features(psm, annotated_spectrum) + ) + + if self.calculate_hyperscore: + # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) + b, y = get_by_fragments(annotated_spectrum) + hs = calculate_hyperscore( + n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b + ) + psm.rescoring_features.update({"hyperscore": hs}) + + @staticmethod + def _longest_ion_sequence(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + def _calculate_spectrum_features(self, psm, annotated_spectrum): + + if not annotated_spectrum: + return {} + + features = defaultdict(list) + b_ions_matched = [False] * (len(psm.peptidoform.sequence)) + y_ions_matched = [False] * (len(psm.peptidoform.sequence)) + + pseudo_count = 0.00001 + ion_fragment_regex = re.compile(r"\d+") + + for peak in annotated_spectrum: + features["total_intensity"].append(peak.intensity) + + if peak.annotation: + features["matched_intensity"].append(peak.intensity) + for matched_ion in peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(peak.intensity) + y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(peak.intensity) + b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) + + total_intensity_sum = np.sum(features["total_intensity"]) + matched_intensity_sum = np.sum(features["matched_intensity"]) + b_ion_matched_sum = np.sum(features["b_ion_matched"]) + y_ion_matched_sum = np.sum(features["y_ion_matched"]) + + return { + "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), + "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), + "ln_explained_intensity_ratio": np.log( + matched_intensity_sum / total_intensity_sum + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + b_ion_matched_sum / matched_intensity_sum + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + y_ion_matched_sum / matched_intensity_sum + pseudo_count + ), + "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), + "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + def _annotate_spectrum(self, psm, spectrum): + + spectrum = RawSpectrum( + title=psm.spectrum_id, + num_scans=1, + rt=psm.retention_time, + precursor_charge=psm.get_precursor_charge(), + precursor_mass=spectrum.precursor.mz, + mz_array=spectrum.mz, + intensity_array=spectrum.intensity, + ) + try: + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) + annotated_spectrum = spectrum.annotate( + peptide=linear_peptide, + model=self.fragmentation_model, + mode=self.mass_mode, + ) + except: # noqa E722 + return [] + + return annotated_spectrum.spectrum + + +def factorial(n): + """ + Compute factorial of n using a loop. + Parameters: + n (int): Non-negative integer. + Returns: + int: Factorial of n. + """ + result = 1 + for i in range(1, n + 1): + result *= i + return result + + +def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): + """ + Calculate the hyperscore for a peptide-spectrum match. + Parameters: + n_y (int): Number of matched y-ions. + n_b (int): Number of matched b-ions. + y_ion_intensities (list): Intensities of matched y-ions. + b_ion_intensities (list): Intensities of matched b-ions. + Returns: + float: Calculated hyperscore. + """ + # Calculate the product of y-ion and b-ion intensities + product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 + product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 + + # Calculate factorial using custom function + factorial_y = factorial(n_y) + factorial_b = factorial(n_b) + + # Compute hyperscore + hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b)) + return hyperscore + + +def infer_fragment_identity(frag, allow_ion_types=["b", "y"]): + ion = frag.ion + + is_allowed = False + for allowed_ion_type in allow_ion_types: + if allowed_ion_type in ion: + is_allowed = True + break + + if not is_allowed: + return False + # Radicals + if "·" in ion: + return False + if frag.neutral_loss is not None: + return False + if frag.charge > 2: + return False + + return ion[0] + + +def get_by_fragments(annotated_spectrum): + b_intensities = [] + y_intensities = [] + for peak in annotated_spectrum: + + for fragment in peak.annotation: + + ion_type = infer_fragment_identity(fragment) + + if ion_type == "b": + b_intensities.append(peak.intensity) + if ion_type == "y": + y_intensities.append(peak.intensity) + return b_intensities, y_intensities diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 429e117..57cee3c 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,8 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" +import contextlib import multiprocessing import os -import contextlib from ms2rescore.gui.app import app diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 805be2c..29042e9 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -12,6 +12,7 @@ }, "maxquant": {} }, + "psm_generator": {}, "rescoring_engine": { "mokapot": { "train_fdr": 0.01, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index e9b0379..3a2286c 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -62,6 +62,18 @@ "mokapot": {} } }, + "psm_generator": { + "description": "PSM generator and their configuration.", + "type": "object", + "minProperties": 0, + "maxProperties": 1, + "patternProperties": { + ".*": { "$ref": "#/definitions/psm_generator" }, + "mumble": { + "$ref": "#/definitions/mumble" + } + } + }, "config_file": { "description": "Path to configuration file", "oneOf": [{ "type": "string" }, { "type": "null" }] @@ -184,6 +196,7 @@ "type": "boolean", "default": false } + } } }, @@ -198,6 +211,11 @@ "type": "object", "additionalProperties": true }, + "psm_generator": { + "description": "Additional PSM feature generator configuration", + "type": "object", + "additionalProperties": true + }, "basic": { "$ref": "#/definitions/feature_generator", "description": "Basic feature generator configuration", @@ -278,6 +296,30 @@ } } }, + "mumble": { + "$ref": "#/definitions/psm_generator", + "description": "Mumble PSM generator configuration using Mubmle", + "type": "object", + "additionalProperties": true, + "properties": { + "aa_combinations": { + "description": "Additional amino acid combinations to consider as mass shift", + "type": "integer", + "default": 0 + }, + "fasta_file": { + "description": "Maximum number of modifications per peptide", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": false + }, + "mass_error": { + "description": "mass error in Da", + "type": "number", + "default": 0.2 + } + } + }, + "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c606..cd69c6b 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,6 +4,7 @@ import numpy as np import psm_utils.io +from mumble import PSMHandler from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -90,6 +91,36 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids + # Addition of Modifications for mass shifts in the PSMs with Mumble + if "mumble" in config["psm_generator"]: + logger.debug("Applying modifications for mass shifts using Mumble...") + + # set inlcude original psm to True and include decoy psm to true + if ( + "include_original_psm" not in config["psm_generator"]["mumble"] + or not config["psm_generator"]["mumble"] + ): + config["psm_generator"]["mumble"]["include_original_psm"] = True + if ( + "include_decoy_psm" not in config["psm_generator"]["mumble"] + or not config["psm_generator"]["mumble"] + ): + config["psm_generator"]["mumble"]["include_decoy_psm"] = True + + mumble_config = config["psm_generator"]["mumble"] + + # Check if psm_list[0].rescoring_features is empty or not + if psm_list[0].rescoring_features: + logger.debug("Removing psm_file rescoring features from PSMs...") + # psm_list.remove_rescoring_features() # TODO add this to psm_utils + for psm in psm_list: + psm.rescoring_features = {} + + psm_handler = PSMHandler( + **mumble_config, + ) + psm_list = psm_handler.add_modified_psms(psm_list) + return psm_list diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 192950b..32d856c 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -19,8 +19,8 @@ import logging import subprocess -from typing import Any, Dict, Optional from copy import deepcopy +from typing import Any, Dict, Optional import psm_utils diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 2d2d157..a64d6ec 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -3,9 +3,11 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import numpy as np from ms2rescore.exceptions import MS2RescoreConfigurationError from ms2rescore_rs import is_supported_file_type +from psm_utils import PSMList logger = logging.getLogger(__name__) @@ -91,3 +93,57 @@ def _is_minitdf(spectrum_file: str) -> bool: files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) return len(files) >= 2 + + +def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList: + """ + Filter out mumble PSMs with `matched_ions_pct` lower than the original hit. + + Parameters + ---------- + psm_list : PSMList + List of PSMs to filter + threshold : float, optional + Threshold to lower the maximum matched_ions_pct of the original hit + """ + # Extract relevant fields from the PSM list + original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]]) + spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) + runs = np.array([psm.run for psm in psm_list]) + + # Check if matched_ions_pct exists + if "matched_ions_pct" not in psm_list[0].rescoring_features: + return psm_list + + matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list]) + + # Create unique keys for each (run, spectrum_id) + unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str)) + unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True) + + # Initialize an array to store the `matched_ions_pct` of original hits per group + original_matched_ions_pct = np.full( + len(unique_keys_indices), -np.inf + ) # Default to -inf for groups without original hits + + # Assign the `matched_ions_pct` of original hits to their groups + np.maximum.at( + original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit] + ) + + # lower the maximum with the threshold + original_matched_ions_pct = original_matched_ions_pct * threshold + + # Broadcast the original `matched_ions_pct` back to all PSMs in each group + original_matched_ions_for_all = original_matched_ions_pct[inverse_indices] + + # Determine the filtering condition + keep = np.logical_or( + original_hit, # Always keep original hits + matched_ions + >= original_matched_ions_for_all, # Keep hits with `matched_ions_pct` >= the original + ) + + # Filter PSMs + logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.") + return psm_list[keep] diff --git a/pyproject.toml b/pyproject.toml index 9f873ac..dc47581 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "customtkinter>=5,<6", "deeplc>=3.0,<3.1", "deeplcretrainer", - "im2deep>=0.1.3", + "im2deep>=0.3.0", "jinja2>=3", "lxml>=4.5", "mokapot>=0.10", @@ -54,6 +54,7 @@ dependencies = [ [project.optional-dependencies] ionmob = ["ionmob>=0.2", "tensorflow"] +mumble = ["rustyms>=0.8.3", "mumble>=0.2.0"] dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"] docs = [ "sphinx",