Source code for simba.core.data.preprocessing

import copy
import functools
import json
from itertools import groupby

import numpy as np
import pandas as pd
import requests
from rdkit import Chem

from simba.core.data.spectrum import SpectrumExt



[docs]
class PreprocessingUtils:


[docs]
    @staticmethod
    def is_centroid(intensity):
        return np.all(intensity > 0)



[docs]
    @staticmethod
    def order_by_charge(
        spectra: list[SpectrumExt],
    ) -> dict[int, list[SpectrumExt]]:
        """
        Order spectra by their precursor charge.

        Parameters
        ----------
        spectra : List[SpectrumExt]
            List of SpectrumExt objects to be ordered.

        Returns
        -------
        Dict[int, List[SpectrumExt]]
            A dictionary where keys are precursor charges and values are lists of SpectrumExt objects with that charge.
        """
        spectra_new = copy.deepcopy(spectra)
        # Sort the list based on the precursor_charge
        spectra_new.sort(key=lambda a: a.precursor_charge)
        # Group the elements based on the precursor_charge
        spectra_by_charge = {}
        for key, group in groupby(
            spectra_new, key=lambda a: a.precursor_charge
        ):
            spectra_by_charge[key] = list(group)
        return spectra_by_charge



[docs]
    @staticmethod
    def order_spectra_by_mz(spectra: list[SpectrumExt]) -> list[SpectrumExt]:
        """
        Order spectra by their precursor m/z.

        Parameters
        ----------
        spectra : List[SpectrumExt]
            List of SpectrumExt objects to be ordered.

        Returns
        -------
        List[SpectrumExt]
            A list of SpectrumExt objects ordered by their precursor m/z.
        """
        spectra_by_charge = PreprocessingUtils.order_by_charge(spectra)

        all_spectra = []
        for charge, spectra_group in spectra_by_charge.items():
            # order by mz
            mzs = np.array([s.precursor_mz for s in spectra_group])
            ordered_indexes = np.argsort(mzs)
            sorted_spectra = [spectra_group[r] for r in ordered_indexes]
            all_spectra = all_spectra + sorted_spectra

        return all_spectra


    def _smiles_to_mol(smiles):
        try:
            return Chem.MolFromSmiles(smiles)
        except ArgumentError:
            return None


[docs]
    @functools.lru_cache
    def get_class(
        inchi: str, smiles: str
    ) -> tuple[str | None, str | None, str | None]:
        """
        Get the superclass, class and subclass of a molecule using Classyfire.
        Either InChI or SMILES can be used as input.
        Parameters
        ----------
        inchi : str
            The InChI string of the molecule.
        smiles : str
            The SMILES string of the molecule.
        Returns
        -------
        tuple
            A tuple (superclass, class, subclass) if successful,
            (None, None, None) otherwise.
        """
        clss = (
            PreprocessingUtils._get_class("inchi", inchi)
            if inchi is not None and inchi != "N/A"
            else None
        )
        if clss is None and not pd.isna(smiles) and smiles != "N/A":
            mol = PreprocessingUtils._smiles_to_mol(smiles)
            clss = (
                PreprocessingUtils._get_class(
                    "smiles", Chem.MolToSmiles(mol, False)
                )
                if mol is not None
                else None
            )
        return clss if clss is not None else (None, None, None)


    @functools.lru_cache
    def _get_class(
        mol_type: str, mol_val: str
    ) -> tuple[str, str, str] | None:
        """
        Get the superclass, class and subclass of a molecule using Classyfire.
        Either InChI or SMILES can be used as input.
        Parameters
        ----------
        mol_type : str
            Either "inchi" or "smiles".
        mol_val : str
            The InChI or SMILES string of the molecule.
        Returns
        -------
        tuple or None
            A tuple (superclass, class, subclass) if successful, None otherwise.
        """
        r = requests.get(
            f"https://gnps-structure.ucsd.edu/classyfire?{mol_type}={mol_val}"
        )
        if r.status_code != 200:
            return None
        try:
            classyfire_json = r.json()
            if not classyfire_json:
                return None
            if (
                "superclass" not in classyfire_json
                or "class" not in classyfire_json
                or "subclass" not in classyfire_json
            ):
                return None
            superclass = classyfire_json["superclass"]
            if superclass is not None:
                superclass = superclass["name"]
            clss = classyfire_json["class"]
            if clss is not None:
                clss = clss["name"]
            subclass = classyfire_json["subclass"]
            if subclass is not None:
                subclass = subclass["name"]
            return superclass, clss, subclass
        except json.decoder.JSONDecodeError:
            return None