import copy
import functools
import json
from itertools import groupby
import numpy as np
import pandas as pd
import requests
from rdkit import Chem
from simba.core.data.spectrum import SpectrumExt
[docs]
class PreprocessingUtils:
[docs]
@staticmethod
def is_centroid(intensity):
return np.all(intensity > 0)
[docs]
@staticmethod
def order_by_charge(
spectra: list[SpectrumExt],
) -> dict[int, list[SpectrumExt]]:
"""
Order spectra by their precursor charge.
Parameters
----------
spectra : List[SpectrumExt]
List of SpectrumExt objects to be ordered.
Returns
-------
Dict[int, List[SpectrumExt]]
A dictionary where keys are precursor charges and values are lists of SpectrumExt objects with that charge.
"""
spectra_new = copy.deepcopy(spectra)
# Sort the list based on the precursor_charge
spectra_new.sort(key=lambda a: a.precursor_charge)
# Group the elements based on the precursor_charge
spectra_by_charge = {}
for key, group in groupby(
spectra_new, key=lambda a: a.precursor_charge
):
spectra_by_charge[key] = list(group)
return spectra_by_charge
[docs]
@staticmethod
def order_spectra_by_mz(spectra: list[SpectrumExt]) -> list[SpectrumExt]:
"""
Order spectra by their precursor m/z.
Parameters
----------
spectra : List[SpectrumExt]
List of SpectrumExt objects to be ordered.
Returns
-------
List[SpectrumExt]
A list of SpectrumExt objects ordered by their precursor m/z.
"""
spectra_by_charge = PreprocessingUtils.order_by_charge(spectra)
all_spectra = []
for charge, spectra_group in spectra_by_charge.items():
# order by mz
mzs = np.array([s.precursor_mz for s in spectra_group])
ordered_indexes = np.argsort(mzs)
sorted_spectra = [spectra_group[r] for r in ordered_indexes]
all_spectra = all_spectra + sorted_spectra
return all_spectra
def _smiles_to_mol(smiles):
try:
return Chem.MolFromSmiles(smiles)
except ArgumentError:
return None
[docs]
@functools.lru_cache
def get_class(
inchi: str, smiles: str
) -> tuple[str | None, str | None, str | None]:
"""
Get the superclass, class and subclass of a molecule using Classyfire.
Either InChI or SMILES can be used as input.
Parameters
----------
inchi : str
The InChI string of the molecule.
smiles : str
The SMILES string of the molecule.
Returns
-------
tuple
A tuple (superclass, class, subclass) if successful,
(None, None, None) otherwise.
"""
clss = (
PreprocessingUtils._get_class("inchi", inchi)
if inchi is not None and inchi != "N/A"
else None
)
if clss is None and not pd.isna(smiles) and smiles != "N/A":
mol = PreprocessingUtils._smiles_to_mol(smiles)
clss = (
PreprocessingUtils._get_class(
"smiles", Chem.MolToSmiles(mol, False)
)
if mol is not None
else None
)
return clss if clss is not None else (None, None, None)
@functools.lru_cache
def _get_class(
mol_type: str, mol_val: str
) -> tuple[str, str, str] | None:
"""
Get the superclass, class and subclass of a molecule using Classyfire.
Either InChI or SMILES can be used as input.
Parameters
----------
mol_type : str
Either "inchi" or "smiles".
mol_val : str
The InChI or SMILES string of the molecule.
Returns
-------
tuple or None
A tuple (superclass, class, subclass) if successful, None otherwise.
"""
r = requests.get(
f"https://gnps-structure.ucsd.edu/classyfire?{mol_type}={mol_val}"
)
if r.status_code != 200:
return None
try:
classyfire_json = r.json()
if not classyfire_json:
return None
if (
"superclass" not in classyfire_json
or "class" not in classyfire_json
or "subclass" not in classyfire_json
):
return None
superclass = classyfire_json["superclass"]
if superclass is not None:
superclass = superclass["name"]
clss = classyfire_json["class"]
if clss is not None:
clss = clss["name"]
subclass = classyfire_json["subclass"]
if subclass is not None:
subclass = subclass["name"]
return superclass, clss, subclass
except json.decoder.JSONDecodeError:
return None