Source code for app.modules.all_descriptors

from __future__ import annotations

from typing import Union

from rdkit.Chem import Descriptors
from rdkit.Chem import Lipinski
from rdkit.Chem import QED
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdmolops

from app.modules.toolkits.cdk_wrapper import cdk_base
from app.modules.toolkits.cdk_wrapper import get_aromatic_ring_count
from app.modules.toolkits.cdk_wrapper import get_CDK_SDG
from app.modules.toolkits.cdk_wrapper import get_tanimoto_similarity_CDK
from app.modules.toolkits.cdk_wrapper import get_vander_waals_volume
from app.modules.toolkits.cdk_wrapper import JClass
from app.modules.toolkits.helpers import parse_input
from app.modules.toolkits.rdkit_wrapper import check_RO5_violations
from app.modules.toolkits.rdkit_wrapper import get_tanimoto_similarity_rdkit



[docs]
def get_all_rdkit_descriptors(molecule: any) -> Union[tuple, str]:
    """Calculate a selected set of molecular descriptors using RDKit.

    This function takes an input SMILES string and calculates various molecular descriptors
    using RDKit.

    Args:
        molecule (Chem.mol): RDKit molecule object.

    Returns:
        tuple: A tuple containing calculated molecular descriptors. If an error occurs during processing, an error message is returned.
    """

    if molecule:
        AtomC = rdMolDescriptors.CalcNumAtoms(molecule)
        BondC = molecule.GetNumBonds()
        HeavyAtomsC = rdMolDescriptors.CalcNumHeavyAtoms(molecule)
        MolWt = "%.2f" % Descriptors.MolWt(molecule)
        ExactMolWt = "%.5f" % Descriptors.ExactMolWt(molecule)
        ALogP = "%.2f" % QED.properties(molecule).ALOGP
        NumRotatableBonds = rdMolDescriptors.CalcNumRotatableBonds(molecule)
        PSA = "%.2f" % rdMolDescriptors.CalcTPSA(molecule)
        HBA = Descriptors.NumHAcceptors(molecule)
        HBD = Descriptors.NumHDonors(molecule)
        Lipinski_HBA = Lipinski.NumHAcceptors(molecule)
        Lipinski_HBD = Lipinski.NumHDonors(molecule)
        Ro5Violations = check_RO5_violations(molecule)
        AromaticRings = rdMolDescriptors.CalcNumAromaticRings(molecule)
        QEDWeighted = "%.2f" % QED.qed(molecule)
        FormalCharge = rdmolops.GetFormalCharge(molecule)
        fsp3 = "%.2f" % rdMolDescriptors.CalcFractionCSP3(molecule)
        NumRings = rdMolDescriptors.CalcNumRings(molecule)
        VABCVolume = None
        return (
            AtomC,
            BondC,
            HeavyAtomsC,
            float(MolWt),
            float(ExactMolWt),
            float(ALogP),
            NumRotatableBonds,
            float(PSA),
            HBA,
            HBD,
            Lipinski_HBA,
            Lipinski_HBD,
            Ro5Violations,
            AromaticRings,
            float(QEDWeighted),
            FormalCharge,
            float(fsp3),
            NumRings,
            VABCVolume,
        )
    else:
        return "Error reading SMILES string, check again."




[docs]
def get_all_cdk_descriptors(molecule: any) -> Union[tuple, str]:
    """Calculate a set of molecular descriptors using the CDK.

    This function takes a SMILES string as input and calculates various molecular descriptors
    using the CDK. The calculated descriptors are returned as a tuple.

    Args:
        molecule (IAtomContainer): CDK molecule object.

    Returns:
        tuple: A tuple containing calculated molecular descriptors. If an error occurs during processing, an error message is returned.
    """
    SDGMol = get_CDK_SDG(molecule)
    if SDGMol:
        AtomCountDescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.AtomCountDescriptor")()
            .calculate(SDGMol)
            .getValue()
        )
        BondCountDescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.BondCountDescriptor")()
            .calculate(SDGMol)
            .getValue()
        )
        HeavyAtomsC = SDGMol.getAtomCount()
        WeightDescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.WeightDescriptor")()
            .calculate(SDGMol)
            .getValue()
            .toString()
        )
        TotalExactMass = JClass(
            cdk_base + ".tools.manipulator.AtomContainerManipulator",
        ).getTotalExactMass(SDGMol)
        ALogP = (
            JClass(cdk_base + ".qsar.descriptors.molecular.ALOGPDescriptor")()
            .calculate(SDGMol)
            .getValue()
        )
        NumRotatableBonds = (
            JClass(
                cdk_base + ".qsar.descriptors.molecular.RotatableBondsCountDescriptor",
            )()
            .calculate(SDGMol)
            .getValue()
        )
        TPSADescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.TPSADescriptor")()
            .calculate(SDGMol)
            .getValue()
            .toString()
        )
        HBondAcceptorCountDescriptor = (
            JClass(
                cdk_base + ".qsar.descriptors.molecular.HBondAcceptorCountDescriptor",
            )()
            .calculate(SDGMol)
            .getValue()
        )
        HBondDonorCountDescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.HBondDonorCountDescriptor")()
            .calculate(SDGMol)
            .getValue()
        )
        RuleOfFiveDescriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.RuleOfFiveDescriptor")()
            .calculate(SDGMol)
            .getValue()
        )
        AromaticRings = get_aromatic_ring_count(SDGMol)
        QEDWeighted = None
        FormalCharge = JClass(
            cdk_base + ".tools.manipulator.AtomContainerManipulator",
        ).getTotalFormalCharge(SDGMol)
        FractionalCSP3Descriptor = (
            JClass(cdk_base + ".qsar.descriptors.molecular.FractionalCSP3Descriptor")()
            .calculate(SDGMol)
            .getValue()
            .toString()
        )
        NumRings = (
            JClass(
                cdk_base + ".graph.Cycles",
            )
            .mcb(SDGMol)
            .numberOfCycles()
        )
        VABCVolume = get_vander_waals_volume(SDGMol)

        return (
            int(str(AtomCountDescriptor)),
            int(str(BondCountDescriptor)),
            HeavyAtomsC,
            float("{:.2f}".format(float(str(WeightDescriptor)))),
            float("{:.5f}".format(float(str(TotalExactMass)))),
            float("{:.2f}".format(float(str(ALogP).split(",")[0]))),
            int(str(NumRotatableBonds)),
            float("{:.2f}".format(float(str(TPSADescriptor)))),
            int(str(HBondAcceptorCountDescriptor)),
            int(str(HBondDonorCountDescriptor)),
            int(str(HBondAcceptorCountDescriptor)),
            int(str(HBondDonorCountDescriptor)),
            int(str(RuleOfFiveDescriptor)),
            AromaticRings,
            str(QEDWeighted),
            FormalCharge,
            float("{:.2f}".format(float(str(FractionalCSP3Descriptor)))),
            NumRings,
            float(str(VABCVolume)),
        )
    else:
        return "Error reading SMILES string, check again."




[docs]
def get_cdk_rdkit_combined_descriptors(
    smiles: str,
) -> Union[dict, str]:
    """Calculate a selected set of molecular descriptors using CDK and RDKit.

    for a given SMILES string.

    Args:
        smiles (str): A SMILES string representing a chemical compound.

    Returns:
        Union[Dict[str, Tuple[float, float]], str]:
            - If successful, a dictionary containing calculated descriptors. Each descriptor is a key
              mapped to a tuple of its values calculated by RDKit and CDK.
            - If unsuccessful due to descriptor calculation errors, returns an error message as a string.
    """
    # Calculate RDKit and CDK descriptors
    rdkitMol = parse_input(smiles, "rdkit", False)
    rdkit_descriptors = get_all_rdkit_descriptors(rdkitMol)
    cdkMol = parse_input(smiles, "cdk", False)
    cdk_descriptors = get_all_cdk_descriptors(cdkMol)

    # List of descriptors to calculate
    all_descriptors = (
        "Atom count",
        "Bond count",
        "Heavy atom count",
        "Molecular weight",
        "Exact molecular weight",
        "Calculated LogP",
        "Rotatable bond count",
        "Topological polar surface area",
        "Hydrogen bond acceptors",
        "Hydrogen bond donors",
        "Hydrogen bond acceptors (Lipinski)",
        "Hydrogen bond donors (Lipinski)",
        "Lipinski's rule of five violations",
        "Aromatic rings count",
        "QED drug likeliness",
        "Formal Charge",
        "FractionCSP3",
        "Number of Minimal Rings",
        "Van der Waals Volume",
    )

    if len(all_descriptors) == len(rdkit_descriptors) == len(cdk_descriptors):
        combined_dict = {
            descriptor: (rdkit_desc, cdk_desc)
            for descriptor, rdkit_desc, cdk_desc in zip(
                all_descriptors,
                rdkit_descriptors,
                cdk_descriptors,
            )
        }
        return combined_dict
    else:
        return "Error: Dictionary length is invalid"




[docs]
def get_table(tanimoto_values: list) -> str:
    """Convert a list of Tanimoto similarity values into an HTML table.

    Args:
        tanimoto_values (list): A list of lists containing Tanimoto similarity values.

    Returns:
        str: HTML representation of the table.
    """
    table_html = "<table>"

    # Add header row with column indexes
    table_html += "<tr><th></th>"
    for j in range(len(tanimoto_values[0])):
        table_html += f"<th>{j}</th>"
    table_html += "</tr>"

    # Add data rows with row indexes
    for i, row in enumerate(tanimoto_values):
        table_html += "<tr>"
        table_html += f"<td>{i}</td>"
        for cell in row:
            table_html += f"<td>{cell}</td>"
        table_html += "</tr>"

    table_html += "</table>"
    return table_html




[docs]
def get_tanimoto_similarity(smileslist: str, toolkit: str = "cdk") -> list:
    """Calculate the Tanimoto similarity index between pairs of SMILES strings.

    This function takes a list of SMILES strings, splits them, and calculates
    the Tanimoto similarity index between every pair of SMILES strings.

    Args:
        smileslist (str): A comma-separated list of SMILES strings.
        toolkit (str, optional): The toolkit to use for calculating similarity.Can be "cdk" (Chemistry Development Kit) or "rdkit" (RDKit). Defaults to "cdk".

    Returns:
        list: A matrix containing Tanimoto similarity scores.Rows and columns correspond to SMILES strings in the input list.

    Raises:
        ValueError: If an unsupported toolkit is provided.
    """

    # Parse comma-separated list of SMILES into a list of SMILES strings
    smiles_list = smileslist.split(",")

    # Create an empty matrix to store similarity scores
    matrix = [[0.0] * len(smiles_list) for _ in range(len(smiles_list))]

    # Populate the matrix with similarity scores
    for i in range(len(smiles_list)):
        for j in range(len(smiles_list)):
            if toolkit == "rdkit":
                mol1 = parse_input(smiles_list[i], "rdkit", False)
                mol2 = parse_input(smiles_list[j], "rdkit", False)
                matrix[i][j] = get_tanimoto_similarity_rdkit(mol1, mol2)
            elif toolkit == "cdk":
                mol1 = parse_input(smiles_list[i], "cdk", False)
                mol2 = parse_input(smiles_list[j], "cdk", False)
                matrix[i][j] = get_tanimoto_similarity_CDK(mol1, mol2)
            else:
                raise ValueError("Unsupported toolkit:", toolkit)

    return get_table(matrix)