Source code for app.modules.coconut.preprocess

from __future__ import annotations

from chembl_structure_pipeline import standardizer
from rdkit import Chem

import app.modules.toolkits.cdk_wrapper as cdk
import app.modules.toolkits.rdkit_wrapper as rdkitmodules
from app.modules.coconut.descriptors import get_COCONUT_descriptors
from app.modules.toolkits.helpers import parse_input


def get_mol_block(input_text: str) -> str:
    """Return a Molblock for the given Mol or SMILES text.

    The input is first classified with ``rdkitmodules.is_valid_molecule``.
    SMILES input is parsed into a CDK molecule, converted to a V2000
    Molblock with ``cdk.get_CDK_SDG_mol``, and the ``$$$$`` record
    terminator line is stripped from the result. Input that is already
    recognised as a Mol block is handed back unchanged.

    Args:
        input_text (str): Input text (Mol/SMILES).

    Returns:
        str: Molblock representation. When the input is recognised as
            neither SMILES nor Mol, the literal message
            ``"Error!, Check the input text."`` is returned instead; this
            function does not itself raise for invalid input.
    """
    input_kind = rdkitmodules.is_valid_molecule(input_text)

    # Already a Mol block: nothing to convert.
    if input_kind == "mol":
        return input_text

    # Anything other than SMILES at this point is unsupported input.
    if input_kind != "smiles":
        return "Error!, Check the input text."

    cdk_molecule = parse_input(input_text, "cdk", False)
    sdg_block = cdk.get_CDK_SDG_mol(cdk_molecule, V3000=False)
    return sdg_block.replace("$$$$\n", "")
def get_molecule_hash(molecule: any) -> dict:
    """Compute the molecular formula and kekulized SMILES for a molecule.

    Args:
        molecule (any): Molecule object that is passed straight to RDKit's
            ``Chem`` functions (presumably an ``rdkit.Chem.Mol`` - confirm
            against callers). A falsy value is treated as a failed parse.

    Returns:
        dict: Dictionary with the keys ``Formula``, ``Isomeric_SMILES`` and
            ``Canonical_SMILES``, or ``{"Error": "Check input SMILES"}``
            when ``molecule`` is falsy.
    """
    if not molecule:
        return {"Error": "Check input SMILES"}

    # Both SMILES strings are kekulized; the "canonical" entry is generated
    # with isomericSmiles=False, the "isomeric" one with RDKit's default.
    return {
        "Formula": Chem.rdMolDescriptors.CalcMolFormula(molecule),
        "Isomeric_SMILES": Chem.MolToSmiles(molecule, kekuleSmiles=True),
        "Canonical_SMILES": Chem.MolToSmiles(
            molecule,
            kekuleSmiles=True,
            isomericSmiles=False,
        ),
    }
def get_representations(molecule: any) -> dict:
    """Return the COCONUT structure representations for a molecule.

    Args:
        molecule (any): Molecule object that is passed straight to RDKit's
            ``Chem`` functions (presumably an ``rdkit.Chem.Mol`` - confirm
            against callers). A falsy value is treated as a failed parse.

    Returns:
        dict: Dictionary with the keys ``InChI``, ``InChI_Key`` and
            ``Murko`` (the Murcko framework as returned by
            ``cdk.get_murko_framework``), or
            ``{"Error": "Check input SMILES"}`` when ``molecule`` is falsy.
    """
    if not molecule:
        return {"Error": "Check input SMILES"}

    inchi = Chem.inchi.MolToInchi(molecule)
    inchi_key = Chem.inchi.MolToInchiKey(molecule)

    # The framework is computed by CDK, so the RDKit molecule is handed
    # over by serialising it to SMILES and re-parsing it as a CDK molecule.
    smiles = Chem.MolToSmiles(molecule)
    cdk_molecule = parse_input(smiles, "cdk", False)
    framework = cdk.get_murko_framework(cdk_molecule)

    return {"InChI": inchi, "InChI_Key": inchi_key, "Murko": framework}
def _build_structure_record(smiles: str, rdkit_smiles: str) -> dict:
    """Assemble the molblocks, representations and descriptors for one entry.

    Args:
        smiles (str): SMILES used for the CDK 2D molblocks and for the
            descriptor calculation.
        rdkit_smiles (str): SMILES parsed with RDKit for the 3D molblock
            and the representations.

    Returns:
        dict: Dictionary with the keys ``2D_mol``, ``3D_mol``, ``v3000``,
            ``representations`` and ``descriptors``.
    """
    cdk_mol = parse_input(smiles, "cdk", False)
    # The "$$$$" record terminator is dropped from both CDK molblocks.
    molblock_2d = cdk.get_CDK_SDG_mol(cdk_mol, V3000=False).replace("$$$$\n", "")
    molblock_v3000 = cdk.get_CDK_SDG_mol(cdk_mol, V3000=True).replace("$$$$\n", "")

    rdkit_mol = parse_input(rdkit_smiles, "rdkit", False)
    molblock_3d = rdkitmodules.get_3d_conformers(rdkit_mol)
    representations = get_representations(rdkit_mol)
    descriptors = get_COCONUT_descriptors(smiles, "rdkit")

    return {
        "2D_mol": molblock_2d,
        "3D_mol": molblock_3d,
        "v3000": molblock_v3000,
        "representations": representations,
        "descriptors": descriptors,
    }


def get_COCONUT_preprocessing(input_text: str) -> dict:
    """Preprocess user input text for submission to the COCONUT database.

    The input is converted to a Molblock, standardised with the ChEMBL
    structure pipeline and hashed. A "parent" record is always built from
    the canonical SMILES of the standardised structure. When
    ``rdkitmodules.has_stereochemistry`` reports stereochemistry, a
    "variants" record is built as well from the isomeric SMILES.

    Args:
        input_text (str): Input text (Mol/SMILES).

    Returns:
        dict: COCONUT preprocessed data with the keys ``original_mol``,
            ``standardised_mol``, ``standardised_SMILES``,
            ``molecule_hash``, ``parent`` and ``stereochemical_variants``,
            plus ``variants`` when ``stereochemical_variants`` is True.
    """
    # NOTE(review): for unrecognised input get_mol_block returns an error
    # message string, which is passed on to the standardizer unchanged.
    original_mol = get_mol_block(input_text)
    standardised_block = standardizer.standardize_molblock(original_mol)
    standardised_smiles = Chem.MolToSmiles(
        Chem.MolFromMolBlock(standardised_block),
        kekuleSmiles=True,
    )

    standardised_rdkit_mol = parse_input(standardised_smiles, "rdkit", False)
    molecule_hash = get_molecule_hash(standardised_rdkit_mol)
    parent_smiles = molecule_hash["Canonical_SMILES"]

    result = {
        "original_mol": original_mol,
        "standardised_mol": standardised_block,
        "standardised_SMILES": standardised_smiles,
        "molecule_hash": molecule_hash,
        "parent": _build_structure_record(parent_smiles, parent_smiles),
    }

    if not rdkitmodules.has_stereochemistry(standardised_rdkit_mol):
        result["stereochemical_variants"] = False
        return result

    # The variant's CDK molblocks and descriptors use the isomeric SMILES,
    # while its RDKit molecule is re-parsed from the standardised SMILES.
    variant_record = _build_structure_record(
        molecule_hash["Isomeric_SMILES"],
        standardised_smiles,
    )
    result["stereochemical_variants"] = True
    result["variants"] = variant_record
    return result