from __future__ import annotations
import os
from typing import List
from typing import Union
import pystow
from jpype import getDefaultJVMPath
from jpype import isJVMStarted
from jpype import JClass
from jpype import JPackage
from jpype import JVMNotFoundException
from jpype import startJVM
# Start JVM to use CDK in python
try:
jvmPath = getDefaultJVMPath()
except JVMNotFoundException:
print(
"If you see this message, for some reason JPype",
"cannot find jvm.dll.",
"This indicates that the environment variable JAVA_HOME",
"is not set properly.",
"You can set it or set it manually in the code",
)
jvmPath = "Define/path/or/set/JAVA_HOME/variable/properly"
if not isJVMStarted():
cdk_path = "https://github.com/cdk/cdk/releases/download/cdk-2.9/cdk-2.9.jar"
sru_path = "https://github.com/JonasSchaub/SugarRemoval/releases/download/v1.3.2/SugarRemovalUtility-jar-with-dependencies.jar"
centres_path = (
"https://github.com/SiMolecule/centres/releases/download/1.0/centres.jar"
)
opsin_path = "https://github.com/dan2097/opsin/releases/download/2.8.0/opsin-cli-2.8.0-jar-with-dependencies.jar"
cdkjar_path = str(pystow.join("STOUT-V2")) + "/cdk-2.9.jar"
srujar_path = (
str(pystow.join("STOUT-V2")) + "/SugarRemovalUtility-jar-with-dependencies.jar"
)
centresjar_path = str(pystow.join("STOUT-V2")) + "/centres.jar"
opsinjar_path = (
str(pystow.join("STOUT-V2")) + "/opsin-cli-2.8.0-jar-with-dependencies.jar"
)
if not os.path.exists(cdkjar_path):
jar_path = pystow.ensure("STOUT-V2", url=cdk_path)
if not os.path.exists(srujar_path):
jar_path = pystow.ensure("STOUT-V2", url=sru_path)
if not os.path.exists(centresjar_path):
jar_path = pystow.ensure("STOUT-V2", url=centres_path)
if not os.path.exists(opsinjar_path):
jar_path = pystow.ensure("STOUT-V2", url=opsin_path)
startJVM(
"-ea",
"-Xmx4096M",
classpath=[cdkjar_path, srujar_path, centresjar_path, opsinjar_path],
)
cdk_base = "org.openscience.cdk"
opsin_base = JPackage("uk").ac.cam.ch.wwmm.opsin
_nametostruct = opsin_base.NameToStructure.getInstance()
_restoinchi = opsin_base.NameToInchi.convertResultToInChI
[docs]
def get_CDK_IAtomContainer(smiles: str):
"""This function takes the input SMILES and creates a CDK IAtomContainer.
Args:
smiles (str): SMILES string as input.
Returns:
mol (object): IAtomContainer with CDK.
"""
SCOB = JClass(cdk_base + ".silent.SilentChemObjectBuilder")
SmilesParser = JClass(
cdk_base + ".smiles.SmilesParser",
)(SCOB.getInstance())
molecule = SmilesParser.parseSmiles(smiles)
return molecule
[docs]
def get_CDK_SDG(molecule: any):
"""This function takes the input IAtomContainer and Creates a.
Structure Diagram Layout using the CDK.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
mol object: mol object with CDK SDG.
"""
StructureDiagramGenerator = JClass(
cdk_base + ".layout.StructureDiagramGenerator",
)()
StructureDiagramGenerator.generateCoordinates(molecule)
molecule_ = StructureDiagramGenerator.getMolecule()
return molecule_
[docs]
def get_CDK_SDG_mol(molecule: any, V3000=False) -> str:
"""Returns a mol block string with Structure Diagram Layout for the given.
SMILES.
Args:
molecule (IAtomContainer): molecule given by the user.
V3000 (bool, optional): Option to return V3000 mol. Defaults to False.
Returns:
str: CDK Structure Diagram Layout mol block.
"""
StringW = JClass("java.io.StringWriter")()
moleculeSDG = get_CDK_SDG(molecule)
SDFW = JClass(cdk_base + ".io.SDFWriter")(StringW)
SDFW.setAlwaysV3000(V3000)
SDFW.write(moleculeSDG)
SDFW.flush()
mol_str = str(StringW.toString())
return mol_str
[docs]
def get_murko_framework(molecule: any) -> str:
"""This function takes the user input SMILES and returns.
the Murko framework
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
smiles (string): Murko Framework as SMILES.
"""
MurkoFragmenter = JClass(cdk_base + ".fragment.MurckoFragmenter")(True, 3)
MurkoFragmenter.generateFragments(molecule)
if len(MurkoFragmenter.getFrameworks()) == 0:
return "None"
return str(MurkoFragmenter.getFrameworks()[0])
[docs]
def get_aromatic_ring_count(molecule) -> int:
"""Calculate the number of aromatic rings present in a given molecule.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
int: The number of aromatic rings present in the molecule.
"""
Cycles = JClass(cdk_base + ".graph.Cycles")
ElectronDonation = JClass(cdk_base + ".aromaticity.ElectronDonation")
Aromaticity = JClass(cdk_base + ".aromaticity.Aromaticity")(
ElectronDonation.daylight(),
Cycles.cdkAromaticSet(),
)
Aromaticity.apply(molecule)
MCBRings = Cycles.mcb(molecule).toRingSet()
NumberOfAromaticRings = 0
for RingContainer in MCBRings.atomContainers():
AreAllRingBondsAromatic = True
for Bond in RingContainer.bonds():
if not Bond.isAromatic():
AreAllRingBondsAromatic = False
break
if AreAllRingBondsAromatic:
NumberOfAromaticRings += 1
return NumberOfAromaticRings
[docs]
def get_vander_waals_volume(molecule: any) -> float:
"""Calculate the Van der Waals volume of a given molecule.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
float: The Van der Waals volume of the molecule.
"""
AtomContainerManipulator = JClass(
cdk_base + ".tools.manipulator.AtomContainerManipulator",
)
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule)
VABCVolume = JClass(
cdk_base + ".geometry.volume.VABCVolume",
)().calculate(molecule)
return VABCVolume
[docs]
def get_CDK_descriptors(molecule: any) -> Union[tuple, str]:
"""Take an input SMILES and generate a selected set of molecular.
descriptors generated using CDK as a list.
Args (str): molecule (IAtomContainer): molecule given by the
user.
Returns (list): A list of calculated descriptors.
"""
SDGMol = get_CDK_SDG(molecule)
if SDGMol:
AtomCountDescriptor = (
JClass(cdk_base + ".qsar.descriptors.molecular.AtomCountDescriptor")()
.calculate(SDGMol)
.getValue()
)
HeavyAtomsC = SDGMol.getAtomCount()
WeightDescriptor = (
JClass(cdk_base + ".qsar.descriptors.molecular.WeightDescriptor")()
.calculate(SDGMol)
.getValue()
.toString()
)
TotalExactMass = JClass(
cdk_base + ".tools.manipulator.AtomContainerManipulator",
).getTotalExactMass(SDGMol)
ALogP = (
JClass(cdk_base + ".qsar.descriptors.molecular.ALOGPDescriptor")()
.calculate(SDGMol)
.getValue()
)
NumRotatableBonds = (
JClass(
cdk_base + ".qsar.descriptors.molecular.RotatableBondsCountDescriptor",
)()
.calculate(SDGMol)
.getValue()
)
TPSADescriptor = (
JClass(cdk_base + ".qsar.descriptors.molecular.TPSADescriptor")()
.calculate(SDGMol)
.getValue()
.toString()
)
HBondAcceptorCountDescriptor = (
JClass(
cdk_base + ".qsar.descriptors.molecular.HBondAcceptorCountDescriptor",
)()
.calculate(SDGMol)
.getValue()
)
HBondDonorCountDescriptor = (
JClass(
cdk_base + ".qsar.descriptors.molecular.HBondAcceptorCountDescriptor",
)()
.calculate(SDGMol)
.getValue()
)
RuleOfFiveDescriptor = (
JClass(cdk_base + ".qsar.descriptors.molecular.RuleOfFiveDescriptor")()
.calculate(SDGMol)
.getValue()
)
AromaticRings = get_aromatic_ring_count(SDGMol)
QEDWeighted = None
FormalCharge = JClass(
cdk_base + ".tools.manipulator.AtomContainerManipulator",
).getTotalFormalCharge(SDGMol)
FractionalCSP3Descriptor = (
JClass(cdk_base + ".qsar.descriptors.molecular.FractionalCSP3Descriptor")()
.calculate(SDGMol)
.getValue()
.toString()
)
NumRings = (
JClass(
cdk_base + ".graph.Cycles",
)
.mcb(SDGMol)
.numberOfCycles()
)
VABCVolume = get_vander_waals_volume(SDGMol)
return (
int(str(AtomCountDescriptor)),
int(HeavyAtomsC),
float("{:.2f}".format(float(str(WeightDescriptor)))),
float("{:.5f}".format(float(str(TotalExactMass)))),
float("{:.2f}".format(float(str(ALogP).split(",")[0]))),
int(str(NumRotatableBonds)),
float("{:.2f}".format(float(str(TPSADescriptor)))),
int(str(HBondAcceptorCountDescriptor)),
int(str(HBondDonorCountDescriptor)),
int(str(HBondAcceptorCountDescriptor)),
int(str(HBondDonorCountDescriptor)),
int(str(RuleOfFiveDescriptor)),
int(AromaticRings),
str(QEDWeighted),
int(FormalCharge),
float("{:.2f}".format(float(str(FractionalCSP3Descriptor)))),
int(NumRings),
float("{:.2f}".format(float(str(VABCVolume)))),
)
else:
return "Check input and try again!"
[docs]
def get_tanimoto_similarity_PubChem_CDK(mol1: any, mol2: any) -> str:
"""Calculate the Tanimoto similarity index between two molecules using.
PubChem fingerprints.
Args:
mol1 (IAtomContainer): First molecule given by the user.
mol2 (IAtomContainer): Second molecule given by the user.
Returns:
str: The Tanimoto similarity as a string with 5 decimal places, or an error message.
"""
Tanimoto = JClass(cdk_base + ".similarity.Tanimoto")
SCOB = JClass(cdk_base + ".silent.SilentChemObjectBuilder")
PubchemFingerprinter = JClass(cdk_base + ".fingerprint.PubchemFingerprinter")(
SCOB.getInstance(),
)
CDKHydrogenAdder = JClass(cdk_base + ".tools.CDKHydrogenAdder").getInstance(
SCOB.getInstance(),
)
AtomContainerManipulator = JClass(
cdk_base + ".tools.manipulator.AtomContainerManipulator",
)
Cycles = JClass(cdk_base + ".graph.Cycles")
ElectronDonation = JClass(cdk_base + ".aromaticity.ElectronDonation")
Aromaticity = JClass(cdk_base + ".aromaticity.Aromaticity")(
ElectronDonation.cdk(),
Cycles.cdkAromaticSet(),
)
if mol1 and mol2:
# Perceive atom types and configure atoms
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol1)
AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol2)
# add Implicit Hydrogens
CDKHydrogenAdder.addImplicitHydrogens(mol1)
CDKHydrogenAdder.addImplicitHydrogens(mol2)
# Convert implicit to explicit Hydrogens
AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol1)
AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol2)
# Apply Aromaticity
Aromaticity.apply(mol1)
Aromaticity.apply(mol2)
# Generate BitSets using PubChemFingerprinter
fingerprint1 = PubchemFingerprinter.getBitFingerprint(mol1).asBitSet()
fingerprint2 = PubchemFingerprinter.getBitFingerprint(mol2).asBitSet()
# Calculate Tanimoto similarity
Similarity = Tanimoto.calculate(fingerprint1, fingerprint2)
return "{:.5f}".format(float(str(Similarity)))
else:
return "Check the SMILES string for errors"
[docs]
def get_tanimoto_similarity_ECFP_CDK(mol1: any, mol2: any, ECFP: int = 2) -> str:
"""Calculate the Tanimoto similarity index between two molecules using.
CircularFingerprinter fingerprints.
https://cdk.github.io/cdk/2.8/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
Args:
mol1 (IAtomContainer): First molecule given by the user.
mol2 (IAtomContainer): Second molecule given by the user.
Returns:
str: The Tanimoto similarity as a string with 5 decimal places, or an error message.
"""
Tanimoto = JClass(cdk_base + ".similarity.Tanimoto")
CircularFingerprinter = JClass(
cdk_base + ".fingerprint.CircularFingerprinter",
)()
if ECFP == 2:
fingerprinter_class = CircularFingerprinter.CLASS_ECFP2
elif ECFP == 4:
fingerprinter_class = CircularFingerprinter.CLASS_ECFP4
elif ECFP == 6:
fingerprinter_class = CircularFingerprinter.CLASS_ECFP6
else:
return "only ECFP 2/4/6 allowed"
CircularFingerprinter_ECFP = JClass(
cdk_base + ".fingerprint.CircularFingerprinter",
)(fingerprinter_class)
if mol1 and mol2:
fingerprint1 = CircularFingerprinter_ECFP.getBitFingerprint(mol1)
fingerprint2 = CircularFingerprinter_ECFP.getBitFingerprint(mol2)
# Calculate Tanimoto similarity
Similarity = Tanimoto.calculate(fingerprint1, fingerprint2)
return "{:.5f}".format(float(str(Similarity)))
else:
return "Check the SMILES string for errors"
[docs]
def get_tanimoto_similarity_CDK(
mol1: any,
mol2: any,
fingerprinter: str = "PubChem",
ECFP: int = 6,
) -> float:
"""Calculate the Tanimoto similarity between two molecules using.
PubChem/CircularFingerprints in CDK.
Args:
mol1 (IAtomContainer): First molecule given by the user.
mol2 (IAtomContainer): Second molecule given by the user.
fingerprinter (str, optional): The fingerprinter to use. Currently, only "PubChem/ECFP6" is supported. Defaults to "PubChem".
Returns:
float: The Tanimoto similarity score between the two molecules.
Raises:
ValueError: If an unsupported fingerprinter is specified.
"""
if fingerprinter == "PubChem":
tanimoto = get_tanimoto_similarity_PubChem_CDK(mol1, mol2)
elif fingerprinter == "ECFP":
tanimoto = get_tanimoto_similarity_ECFP_CDK(mol1, mol2, ECFP)
else:
raise ValueError(
"Unsupported fingerprinter. Currently, only 'PubChem' is supported.",
)
return tanimoto
[docs]
def get_cip_annotation(molecule: any) -> str:
"""Return the CIP (Cahn–Ingold–Prelog) annotations using the CDK CIP.
toolkit.
This function takes a SMILES (Simplified Molecular Input Line Entry System) string
as input and returns a CIP annotated molecule block using the CDK CIP toolkit.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
str: A CIP annotated molecule block.
"""
SDGMol = get_CDK_SDG(molecule)
centres_base = "com.simolecule.centres"
Cycles = JClass(cdk_base + ".graph.Cycles")
IBond = JClass(cdk_base + ".interfaces.IBond")
IStereoElement = JClass(cdk_base + ".interfaces.IStereoElement")
Stereocenters = JClass(cdk_base + ".stereo.Stereocenters")
StandardGenerator = JClass(
cdk_base + ".renderer.generators.standard.StandardGenerator",
)
BaseMol = JClass(centres_base + ".BaseMol")
CdkLabeller = JClass(centres_base + ".CdkLabeller")
Descriptor = JClass(centres_base + ".Descriptor")
stereocenters = Stereocenters.of(SDGMol)
for atom in SDGMol.atoms():
if (
stereocenters.isStereocenter(atom.getIndex())
and stereocenters.elementType(atom.getIndex())
== Stereocenters.Type.Tetracoordinate
):
atom.setProperty(StandardGenerator.ANNOTATION_LABEL, "(?)")
# Iterate over bonds
for bond in SDGMol.bonds():
if bond.getOrder() != IBond.Order.DOUBLE:
continue
begIdx = bond.getBegin().getIndex()
endIdx = bond.getEnd().getIndex()
if (
stereocenters.elementType(
begIdx,
)
== Stereocenters.Type.Tricoordinate
and stereocenters.elementType(endIdx) == Stereocenters.Type.Tricoordinate
and stereocenters.isStereocenter(begIdx)
and stereocenters.isStereocenter(endIdx)
):
# Check if not in a small ring <7
if Cycles.smallRingSize(bond, 7) == 0:
bond.setProperty(StandardGenerator.ANNOTATION_LABEL, "(?)")
# no defined stereo?
if not SDGMol.stereoElements().iterator().hasNext():
return SDGMol
# Call the Java method
CdkLabeller.label(SDGMol)
# Update to label appropriately for racemic and relative stereochemistry
for se in SDGMol.stereoElements():
if se.getConfigClass() == IStereoElement.TH and se.getGroupInfo() != 0:
focus = se.getFocus()
label = focus.getProperty(BaseMol.CIP_LABEL_KEY)
if (
isinstance(label, Descriptor)
and label != Descriptor.ns
and label != Descriptor.Unknown
):
if (se.getGroupInfo() & IStereoElement.GRP_RAC) != 0:
inv = None
if label == Descriptor.R:
inv = Descriptor.S
elif label == Descriptor.S:
inv = Descriptor.R
if inv is not None:
focus.setProperty(
BaseMol.CIP_LABEL_KEY,
label.toString() + inv.name(),
)
elif (se.getGroupInfo() & IStereoElement.GRP_REL) != 0:
if label == Descriptor.R or label == Descriptor.S:
focus.setProperty(
BaseMol.CIP_LABEL_KEY,
label.toString() + "*",
)
# Iterate over atoms
for atom in SDGMol.atoms():
if atom.getProperty(BaseMol.CONF_INDEX) is not None:
atom.setProperty(
StandardGenerator.ANNOTATION_LABEL,
StandardGenerator.ITALIC_DISPLAY_PREFIX
+ atom.getProperty(BaseMol.CONF_INDEX).toString(),
)
elif atom.getProperty(BaseMol.CIP_LABEL_KEY) is not None:
atom.setProperty(
StandardGenerator.ANNOTATION_LABEL,
StandardGenerator.ITALIC_DISPLAY_PREFIX
+ atom.getProperty(BaseMol.CIP_LABEL_KEY).toString(),
)
# Iterate over bonds
for bond in SDGMol.bonds():
if bond.getProperty(BaseMol.CIP_LABEL_KEY) is not None:
bond.setProperty(
StandardGenerator.ANNOTATION_LABEL,
StandardGenerator.ITALIC_DISPLAY_PREFIX
+ bond.getProperty(BaseMol.CIP_LABEL_KEY).toString(),
)
return SDGMol
[docs]
def get_CXSMILES(molecule: any) -> str:
"""Generate CXSMILES representation with 2D atom coordinates from the.
given.
SMILES.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
str: CXSMILES representation with 2D atom coordinates.
"""
SDGMol = get_CDK_SDG(molecule)
SmiFlavor = JClass(cdk_base + ".smiles.SmiFlavor")
SmilesGenerator = JClass(cdk_base + ".smiles.SmilesGenerator")(
SmiFlavor.Absolute | SmiFlavor.CxSmilesWithCoords,
)
CXSMILES = SmilesGenerator.create(SDGMol)
return str(CXSMILES)
[docs]
def get_canonical_SMILES(molecule: any) -> str:
"""Generate Canonical SMILES representation with 2D atom coordinates from.
the given SMILES.
Args:
molecule (IAtomContainer): molecule given by the user.
Returns:
str: Canonical SMILES representation with 2D atom coordinates.
"""
SDGMol = get_CDK_SDG(molecule)
SmiFlavor = JClass(cdk_base + ".smiles.SmiFlavor")
SmilesGenerator = JClass(
cdk_base + ".smiles.SmilesGenerator",
)(SmiFlavor.Absolute)
CanonicalSMILES = SmilesGenerator.create(SDGMol)
return str(CanonicalSMILES)
[docs]
def get_InChI(molecule: any, InChIKey=False) -> str:
"""Generate InChI or InChIKey from the given SMILES string.
Args:
molecule (IAtomContainer): molecule given by the user.
InChIKey (bool): If True, return InChIKey instead of InChI. The default is False.
Returns:
str: InChI or InChIKey string.
"""
SDGMol = get_CDK_SDG(molecule)
InChIGeneratorFactory = JClass(cdk_base + ".inchi.InChIGeneratorFactory")
InChI = InChIGeneratorFactory.getInstance().getInChIGenerator(SDGMol).getInchi()
if InChIKey:
InChIKey = (
InChIGeneratorFactory.getInstance().getInChIGenerator(SDGMol).getInchiKey()
)
return InChIKey
return InChI
[docs]
def get_smiles_opsin(input_text: str) -> str:
"""Convert IUPAC chemical name to SMILES notation using OPSIN.
Parameters:
- input_text (str): The IUPAC chemical name to be converted.
Returns:
- str: The SMILES notation corresponding to the given IUPAC name.
Raises:
- Exception: If the IUPAC name is not valid or if there are issues in the conversion process. The exception message will guide the user to check the data again.
"""
try:
print(input_text)
OpsinResult = _nametostruct.parseChemicalName(input_text)
print(OpsinResult)
if str(OpsinResult.getStatus()) == "FAILURE":
raise Exception(
(
"Failed to convert '%s' to format '%s'\n%s using OPSIN"
% (input_text, format, OpsinResult.getMessage())
),
)
print(OpsinResult.getSmiles())
return str(OpsinResult.getSmiles())
except Exception:
return str(
"Failed to convert '%s' to format '%s'\n%s using OPSIN"
% (input_text, format, OpsinResult.getMessage()),
)
[docs]
async def get_CDK_HOSE_codes(
molecule: any,
noOfSpheres: int,
ringsize: bool,
) -> List[str]:
"""Generate CDK-generated HOSECodes for the given SMILES.
Args:
molecule (IAtomContainer): molecule given by the user.
noOfSpheres (int): Number of spheres for HOSECode generation.
ringsize (bool): Whether to consider ring size for HOSECode generation.
Returns:
List[str]: List of CDK-generated HOSECodes.
"""
HOSECodeGenerator = JClass(cdk_base + ".tools.HOSECodeGenerator")()
HOSECodes = []
atoms = molecule.atoms()
for atom in atoms:
moleculeHOSECode = HOSECodeGenerator.getHOSECode(
molecule,
atom,
noOfSpheres,
ringsize,
)
HOSECodes.append(str(moleculeHOSECode))
return HOSECodes