# -------------------------------------------------------------------------------------------------------------------------------------
# Following code adapted from (https://github.com/maabuu/posebusters_em) and (https://github.com/luwei0917/DynamicBind)
# -------------------------------------------------------------------------------------------------------------------------------------
"""Energy minimization of a ligand in a protein pocket as used in the
PoseBusters/DynamicBind paper.
This code is based on the OpenMM user guide:
http://docs.openmm.org/latest/userguide
"""
from __future__ import annotations
import copy
import logging
import os
import shutil
import sys
from pathlib import Path
import hydra
import numpy as np
import rootutils
from beartype.typing import Any, Dict, Optional
from Bio.PDB import MMCIFIO, PDBIO, MMCIFParser, PDBParser, Select
from omegaconf import DictConfig
from openff.toolkit.topology import Molecule
from openff.units import Quantity as openff_Quantity
from openmm import (
CustomExternalForce,
LangevinIntegrator,
Platform,
System,
XmlSerializer,
unit,
)
from openmm.app import Atom, ForceField, HBonds, Modeller, PDBFile, PDBxFile, Simulation
from openmm.unit import kelvin, kilojoule, molar, mole, nanometer, picosecond
from openmmforcefields.generators import SMIRNOFFTemplateGenerator, SystemGenerator
from pdbfixer import PDBFixer
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolfiles import MolFromMolFile, MolToMolFile
from rdkit.Chem.rdmolops import AddHs, RemoveHs
from scipy.spatial.distance import cdist
from scipy.spatial.transform import Rotation as R
from wrapt_timeout_decorator import timeout
rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
from posebench import register_custom_omegaconf_resolvers
from posebench.utils.data_utils import combine_molecules
logging.basicConfig(format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
FORCE_FIELDS_IMPLICIT = ["amber14-all.xml", "implicit/gbn2.xml"]
FORCE_FIELDS_EXPLICIT = ["amber14-all.xml", "amber14/tip3pfb.xml"]
OPTIMIZATION_TIMEOUT_IN_SECONDS = 600
bond_lengths = {
("C", "CA"): (1.5335,),
("C", "O"): (1.23,),
("CA", "CB"): (1.5365,),
("CA", "N"): (1.4664,),
("CB", "CG"): (1.5331,),
("CE", "SD"): (1.8117,),
("CG", "SD"): (1.8135,),
("C", "N"): (1.3359,),
("CB", "OG"): (1.4144,),
("CB", "SG"): (1.8146,),
("CD", "CG"): (1.5308,),
("CD", "N"): (1.457,),
("CB", "CG1"): (1.5365,),
("CB", "CG2"): (1.5341,),
("CD1", "CG1"): (1.5334,),
("CG", "OD1"): (1.2477,),
("CG", "OD2"): (1.2507,),
("CD", "CE"): (1.5312,),
("CE", "NZ"): (1.4831,),
("CD", "NE"): (1.4748,),
("CZ", "NE"): (1.3205,),
("CZ", "NH1"): (1.3041,),
("CZ", "NH2"): (1.3018,),
("CB", "OG1"): (1.4086,),
("CD1", "CG"): (1.5223,),
("CD2", "CG"): (1.4589,),
("CD1", "CE1"): (1.406,),
("CD2", "CE2"): (1.4063,),
("CE1", "CZ"): (1.406,),
("CE2", "CZ"): (1.4059,),
("CD", "NE2"): (1.3141,),
("CD", "OE1"): (1.2493,),
("CD", "OE2"): (1.2524,),
("CG", "ND2"): (1.3112,),
("CD1", "NE1"): (1.3804,),
("CD2", "CE3"): (1.4115,),
("CE2", "CZ2"): (1.401,),
("CE2", "NE1"): (1.3774,),
("CE3", "CZ3"): (1.4076,),
("CH2", "CZ2"): (1.4046,),
("CH2", "CZ3"): (1.4062,),
("CD2", "NE2"): (1.3971,),
("CE1", "ND1"): (1.3424,),
("CE1", "NE2"): (1.3406,),
("CG", "ND1"): (1.3768,),
("CZ", "OH"): (1.3576,),
("C", "OXT"): (1.2491,),
("SG", "SG"): (2.0324,),
}
chi_atoms = dict(
chi1=dict(
ARG=["N", "CA", "CB", "CG"],
ASN=["N", "CA", "CB", "CG"],
ASP=["N", "CA", "CB", "CG"],
CYS=["N", "CA", "CB", "SG"],
GLN=["N", "CA", "CB", "CG"],
GLU=["N", "CA", "CB", "CG"],
HIS=["N", "CA", "CB", "CG"],
ILE=["N", "CA", "CB", "CG1"],
LEU=["N", "CA", "CB", "CG"],
LYS=["N", "CA", "CB", "CG"],
MET=["N", "CA", "CB", "CG"],
PHE=["N", "CA", "CB", "CG"],
PRO=["N", "CA", "CB", "CG"],
SER=["N", "CA", "CB", "OG"],
THR=["N", "CA", "CB", "OG1"],
TRP=["N", "CA", "CB", "CG"],
TYR=["N", "CA", "CB", "CG"],
VAL=["N", "CA", "CB", "CG1"],
),
altchi1=dict(
VAL=["N", "CA", "CB", "CG2"],
),
chi2=dict(
ARG=["CA", "CB", "CG", "CD"],
ASN=["CA", "CB", "CG", "OD1"],
ASP=["CA", "CB", "CG", "OD1"],
GLN=["CA", "CB", "CG", "CD"],
GLU=["CA", "CB", "CG", "CD"],
HIS=["CA", "CB", "CG", "ND1"],
ILE=["CA", "CB", "CG1", "CD1"],
LEU=["CA", "CB", "CG", "CD1"],
LYS=["CA", "CB", "CG", "CD"],
MET=["CA", "CB", "CG", "SD"],
PHE=["CA", "CB", "CG", "CD1"],
PRO=["CA", "CB", "CG", "CD"],
TRP=["CA", "CB", "CG", "CD1"],
TYR=["CA", "CB", "CG", "CD1"],
),
altchi2=dict(
ASP=["CA", "CB", "CG", "OD2"],
LEU=["CA", "CB", "CG", "CD2"],
PHE=["CA", "CB", "CG", "CD2"],
TYR=["CA", "CB", "CG", "CD2"],
),
chi3=dict(
ARG=["CB", "CG", "CD", "NE"],
GLN=["CB", "CG", "CD", "OE1"],
GLU=["CB", "CG", "CD", "OE1"],
LYS=["CB", "CG", "CD", "CE"],
MET=["CB", "CG", "SD", "CE"],
),
chi4=dict(
ARG=["CG", "CD", "NE", "CZ"],
LYS=["CG", "CD", "CE", "NZ"],
),
chi5=dict(
ARG=["CD", "NE", "CZ", "NH1"],
),
)
chi_names = ["chi%s" % i for i in range(1, 6)]
# NOTE: a tuple containing left bond atoms and right bond atoms -> a list of atoms that should rotate with the bond
chi1_bond_dict = {
"ALA": None,
"ARG": ("CA", "CB", ["CG", "CD", "NE", "NH1", "NH2", "CZ"]),
"ASN": ("CA", "CB", ["CG", "ND2", "OD1"]),
"ASP": ("CA", "CB", ["CG", "OD1", "OD2"]),
"CYS": ("CA", "CB", ["SG"]),
"GLN": ("CA", "CB", ["CG", "CD", "NE2", "OE1"]),
"GLU": ("CA", "CB", ["CG", "CD", "OE1", "OE2"]),
"GLY": None,
"HIS": ("CA", "CB", ["CG", "CD2", "ND1", "CE1", "NE2"]),
"ILE": ("CA", "CB", ["CG1", "CG2", "CD1"]),
"LEU": ("CA", "CB", ["CG", "CD1", "CD2"]),
"LYS": ("CA", "CB", ["CG", "CD", "CE", "NZ"]),
"MET": ("CA", "CB", ["CG", "SD", "CE"]),
"PHE": ("CA", "CB", ["CG", "CD1", "CD2", "CE1", "CE2", "CZ"]),
"PRO": ("CA", "CB", ["CG", "CD"]),
"SER": ("CA", "CB", ["OG"]),
"THR": ("CA", "CB", ["CG2", "OG1"]),
"TRP": ("CA", "CB", ["CG", "CD1", "CD2", "CE2", "CE3", "NE1", "CH2", "CZ2", "CZ3"]),
"TYR": ("CA", "CB", ["CG", "CD1", "CD2", "CE1", "CE2", "OH", "CZ"]),
"VAL": ("CA", "CB", ["CG1", "CG2"]),
}
chi2_bond_dict = {
"ALA": None,
"ARG": ("CB", "CG", ["CD", "NE", "NH1", "NH2", "CZ"]),
"ASN": ("CB", "CG", ["ND2", "OD1"]),
"ASP": ("CB", "CG", ["OD1", "OD2"]),
"CYS": None,
"GLN": ("CB", "CG", ["CD", "NE2", "OE1"]),
"GLU": ("CB", "CG", ["CD", "OE1", "OE2"]),
"GLY": None,
"HIS": ("CB", "CG", ["CD2", "ND1", "CE1", "NE2"]),
"ILE": ("CB", "CG1", ["CD1"]),
"LEU": ("CB", "CG", ["CD1", "CD2"]),
"LYS": ("CB", "CG", ["CD", "CE", "NZ"]),
"MET": ("CB", "CG", ["SD", "CE"]),
"PHE": ("CB", "CG", ["CD1", "CD2", "CE1", "CE2", "CZ"]),
"PRO": ("CB", "CG", ["CD"]),
"SER": None,
"THR": None,
"TRP": ("CB", "CG", ["CD1", "CD2", "CE2", "CE3", "NE1", "CH2", "CZ2", "CZ3"]),
"TYR": ("CB", "CG", ["CD1", "CD2", "CE1", "CE2", "OH", "CZ"]),
"VAL": None,
}
chi3_bond_dict = {
"ALA": None,
"ARG": ("CG", "CD", ["NE", "NH1", "NH2", "CZ"]),
"ASN": None,
"ASP": None,
"CYS": None,
"GLN": ("CG", "CD", ["NE2", "OE1"]),
"GLU": ("CG", "CD", ["OE1", "OE2"]),
"GLY": None,
"HIS": None,
"ILE": None,
"LEU": None,
"LYS": ("CG", "CD", ["CE", "NZ"]),
"MET": ("CG", "SD", ["CE"]),
"PHE": None,
"PRO": None,
"SER": None,
"THR": None,
"TRP": None,
"TYR": None,
"VAL": None,
}
chi4_bond_dict = {
"ARG": ("CD", "NE", ["NH1", "NH2", "CZ"]),
"LYS": ("CD", "CE", ["NZ"]),
}
chi5_bond_dict = {
"ARG": ("NE", "CZ", ["NH1", "NH2"]),
}
chi_dict = {
1: chi1_bond_dict,
2: chi2_bond_dict,
3: chi3_bond_dict,
4: chi4_bond_dict,
5: chi5_bond_dict,
}
[docs]
class NoHydrogen(Select):
"""Select class that filters out hydrogen atoms."""
[docs]
def accept_atom(self, atom: Any) -> bool:
"""Return True if the atom element is not H or D."""
if atom.element == "H" or atom.element == "D":
return False
return True
[docs]
def remove_hydrogens_from_pdb(input_pdb_filepath: str, output_pdb_filepath: str):
"""Remove hydrogen atoms from a PDB file.
:param input_pdb_filepath: Path to the input PDB file.
:param output_pdb_filepath: Path to the output PDB file.
"""
parser = (
MMCIFParser(QUIET=True)
if os.path.splitext(input_pdb_filepath)[-1] == ".cif"
else PDBParser(QUIET=True)
)
s = parser.get_structure("x", input_pdb_filepath)
io = MMCIFIO() if os.path.splitext(output_pdb_filepath)[-1] == ".cif" else PDBIO()
io.set_structure(s)
io.save(output_pdb_filepath, select=NoHydrogen())
[docs]
def remove_mol_hydrogens_and_reorder(mol: Chem.Mol) -> Chem.Mol:
"""Remove hydrogens from a molecule and re-order the atoms.
:param mol: The molecule to process.
:return: The processed molecule.
"""
mol = Chem.RemoveAllHs(mol)
_ = Chem.MolToSmiles(mol)
mol_order = list(
mol.GetPropsAsDict(includePrivate=True, includeComputed=True)["_smilesAtomOutputOrder"]
)
mol = Chem.RenumberAtoms(mol, mol_order)
return mol
[docs]
def will_restrain(atom: Atom, residue_set: str) -> bool:
"""Returns True if the atom will be restrained by the given restraint set.
:param atom: Atom to check.
:param residue_set: Residue set to use for checking.
"""
if residue_set == "non_hydrogen":
return atom.element.name != "hydrogen"
elif residue_set == "c_alpha":
return atom.name == "CA"
else:
raise ValueError(f"Unsupported residue set: {residue_set}")
[docs]
def get_all_protein_atoms(input_pdb_filepath: str) -> list[Any]:
"""Returns a list of all protein atoms in the PDB file.
:param input_pdb_filepath: Path to the input PDB file.
:return: A list of all protein atoms in the PDB file.
"""
if os.path.splitext(input_pdb_filepath)[-1] == ".cif":
parser = MMCIFParser(QUIET=True)
else:
parser = PDBParser(QUIET=True)
s = parser.get_structure(input_pdb_filepath, input_pdb_filepath)
all_atoms = list(s.get_atoms())
return all_atoms
[docs]
def randomly_rotate_protein_side_chains(
structure: Any,
violation_residue_idx: list[int],
min_angle: float = 0.6,
max_angle: float = np.pi,
num_chi_groups: int = 5,
eps: float = 1e-6,
) -> Any:
"""Randomly rotate side chains of the protein residues with local geometry
violations.
:param structure: Structure to rotate.
:param violation_residue_idx: List of residue indices with local
geometry violations.
:param min_angle: Minimum angle to rotate.
:param max_angle: Maximum angle to rotate.
:param num_chi_groups: Number of side chain groups.
:param eps: Epsilon value.
"""
for res in structure.get_residues():
if res.get_full_id()[3][1] not in violation_residue_idx:
continue
chi_mask = [0] * 5
if res.id[0] != " ":
continue
resname = res.resname
if resname in ("ALA", "GLY"):
continue
for x, chi in enumerate(chi_names):
chi_res = chi_atoms[chi]
if resname not in chi_res:
continue
chi_mask[x] = 1
pred_chi = np.random.uniform(min_angle, max_angle, 5)
for i in range(num_chi_groups):
if chi_mask[i] == 0:
continue
try:
chi_bond_dict = chi_dict[i + 1]
except KeyError:
logger.warning(f"Dictionary for `chi{i + 1}_bond_dict` not found.")
continue
atom1, atom2, rotate_atom_list = chi_bond_dict[resname]
if (atom1 not in res) or (atom2 not in res):
continue
atom1_coord = res[atom1].coord
atom2_coord = res[atom2].coord
rot_vec = atom2_coord - atom1_coord
rot_vec = pred_chi[i] * (rot_vec) / (np.linalg.norm(rot_vec) + eps)
rot_mat = R.from_rotvec(rot_vec).as_matrix()
for rotate_atom in rotate_atom_list:
if rotate_atom not in res:
continue
new_coord = (
np.matmul(res[rotate_atom].coord - res[atom1].coord, rot_mat.T)
+ res[atom1].coord
)
res[rotate_atom].set_coord(new_coord)
[docs]
def save_biopython_structure_to_pdb(
structure: Any, output_pdb_filepath: str, ca_only: bool = False
):
"""Saves a BioPython protein `Structure` objects to a PDB file.
:param structure: Structure to save.
:param output_pdb_filepath: Path to the input PDB file.
:param ca_only: Whether to save only the CA atoms or not.
"""
if os.path.splitext(output_pdb_filepath)[-1] == ".pdb":
io = PDBIO()
elif os.path.splitext(output_pdb_filepath)[-1] == ".cif":
io = MMCIFIO()
else:
raise ValueError(
f"Unsupported file format for saving modified protein structures following relaxation: {os.path.splitext(output_pdb_filepath)[-1]}"
)
class MySelect(Select):
def accept_atom(self, atom):
if atom.get_name() == "CA":
return True
else:
return False
class RemoveHs(Select):
def accept_atom(self, atom):
if atom.element != "H":
return True
else:
return False
io.set_structure(structure)
if ca_only:
io.save(output_pdb_filepath, MySelect())
else:
io.save(output_pdb_filepath, RemoveHs())
[docs]
def compute_protein_local_geometry_violations(input_pdb_filepath: str) -> tuple[int, list[int]]:
"""Computes the number of protein local geometry violations in the PDB
file.
:param input_pdb_filepath: Path to the latest protein input PDB
file.
:return: A tuple containing the number of protein local geometry
violations and the list of residue indices with associated
violations.
"""
protein_all_atoms = get_all_protein_atoms(input_pdb_filepath)
protein_pdb = (
PDBxFile(input_pdb_filepath)
if os.path.splitext(input_pdb_filepath)[-1] == ".cif"
else PDBFile(input_pdb_filepath)
)
bond_index_table = [
[bond.atom1.index, bond.atom2.index] for bond in protein_pdb.topology.bonds()
]
violation_residue_idx = []
deviation_greater_than_cutoff = 0
for bond_index in bond_index_table:
i, j = bond_index
atom1 = protein_all_atoms[i]
atom2 = protein_all_atoms[j]
if atom1.id == "OXT" or atom2.id == "OXT":
continue
dis = atom1 - atom2
deviation = abs(bond_lengths[(atom1.name, atom2.name)][0] - dis)
violation = deviation > 0.3
deviation_greater_than_cutoff += violation
if violation:
violation_residue_idx.extend([atom1.get_full_id()[3][1], atom2.get_full_id()[3][1]])
return deviation_greater_than_cutoff, violation_residue_idx
[docs]
def get_all_ligand_non_hydrogen_atoms(
input_sdf_filepath: str | None,
ref_input_sdf_filepath: str | None,
input_mol: Chem.Mol | None = None,
ref_mol: Chem.Mol | None = None,
) -> tuple[np.ndarray, list[list[bool]]]:
"""Get all non-hydrogen atoms in the ligand SDF file.
:param input_sdf_filepath: Path to the latest ligand input SDF file.
:param ref_input_sdf_filepath: Path to the reference (e.g.,
original) ligand input SDF file.
:param input_mol: The optional RDKit molecule to directly use for
the input ligand.
:param ref_mol: The optional RDKit molecule to directly use for the
reference ligand.
:return: A tuple containing the NumPy ligand atom coordinates and
the list-of-lists ligand adjacency matrix.
"""
try:
if input_mol is not None:
mol = input_mol
else:
mol = Chem.MolFromMolFile(input_sdf_filepath)
if mol is None:
raise Exception(
f"In `get_all_ligand_non_hydrogen_atoms()`, molecule could not be loaded from SDF file: {input_sdf_filepath}"
)
except Exception as e:
if ref_mol is not None:
mol = ref_mol
else:
mol = Chem.MolFromMolFile(ref_input_sdf_filepath)
mol.RemoveAllConformers()
mol = Chem.AddHs(mol)
generate_conformer(mol)
mol = Chem.RemoveAllHs(mol)
Chem.MolToSmiles(mol)
smiles_atom_output_order = list(
mol.GetPropsAsDict(includePrivate=True, includeComputed=True)["_smilesAtomOutputOrder"]
)
mol = Chem.RenumberAtoms(mol, smiles_atom_output_order)
mol_atoms = list(mol.GetAtoms())
mol_atoms = [a.GetSymbol() for a in mol_atoms]
c = mol.GetConformer()
mol_atom_coords = c.GetPositions()
return mol_atom_coords, Chem.GetAdjacencyMatrix(mol).astype(bool)
[docs]
def compute_ligand_local_geometry_violations(
ref_input_sdf_filepath: str,
input_sdf_filepath: str | None,
) -> int:
"""Computes the number of ligand local geometry violations in the SDF file.
:param ref_input_sdf_filepath: Path to the reference (e.g.,
original) ligand input SDF file.
:param input_sdf_filepath: Path to the optional latest ligand input
SDF file.
:return: The cumulative number of ligand local geometry violations.
"""
deviations_greater_than_cutoff = []
sdf_filepath = input_sdf_filepath if input_sdf_filepath is not None else ref_input_sdf_filepath
ligand_frags = Chem.GetMolFrags(Chem.MolFromMolFile(sdf_filepath), asMols=True)
for ligand_frag in ligand_frags:
ref_ligand_atom_coords, local_geometry_mask = get_all_ligand_non_hydrogen_atoms(
ref_input_sdf_filepath,
input_sdf_filepath,
ref_mol=copy.deepcopy(ligand_frag),
)
ligand_atom_coords, _ = get_all_ligand_non_hydrogen_atoms(
input_sdf_filepath,
ref_input_sdf_filepath,
input_mol=copy.deepcopy(ligand_frag),
)
assert len(ref_ligand_atom_coords) == len(
ligand_atom_coords
), "The number of atoms in the reference and input ligands are not the same."
ref_pair_distances = cdist(ref_ligand_atom_coords, ref_ligand_atom_coords)
pair_distances = cdist(ligand_atom_coords, ligand_atom_coords)
deviation_greater_than_cutoff = (
abs(ref_pair_distances[local_geometry_mask] - pair_distances[local_geometry_mask])
> 0.3
).sum()
deviations_greater_than_cutoff.append(deviation_greater_than_cutoff)
return sum(deviations_greater_than_cutoff)
[docs]
def load_molecule(mol_path: Path, **kwargs) -> Molecule:
"""Load a molecule from a file.
:param mol_path: Path to the molecule file.
:param kwargs: Additional keyword arguments to pass to the
Molecule.from_file method.
:return: The loaded molecule.
"""
mols = Molecule.from_file(str(mol_path), file_format="sdf", **kwargs)
molecule = mols[0] if isinstance(mols, list) else mols
molecule.name = str(mol_path)
return molecule
[docs]
def prep_ligand(
ligand_file: Path,
temp_file: Path,
relax_protein: bool,
allow_undefined_stereo: bool = True,
mol: Chem.Mol | None = None,
) -> tuple[Molecule, str | None]:
"""Prepare a ligand for use in OpenMM.
:param ligand_file: Path to the ligand file.
:param temp_file: Path to the temporary file to save the prepared
ligand to.
:param relax_protein: Whether or not to relax the protein.
:param allow_undefined_stereo: Whether or not to allow undefined
stereochemistry.
:param mol: The optional RDKit molecule to directly use for the
ligand.
:return: The prepared ligand.
"""
try:
mol_smiles = None
if not temp_file.exists():
temp_file.parent.mkdir(parents=True, exist_ok=True)
if relax_protein:
if mol is None:
mol = MolFromMolFile(str(ligand_file))
mol_with_hs = AddHs(mol, addCoords=True)
mol_smiles = Chem.MolToSmiles(mol_with_hs)
mol_smiles_atom_output_order = list(
mol_with_hs.GetPropsAsDict(includePrivate=True, includeComputed=True)[
"_smilesAtomOutputOrder"
]
)
mol_with_hs = Chem.RenumberAtoms(mol_with_hs, mol_smiles_atom_output_order)
mol = Molecule.from_smiles(
mol_smiles, allow_undefined_stereo=allow_undefined_stereo
)
MolToMolFile(mol.to_rdkit(), str(temp_file))
else:
if mol is None:
mol = MolFromMolFile(str(ligand_file), sanitize=True)
mol = AddHs(mol, addCoords=True)
MolToMolFile(mol, str(temp_file))
return (
load_molecule(temp_file, allow_undefined_stereo=allow_undefined_stereo),
mol_smiles,
)
except Exception as e:
logger.error(f"Error preparing ligand: {e}")
temp_file.unlink(missing_ok=True)
raise e
[docs]
def prep_protein(
protein_file: Path,
temp_file: Path,
relax_protein: bool,
remove_initial_protein_hydrogens: bool,
add_solvent: bool = False,
) -> PDBFile:
"""Prepare a protein for use in OpenMM.
:param protein_file: Path to the protein file.
:param temp_file: Path to the temporary file to save the prepared
protein to.
:param relax_protein: Whether or not to relax the protein.
:param remove_initial_protein_hydrogens: Whether or not to remove
the initial protein hydrogens.
:param add_solvent: Whether or not to add solvent to the protein.
:return: The prepared protein.
"""
try:
if not temp_file.exists():
temp_file.parent.mkdir(parents=True, exist_ok=True)
if relax_protein or remove_initial_protein_hydrogens:
remove_hydrogens_from_pdb(str(protein_file), str(temp_file))
else:
shutil.copy(protein_file, temp_file)
fixer = PDBFixer(str(temp_file))
fixer.findMissingResidues()
fixer.findNonstandardResidues()
fixer.replaceNonstandardResidues()
fixer.removeHeterogens(keepWater=False)
fixer.findMissingAtoms()
fixer.addMissingAtoms()
fixer.addMissingHydrogens(7.0)
if add_solvent:
fixer.addSolvent(fixer.topology.getUnitCellDimensions())
with open(temp_file, "w", encoding="utf-8") as f:
PDBFile.writeFile(fixer.topology, fixer.positions, f, keepIds=relax_protein)
return PDBFile(str(temp_file))
except Exception as e:
logger.error(f"Error preparing protein: {e}")
temp_file.unlink(missing_ok=True)
raise e
[docs]
def add_ligand_to_complex(
modeller: Modeller,
ligand: Molecule,
) -> Modeller:
"""Add a ligand to a protein-ligand complex.
:param modeller: The Modeller object for the protein-ligand complex.
:param ligand: The prepared ligand.
:return: The updated Modeller object for the protein-ligand complex.
"""
topology = ligand.to_topology().to_openmm()
positions = ligand.conformers[0].magnitude * unit.angstrom
modeller.add(topology, positions)
return modeller
[docs]
def deserialize(path: str) -> System:
"""Deserialize an OpenMM system from a file.
:param path: Path to the file to deserialize the system from.
:return: The deserialized system.
"""
with open(path, encoding="utf-8") as file:
system = XmlSerializer.deserialize(file.read())
return system
[docs]
def serialize(system: System, path: str) -> str:
"""Serialize an OpenMM system to a file.
:param system: The system to serialize.
:param path: Path to the file to serialize the system to.
:return: The path to the serialized system.
"""
with open(path, "w", encoding="utf-8") as file:
file.write(XmlSerializer.serialize(system))
return path
[docs]
def generate_system(
modeller: Modeller,
ligands: Molecule,
num_particles_protein: int,
num_particles_total: int,
name: str,
force_fields: list[str],
relax_protein: bool,
cache_dir: Path | None = None,
) -> System:
"""Generate an OpenMM system for the protein-ligand complex.
:param modeller: The Modeller object for the protein-ligand complex.
:param ligands: The prepared ligands.
:param num_particles_protein: The number of particles in the
protein.
:param num_particles_total: The total number of particles in the
complex.
:param name: The name of the system.
:param force_fields: The force fields to use.
:param relax_protein: Whether or not to relax the protein.
:param cache_dir: The directory to cache the system in.
:return: The generated OpenMM system.
"""
# try load from cache
if cache_dir is not None:
system_path = cache_dir / f"{name}_system.xml"
if system_path.exists():
system = deserialize(str(system_path))
# NOTE: perform rudimentary check to see if system is correct
if system.getNumParticles() == num_particles_total:
logger.info(f"Loaded system from cache: {system_path}")
return system
logger.info(f"Generating system for {name}")
if relax_protein:
# set up forcefield
forcefield_kwargs = {
"constraints": HBonds,
# process iron atoms (reference: http://docs.openmm.org/latest/api-python/generated/openmm.app.forcefield.ForceField.html#openmm.app.forcefield.ForceField.createSystem)
"residueTemplates": {
res: "FE"
for res in modeller.topology.residues()
if res.name == "FE" or all(a.element._symbol.upper() == "FE" for a in res._atoms)
},
}
system_generator = SystemGenerator(
forcefields=["amber/ff14SB.xml"],
small_molecule_forcefield="gaff-2.11",
forcefield_kwargs=forcefield_kwargs,
)
# set up system
system = system_generator.create_system(
modeller.topology,
molecules=ligands,
)
else:
# set up forcefield
forcefield = ForceField(*force_fields)
smirnoff = SMIRNOFFTemplateGenerator(molecules=ligands)
forcefield.registerTemplateGenerator(smirnoff.generator)
# set up system
system = forcefield.createSystem(
modeller.topology,
residueTemplates={
res: "FE"
for res in modeller.topology.residues()
if res.name == "FE" or all(a.element._symbol.upper() == "FE" for a in res._atoms)
},
)
for i in range(num_particles_protein):
system.setParticleMass(i, 0.0)
# save to cache
if cache_dir is not None:
system_path = cache_dir / f"{name}_system.xml"
serialize(system, str(system_path))
return system
[docs]
def setup_simulation(
modeller: Modeller,
system: System,
platform: Platform,
platform_properties: dict[str, Any],
temperature: float = 300.0 * kelvin,
friction_coeff: float = 1.0 / picosecond,
step_size: float = 0.002 * picosecond,
) -> Simulation:
"""Set up an OpenMM simulation for the protein-ligand complex.
:param modeller: The Modeller object for the protein-ligand complex.
:param system: The OpenMM system for the protein-ligand complex.
:param platform: The OpenMM platform to use.
:param platform_properties: The properties to use with the OpenMM
platform.
:param temperature: The temperature to use with a
LangevinIntegrator.
:param friction_coeff: The friction coefficient to use with a
LangevinIntegrator.
:param step_size: The step size to use with a LangevinIntegrator.
:return: The setup OpenMM simulation.
"""
integrator = LangevinIntegrator(temperature, friction_coeff, step_size)
simulation = Simulation(modeller.topology, system, integrator, platform, platform_properties)
simulation.context.setPositions(modeller.positions)
return simulation
[docs]
def save_with_rdkit(
molecule: Molecule,
file_path: Path,
conformer_index: int = 0,
name: str | None = None,
mol: Chem.Mol | None = None,
):
"""Save a molecule to a file using RDKit.
:param molecule: The molecule to save.
:param file_path: The path to save the molecule to.
:param conformer_index: The index of the conformer to save.
:param name: The name to give the molecule.
:param mol: The optional RDKit molecule to directly use for the
ligand.
"""
# NOTE: use RDKit because the `.to_file()` method does not allow picking conformation and only writes first one
if mol is None:
mol = molecule.to_rdkit()
mol = RemoveHs(mol)
if name is not None:
mol.SetProp("_Name", name)
MolToMolFile(mol, str(file_path), confId=conformer_index)
[docs]
@timeout(OPTIMIZATION_TIMEOUT_IN_SECONDS, use_signals=False)
def optimize_ligand_in_pocket(
protein_file: Path,
ligand_file: Path,
output_file: Path | None = None,
protein_output_file: Path | None = None,
complex_output_file: Path | None = None,
protein_gap_mask: str | None = None,
ligand_stiffness: float = 3000.0,
protein_stiffness: float = 1000.0,
protein_residue_set: str = "non_hydrogen",
tolerance: float = 0.01,
allow_undefined_stereo: bool = True,
prep_only: bool = False,
temp_dir: Path = Path("."),
name: str | None = None,
add_solvent: bool = False,
relax_protein: bool = False,
remove_initial_protein_hydrogens: bool = False,
assign_each_ligand_unique_force: bool = False,
model_ions: bool = True,
cache_files: bool = True,
assign_partial_charges_manually: bool = False,
report_initial_energy_only: bool = False,
platform_name: str = "fastest",
cuda_device_index: int = 0,
max_iterations: int = 0,
energy: float = unit.kilocalories_per_mole,
length: float = unit.angstroms,
) -> dict[str, Any]:
"""Optimize a ligand in a protein pocket using OpenMM.
:param protein_file: Path to the protein file.
:param ligand_file: Path to the ligand file.
:param output_file: Path to save the optimized ligand to.
:param protein_output_file: Path to save the optimized protein to.
:param complex_output_file: Path to save the optimized protein-ligand complex to.
:param protein_gap_mask: Mask of the protein regions in the PDB file for which to skip relaxation, represented as a sequence-length string.
:param ligand_stiffness: Stiffness of the ligand chains.
:param protein_stiffness: Stiffness of the protein chains.
:param protein_residue_set: Residue set to use for relaxation. Can be either `non_hydrogen` or `c_alpha`.
:param tolerance: The tolerance to use for energy minimization.
:param allow_undefined_stereo: Whether or not to allow undefined stereochemistry.
:param prep_only: Whether or not to only prepare the ligand and protein.
:param temp_dir: The temporary directory to use.
:param name: The name of the system.
:param add_solvent: Whether or not to add solvent to the protein.
:param relax_protein: Whether or not to relax the protein.
:param remove_initial_protein_hydrogens: Whether or not to remove the initial protein hydrogens.
:param assign_each_ligand_unique_force: Whether to assign each ligand a unique force constant.
:param model_ions: Whether or not to model ions.
:param cache_files: Whether or not to cache the prepared files.
:param assign_partial_charges_manually: Whether or not to assign partial charges manually.
:param report_initial_energy_only: Whether or not to report the initial energy only.
:param platform_name: The name of the OpenMM platform to use.
:param cuda_device_index: The index of the CUDA device to use.
:param max_iterations: The maximum number of iterations to use for energy minimization.
:param energy: When relaxing the protein, the unit of energy to use.
:param length: When relaxing the protein, the unit of length to use.
:return: A dictionary containing the energy before and after minimization, and the optimized
ligand.
"""
name = protein_file.stem if name is None else name
protein_cache = temp_dir / f"{name}_prepped_protein.pdb"
protein_complex = prep_protein(
protein_file=protein_file,
temp_file=protein_cache,
relax_protein=relax_protein,
remove_initial_protein_hydrogens=remove_initial_protein_hydrogens,
add_solvent=False,
)
if not cache_files:
protein_cache.unlink(missing_ok=True)
# construct initial complex
modeller = Modeller(protein_complex.topology, protein_complex.positions)
# parse all input ligands
ligand_mol = MolFromMolFile(str(ligand_file), sanitize=not relax_protein)
ligand_frags = Chem.GetMolFrags(ligand_mol, asMols=True)
ligands, ligands_smiles = [], []
for ligand_index, ligand in enumerate(ligand_frags, start=1):
ligand_cache = temp_dir / f"{name}_prepped_ligand_{ligand_index}.sdf"
ligand, ligand_smiles = prep_ligand(
ligand_file=ligand_file,
temp_file=ligand_cache,
relax_protein=relax_protein,
allow_undefined_stereo=allow_undefined_stereo,
mol=ligand,
)
if not cache_files:
ligand_cache.unlink(missing_ok=True)
if relax_protein or assign_partial_charges_manually:
try:
ligand.assign_partial_charges(partial_charge_method="mmff94")
except Exception as e:
ligand.assign_partial_charges(partial_charge_method="zeros")
logger.warning(f"Unable to assign partial charges to {ligand_file} due to: {e}")
modeller = add_ligand_to_complex(modeller, ligand)
ligands.append(ligand)
ligands_smiles.append(ligand_smiles)
dimensions = protein_complex.getTopology().getUnitCellDimensions()
if add_solvent and dimensions is not None:
force_fields = FORCE_FIELDS_EXPLICIT
modeller.addSolvent(
dimensions, model="tip3p", padding=1.0 * nanometer, ionicStrength=0.15 * molar
)
else:
force_fields = (
FORCE_FIELDS_EXPLICIT + ["amber14/GLYCAM_06j-1.xml"]
if model_ions
else FORCE_FIELDS_IMPLICIT
)
num_particles_protein = len(protein_complex.positions)
num_particles_ligand = sum([len(ligand.conformers[0].magnitude) for ligand in ligands])
num_particles_total = len(modeller.getPositions())
assert (
num_particles_ligand == num_particles_total - num_particles_protein
), "Number of ligand particles does not match expectation."
# generate system
system = generate_system(
modeller=modeller,
ligands=ligands,
force_fields=force_fields,
num_particles_protein=num_particles_protein,
num_particles_total=num_particles_total,
cache_dir=temp_dir,
name=name,
relax_protein=relax_protein,
)
if prep_only:
return {}
platform = (
get_fastest_platform()
if platform_name == "fastest"
else Platform.getPlatformByName(platform_name)
)
platform_properties = {}
if platform.getName() == "CUDA":
platform_properties["DeviceIndex"] = str(cuda_device_index)
logger.info(f"Using platform: {platform.getName()}:{cuda_device_index}")
else:
logger.info(f"Using platform: {platform.getName()}")
if relax_protein:
# collect metadata
if protein_gap_mask is None:
protein_gap_mask = "0" * protein_complex.topology.getNumResidues()
num_residues = len(protein_gap_mask)
reference_modeller = modeller
# add protein constraint
force = CustomExternalForce("0.5 * protein_k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)")
force.addGlobalParameter("protein_k", protein_stiffness)
for p in ["x0", "y0", "z0"]:
force.addPerParticleParameter(p)
for i, atom in enumerate(reference_modeller.topology.atoms()):
if atom.residue.index < num_residues and protein_gap_mask[atom.residue.index] == "1":
continue
if atom.residue.index >= num_residues:
continue
if will_restrain(atom, protein_residue_set):
force.addParticle(i, reference_modeller.positions[i])
system.addForce(force)
# add ligand constraint(s)
if assign_each_ligand_unique_force:
ligand_index = 0
num_ligand_atoms_observed = 0
num_atoms_in_ligand = len(ligands[0].atoms)
ligand_force = CustomExternalForce(
f"0.5 * ligand_k_{ligand_index} * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)"
)
ligand_force.addGlobalParameter(f"ligand_k_{ligand_index}", ligand_stiffness)
for p in ["x0", "y0", "z0"]:
ligand_force.addPerParticleParameter(p)
for i, atom in enumerate(reference_modeller.topology.atoms()):
if atom.residue.index < num_residues:
continue
if num_ligand_atoms_observed == num_atoms_in_ligand:
# aggregate intermediate ligand forces
ligand_index += 1
if ligand_index < len(ligands):
system.addForce(ligand_force)
num_ligand_atoms_observed = 0
num_atoms_in_ligand = len(ligands[ligand_index].atoms)
ligand_force = CustomExternalForce(
f"0.5 * ligand_k_{ligand_index} * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)"
)
ligand_force.addGlobalParameter(
f"ligand_k_{ligand_index}", ligand_stiffness
)
for p in ["x0", "y0", "z0"]:
ligand_force.addPerParticleParameter(p)
else:
break
if will_restrain(atom, protein_residue_set):
ligand_force.addParticle(i, reference_modeller.positions[i])
num_ligand_atoms_observed += 1
if ligand_index > 0:
# add initial (and/or final) ligand force
system.addForce(ligand_force)
else:
ligand_force = CustomExternalForce("0.5 * ligand_k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)")
ligand_force.addGlobalParameter("ligand_k", ligand_stiffness)
for p in ["x0", "y0", "z0"]:
ligand_force.addPerParticleParameter(p)
for i, atom in enumerate(reference_modeller.topology.atoms()):
if atom.residue.index < num_residues:
continue
if will_restrain(atom, protein_residue_set):
ligand_force.addParticle(i, reference_modeller.positions[i])
system.addForce(ligand_force)
simulation = setup_simulation(
modeller,
system,
platform,
platform_properties,
temperature=0.0 if relax_protein else 300.0 * kelvin,
friction_coeff=0.01 if relax_protein else 1.0 / picosecond,
step_size=0.0 if relax_protein else 0.002 * picosecond,
)
results_dict = {}
if relax_protein:
# save initial state
state_before = simulation.context.getState(getEnergy=True, getPositions=True)
results_dict["e_init"] = state_before.getPotentialEnergy().value_in_unit(energy)
results_dict["pos_init"] = state_before.getPositions(asNumpy=True).value_in_unit(length)
if report_initial_energy_only:
return results_dict
# minimize
logger.info(f"Minimizing {name}")
simulation.minimizeEnergy(tolerance=tolerance, maxIterations=max_iterations)
# save final state
logger.info(f"Saving {name}")
state_after = simulation.context.getState(getEnergy=True, getPositions=True)
results_dict["e_final"] = state_after.getPotentialEnergy().value_in_unit(energy)
results_dict["pos_final"] = state_after.getPositions(asNumpy=True).value_in_unit(length)
# record protein PDB file
if protein_file.suffix == ".pdb":
with open(protein_output_file, "w") as f:
PDBFile.writeFile(
protein_complex.topology,
results_dict["pos_final"][:num_particles_protein],
f,
keepIds=True,
)
remove_hydrogens_from_pdb(str(protein_output_file), str(protein_output_file))
# record complex PDB file
if complex_output_file is not None:
with open(complex_output_file, "w") as f:
PDBFile.writeFile(modeller.topology, results_dict["pos"], f, keepIds=True)
# record protein mmCIF file
elif protein_file.suffix == ".cif":
with open(protein_output_file, "w") as f:
PDBxFile.writeFile(
protein_complex.topology,
results_dict["pos_final"][:num_particles_protein],
f,
keepIds=True,
)
remove_hydrogens_from_pdb(str(protein_output_file), str(protein_output_file))
# record complex mmCIF file
if complex_output_file is not None:
with open(complex_output_file, "w"):
PDBxFile.writeFile(modeller.topology, results_dict["pos"], f, keepIds=True)
# record ligand file(s)
if output_file is not None:
new_mols = []
num_ligand_atoms_observed = 0
for ligand_index in range(len(ligands)):
num_atoms_in_ligand = len(ligands[ligand_index].atoms)
pos_start_index = num_particles_protein + num_ligand_atoms_observed
pos_end_index = pos_start_index + num_atoms_in_ligand
new_molecule = Molecule.from_smiles(
ligands_smiles[ligand_index], allow_undefined_stereo=allow_undefined_stereo
)
new_molecule.add_conformer(
openff_Quantity(
results_dict["pos_final"][pos_start_index:pos_end_index], units="angstrom"
)
)
new_mol = new_molecule.to_rdkit()
new_mol = remove_mol_hydrogens_and_reorder(new_mol)
new_mols.append(new_mol)
num_ligand_atoms_observed += num_atoms_in_ligand
new_mol = combine_molecules(new_mols)
with Chem.SDWriter(str(output_file)) as f:
f.write(new_mol)
else:
# save initial state
state_before = simulation.context.getState(getEnergy=True, getPositions=False)
results_dict["e_init"] = state_before.getPotentialEnergy()
if report_initial_energy_only:
return results_dict
# minimize
logger.info(f"Minimizing {name}")
simulation.minimizeEnergy(
tolerance=tolerance * kilojoule / mole / nanometer, maxIterations=max_iterations
)
# save final state
logger.info(f"Saving {name}")
state_after = simulation.context.getState(getEnergy=True, getPositions=True)
results_dict["e_final"] = state_after.getPotentialEnergy()
# save ligand(s)
ligand_mols = []
num_ligand_atoms_observed = 0
for ligand_index in range(len(ligands)):
num_atoms_in_ligand = len(ligands[ligand_index].atoms)
pos_start_index = num_particles_protein + num_ligand_atoms_observed
pos_end_index = pos_start_index + num_atoms_in_ligand
ligand_positions = state_after.getPositions(asNumpy=True)[
pos_start_index:pos_end_index
]
ligands[ligand_index].add_conformer(ligand_positions)
ligand_mols.append(ligands[ligand_index].to_rdkit())
num_ligand_atoms_observed += num_atoms_in_ligand
ligand = combine_molecules(ligand_mols)
if output_file is not None:
save_with_rdkit(ligand, output_file, conformer_index=1, name=name, mol=ligand)
results_dict["ligands"] = ligands
return results_dict
[docs]
@hydra.main(
version_base="1.3",
config_path="../../configs/model",
config_name="minimize_energy.yaml",
)
def minimize_energy(cfg: DictConfig) -> Optional[Dict[str, Any]]:
"""Minimize the energy of a ligand in a protein pocket using OpenMM."""
logger.setLevel(cfg.log_level)
protein_file_path = Path(cfg.protein_file)
ligand_file_path = Path(cfg.ligand_file)
output_file_path = Path(cfg.output_file)
protein_output_file_path = Path(cfg.protein_output_file) if cfg.protein_output_file else None
complex_output_file_path = Path(cfg.complex_output_file) if cfg.complex_output_file else None
temp_directory = Path(cfg.temp_dir)
prep_only = cfg.prep_only
if not protein_file_path.exists():
raise FileNotFoundError(f"File does not exist: {protein_file_path}")
if not ligand_file_path.exists():
raise FileNotFoundError(f"File does not exist: {ligand_file_path}")
temp_protein_file_path, temp_ligand_file_path = protein_file_path, ligand_file_path
if cfg.relax_protein and not prep_only:
assert (
protein_output_file_path is not None
), "Protein output file path must be provided when relaxing the protein."
temp_protein_file_path = Path(str(protein_file_path).replace(".pdb", "_temp.pdb"))
temp_ligand_file_path = Path(str(ligand_file_path).replace(".sdf", "_temp.sdf"))
shutil.copyfile(protein_file_path, temp_protein_file_path)
shutil.copyfile(ligand_file_path, temp_ligand_file_path)
logger.info("Performing iteration 0 of complex relaxation")
try:
num_attempts = 0
results_dict = optimize_ligand_in_pocket(
protein_file=temp_protein_file_path,
ligand_file=temp_ligand_file_path,
output_file=output_file_path,
protein_output_file=protein_output_file_path,
complex_output_file=complex_output_file_path,
temp_dir=temp_directory,
prep_only=prep_only,
name=cfg.name,
platform_name=cfg.platform,
cuda_device_index=cfg.cuda_device_index,
add_solvent=cfg.add_solvent,
relax_protein=cfg.relax_protein,
remove_initial_protein_hydrogens=cfg.remove_initial_protein_hydrogens,
assign_each_ligand_unique_force=cfg.assign_each_ligand_unique_force,
report_initial_energy_only=cfg.report_initial_energy_only,
model_ions=cfg.model_ions,
cache_files=cfg.cache_files,
assign_partial_charges_manually=cfg.assign_partial_charges_manually,
tolerance=2.39 if cfg.relax_protein else 0.01,
)
except Exception as e:
logger.error(
f"Complex relaxation was not fully successful due to: {e}. Copying the input files as the relaxed output files."
)
# organize output files
shutil.copyfile(ligand_file_path, output_file_path)
try:
if complex_output_file_path is not None:
os.remove(complex_output_file_path)
except OSError:
pass
logger.info(f"Finalizing complex relaxation with relax_protein={cfg.relax_protein}")
if cfg.relax_protein:
# organize output files
shutil.copyfile(protein_file_path, protein_output_file_path)
# clean up temporary files
try:
os.remove(temp_protein_file_path)
except OSError:
pass
try:
os.remove(temp_ligand_file_path)
except OSError:
pass
temp_name = temp_protein_file_path.stem if cfg.name is None else cfg.name
try:
os.remove(temp_directory / f"{temp_name}_prepped_protein.pdb")
except OSError:
pass
try:
os.remove(temp_directory / f"{temp_name}_prepped_ligand.sdf")
except OSError:
pass
return None
if prep_only:
sys.exit(0)
if not cfg.relax_protein:
if cfg.report_initial_energy_only:
energy_before = results_dict["e_init"].value_in_unit(kilojoule / mole)
logger.info(f"{ligand_file_path}, " + f"E_start: {energy_before:.2f} kJ/mol, ")
else:
energy_before = results_dict["e_init"].value_in_unit(kilojoule / mole)
energy_after = results_dict["e_final"].value_in_unit(kilojoule / mole)
logger.info(
f"{ligand_file_path}, "
+ f"E_start: {energy_before:.2f} kJ/mol, "
+ f"E_end: {energy_after:.2f} kJ/mol, "
+ f"ΔE: {energy_after - energy_before:.2f} kJ/mol"
)
else:
try:
shutil.copyfile(protein_output_file_path, temp_protein_file_path)
while results_dict["e_final"] > 0 and num_attempts < cfg.max_num_attempts:
logger.info(f"Performing iteration {num_attempts + 1} of complex relaxation")
results_dict = optimize_ligand_in_pocket(
protein_file=temp_protein_file_path,
ligand_file=temp_ligand_file_path,
output_file=output_file_path,
protein_output_file=protein_output_file_path,
complex_output_file=complex_output_file_path,
temp_dir=temp_directory,
prep_only=prep_only,
name=cfg.name,
platform_name=cfg.platform,
cuda_device_index=cfg.cuda_device_index,
add_solvent=cfg.add_solvent,
relax_protein=cfg.relax_protein,
assign_each_ligand_unique_force=cfg.assign_each_ligand_unique_force,
report_initial_energy_only=cfg.report_initial_energy_only,
model_ions=cfg.model_ions,
cache_files=cfg.cache_files,
tolerance=2.39 if cfg.relax_protein else 0.01,
)
num_attempts += 1
shutil.copyfile(protein_output_file_path, temp_protein_file_path)
# measure initial protein violations
protein_score, violation_residue_idx = compute_protein_local_geometry_violations(
str(temp_protein_file_path)
)
num_attempts = 0
while (
protein_score > 0 or results_dict["e_final"] > 0
) and num_attempts < cfg.max_num_attempts:
parser = (
MMCIFParser(QUIET=True)
if os.path.splitext(temp_protein_file_path)[-1] == ".cif"
else PDBParser(QUIET=True)
)
structure = parser.get_structure("x", temp_protein_file_path)
randomly_rotate_protein_side_chains(
structure, violation_residue_idx, min_angle=-np.pi / 2, max_angle=np.pi / 2
)
save_biopython_structure_to_pdb(structure, temp_protein_file_path, ca_only=False)
results_dict = optimize_ligand_in_pocket(
protein_file=temp_protein_file_path,
ligand_file=temp_ligand_file_path,
output_file=output_file_path,
protein_output_file=protein_output_file_path,
complex_output_file=complex_output_file_path,
temp_dir=temp_directory,
prep_only=prep_only,
name=cfg.name,
platform_name=cfg.platform,
cuda_device_index=cfg.cuda_device_index,
add_solvent=cfg.add_solvent,
relax_protein=cfg.relax_protein,
assign_each_ligand_unique_force=cfg.assign_each_ligand_unique_force,
report_initial_energy_only=cfg.report_initial_energy_only,
model_ions=cfg.model_ions,
cache_files=cfg.cache_files,
tolerance=2.39 if cfg.relax_protein else 0.01,
)
num_attempts += 1
shutil.copyfile(protein_output_file_path, temp_protein_file_path)
# measure subsequent protein violations
protein_score, violation_residue_idx = compute_protein_local_geometry_violations(
str(temp_protein_file_path)
)
# measure final ligand violations
ligand_score = compute_ligand_local_geometry_violations(
# NOTE: passing `None` for the first argument will ensure the relaxed ligand's
# pairwise distances closely match a random RDKit conformer's pairwise distances
None,
str(temp_ligand_file_path),
)
if (
protein_score > 0
or ligand_score > 0
or results_dict["e_final"] > cfg.max_final_e_value
):
logger.warning(
f"Complex relaxation was not fully successful after all relaxation attempts and scoring, yielding a final protein score of {protein_score}, a final (maximum) ligand score of {ligand_score}, and a final E-value of {results_dict['e_final']}. Copying the input files as the relaxed output files."
)
shutil.copyfile(protein_file_path, protein_output_file_path)
shutil.copyfile(ligand_file_path, output_file_path)
try:
if complex_output_file_path is not None:
os.remove(complex_output_file_path)
except OSError:
pass
except Exception as e:
logger.error(
f"Complex relaxation was not fully successful due to: {e}. Copying the input files as the relaxed output files."
)
# organize output files
shutil.copyfile(protein_file_path, protein_output_file_path)
shutil.copyfile(ligand_file_path, output_file_path)
try:
if complex_output_file_path is not None:
os.remove(complex_output_file_path)
except OSError:
pass
logger.info(f"Finalizing complex relaxation with relax_protein={cfg.relax_protein}")
# clean up temporary files
try:
os.remove(temp_protein_file_path)
except OSError:
pass
try:
os.remove(temp_ligand_file_path)
except OSError:
pass
temp_name = temp_protein_file_path.stem if cfg.name is None else cfg.name
try:
os.remove(temp_directory / f"{temp_name}_prepped_protein.pdb")
except OSError:
pass
try:
os.remove(temp_directory / f"{temp_name}_prepped_ligand.sdf")
except OSError:
pass
return results_dict
[docs]
@hydra.main(
version_base="1.3",
config_path="../../configs/model",
config_name="minimize_energy.yaml",
)
def main(cfg: DictConfig):
"""Minimize the energy of a ligand in a protein pocket."""
minimize_energy(cfg)
if __name__ == "__main__":
register_custom_omegaconf_resolvers()
main()