Source code for posebench.data.binding_site_crop_preparation

# -------------------------------------------------------------------------------------------------------------------------------------
# Following code curated for PoseBench: (https://github.com/BioinfoMachineLearning/PoseBench)
# -------------------------------------------------------------------------------------------------------------------------------------

import logging
import os

import hydra
import rootutils
from beartype import beartype
from beartype.typing import Any, List, Tuple
from Bio.PDB import PDBIO, NeighborSearch, PDBParser, Select
from omegaconf import DictConfig
from rdkit import Chem
from tqdm import tqdm

rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)

from posebench import register_custom_omegaconf_resolvers
from posebench.utils.data_utils import parse_inference_inputs_from_dir

logging.basicConfig(format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


[docs] class BindingSiteSelect(Select): """Custom Select class to filter residues based on binding site residue indices.""" def __init__(self, structure_residues: List[Any], binding_site_residue_indices: List[int]): """Initialize the BindingSiteSelect class.""" self.structure_residues = structure_residues self.binding_site_residue_indices = set(binding_site_residue_indices)
[docs] def accept_residue(self, residue: Any) -> bool: """Accept residues based on whether they are part of the binding site or not. :param residue: Residue object from the Bio.PDB module. :return: Boolean indicating whether the residue is part of the binding site. """ return self.structure_residues.index(residue) in self.binding_site_residue_indices
[docs] @beartype def get_binding_site_residue_indices( protein_filepath: str, ligand_filepath: str, protein_ligand_distance_threshold: float = 4.0, num_buffer_residues: int = 7, ) -> List[int]: """Get the zero-based residue indices of the protein binding site based on native protein- ligand interactions. :param protein_filepath: Path to the protein structure PDB file. :param ligand_filepath: Path to the ligand structure SDF file. :param protein_ligand_distance_threshold: Heavy-atom distance threshold (in Angstrom) to use for finding protein binding site residues in interaction with ligand heavy atoms. :param num_buffer_residues: Number of residues to include as a buffer around each binding site residue. :return: List of zero-based residue indices that define the binding site. """ assert os.path.exists( protein_filepath ), f"Protein structure file `{protein_filepath}` does not exist." assert os.path.exists( ligand_filepath ), f"Ligand structure file `{ligand_filepath}` does not exist." # parse the protein structure parser = PDBParser(QUIET=True) structure = parser.get_structure("protein", protein_filepath) # load the ligand using RDKit ligand = Chem.MolFromMolFile(ligand_filepath, removeHs=True) assert ligand is not None, "Failed to load ligand structure." # get the coordinates of ligand heavy atoms ligand_conformer = ligand.GetConformer() ligand_coords = [ ligand_conformer.GetAtomPosition(i) for i in range(ligand.GetNumAtoms()) if ligand.GetAtomWithIdx(i).GetAtomicNum() > 1 ] # flatten the list of coordinates into tuples ligand_coords = [(atom.x, atom.y, atom.z) for atom in ligand_coords] # extract all heavy atoms from the protein structure all_heavy_atoms = [atom for atom in structure.get_atoms() if atom.element != "H"] # use NeighborSearch to find all heavy atoms within the specified distance from any ligand heavy atom neighbor_search = NeighborSearch(all_heavy_atoms) nearby_atoms = set() for coord in ligand_coords: nearby_atoms.update(neighbor_search.search(coord, protein_ligand_distance_threshold)) # collect the zero-based indices of residues that these atoms belong to binding_site_residue_indices = set() structure_residues = list(structure.get_residues()) for atom in nearby_atoms: residue = atom.get_parent() binding_site_residue_indices.add(structure_residues.index(residue)) assert ( len(binding_site_residue_indices) > 0 ), f"No binding site residues found for ligand associated with {protein_filepath}." # pad and connect the binding site residue indices with sequence-regional buffer residues padded_binding_site_residue_indices = list( range( max(0, min(binding_site_residue_indices) - num_buffer_residues), min( len(structure_residues) - 1, max(binding_site_residue_indices) + num_buffer_residues, ), ) ) return padded_binding_site_residue_indices
[docs] @beartype def crop_protein_binding_site( protein_filepath: str, binding_site_residue_indices: List[int], output_dir: str, pdb_id: str, filename_midfix: str = "", filename_suffix: str = "", ): """Crop the protein binding site and save it to a separate file. :param protein_filepath: Path to the input protein structure file. :param binding_site_residue_indices: List of zero-based residue indices that define the binding site. :param output_dir: Path to the output directory. :param pdb_id: PDB ID of the protein-ligand complex. :param filename_midfix: Optional "midfix" to insert into the cropped protein structure filename. :param filename_suffix: Optional suffix to append to the cropped protein structure filename. """ assert os.path.exists( protein_filepath ), f"Protein structure file `{protein_filepath}` does not exist." assert ( len(binding_site_residue_indices) > 0 ), "The binding site residue indices list must contain at least one residue index." assert os.path.exists(output_dir), f"Output directory `{output_dir}` does not exist." # parse the protein structure parser = PDBParser(QUIET=True) structure = parser.get_structure(pdb_id, protein_filepath) # create a selection class instance with the binding site indices select = BindingSiteSelect(list(structure.get_residues()), binding_site_residue_indices) # define the output file path output_filepath = os.path.join( output_dir, f"{pdb_id}{filename_midfix}_protein{filename_suffix}.pdb" ) # write the selected residues to the output file io = PDBIO() io.set_structure(structure) io.save(output_filepath, select)
[docs] @beartype def save_cropped_protein_binding_site( smiles_and_pdb_id_list: List[Tuple[Any, str]], input_data_dir: str, input_protein_structure_dir: str, protein_ligand_distance_threshold: float = 4.0, num_buffer_residues: int = 7, ): """Save the cropped protein binding site to a separate file for each protein-ligand complex. :param smiles_and_pdb_id_list: A list of tuples each containing a SMILES string and a PDB ID. :param dataset: Dataset name. :param input_data_dir: Path to directory of input protein-ligand complex subdirectories. :param input_protein_structure_dir: Path to the directory containing the protein structure input files. :param protein_ligand_distance_threshold: Heavy-atom distance threshold (in Angstrom) to use for finding protein binding site residues in interaction with ligand heavy atoms. :param num_buffer_residues: Number of residues to include as a buffer around each binding site residue. """ output_protein_structure_dir = input_protein_structure_dir + "_bs_cropped" os.makedirs(output_protein_structure_dir, exist_ok=True) for _, pdb_id in tqdm(smiles_and_pdb_id_list, desc="Cropping protein binding sites"): pred_protein_filepath = os.path.join( input_protein_structure_dir, f"{pdb_id}_holo_aligned_predicted_protein.pdb", ) ref_protein_filepath = os.path.join(input_data_dir, pdb_id, f"{pdb_id}_protein.pdb") ref_ligand_filepath = os.path.join(input_data_dir, pdb_id, f"{pdb_id}_ligand.sdf") if not os.path.exists(pred_protein_filepath): logger.warning( f"Predicted protein structure file `{pred_protein_filepath}` does not exist. Skipping binding site cropping for this complex..." ) continue assert os.path.exists( ref_protein_filepath ), f"Reference (ground-truth) protein structure file `{ref_protein_filepath}` does not exist." assert os.path.exists( ref_ligand_filepath ), f"Reference (ground-truth) ligand structure file `{ref_ligand_filepath}` does not exist." binding_site_residue_indices = get_binding_site_residue_indices( ref_protein_filepath, ref_ligand_filepath, protein_ligand_distance_threshold=protein_ligand_distance_threshold, num_buffer_residues=num_buffer_residues, ) crop_protein_binding_site( pred_protein_filepath, binding_site_residue_indices, output_protein_structure_dir, pdb_id, filename_midfix="_holo_aligned_predicted", ) crop_protein_binding_site( ref_protein_filepath, binding_site_residue_indices, os.path.join(input_data_dir, pdb_id), pdb_id, filename_suffix="_bs_cropped", )
[docs] @hydra.main( version_base="1.3", config_path="../../configs/data", config_name="binding_site_crop_preparation.yaml", ) def main(cfg: DictConfig): """Parse a data directory containing subdirectories of protein-ligand complexes and prepare corresponding inference CSV file for the DiffDock model. :param cfg: Configuration dictionary from the hydra YAML file. """ smiles_and_pdb_id_list = parse_inference_inputs_from_dir(cfg.input_data_dir) save_cropped_protein_binding_site( smiles_and_pdb_id_list, cfg.input_data_dir, cfg.input_protein_structure_dir, protein_ligand_distance_threshold=cfg.protein_ligand_distance_threshold, num_buffer_residues=cfg.num_buffer_residues, ) logger.info(f"Protein binding site cropping for dataset `{cfg.dataset}` complete.")
if __name__ == "__main__": register_custom_omegaconf_resolvers() main()