import warnings
import collections
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from typing import Union, Optional
from pharmacophore.constants import feature_factory, FEATURES, FEATURE_COLORS, color_convert
[docs]
class Pharmacophore:
def __init__(self, features: Union[str, dict] = 'default'):
"""
Initialize the Pharmacophore class.
:param features: Union[str, dict]
Set features to use. Will default to 'default'. Can also accept RDKit features or a dictionary of features.
"""
self.sdf = None
self.features = features
self.features_list = None
[docs]
def read_sdf(self, sdf_file: str = None, verbose: bool = False):
"""
Read sdf files.
:param sdf_file: Str
File path to .sdf file.
:param verbose: Bool
Output description when reading .sdf file.
:return: list
A list of molecules in Chem.Mol format.
"""
supplier = Chem.SDMolSupplier(sdf_file)
# extract ROMols into a list
mol_list = []
if verbose:
for mol in tqdm(supplier, desc="Reading Molecules"):
mol_list.append(mol)
else:
for mol in supplier:
mol_list.append(mol)
self.sdf = mol_list
return mol_list
[docs]
def feature_types(self, features: Optional[Union[str, dict]] = None):
"""
Output feature types that will be displayed in the pharmacophore model. This will default to 'default' settings.
However, default RDKit features can also be used or a dictionary containing a dictionary of features with SMARTS
string can be used.
:param features: Optional[Union[str, dict]]
The feature type to be used in the pharmacophore model. This will default to "default" settings. Using
"rdkit" will set the model to default RDKit features. Custom features can be used by giving a dictionary
with feature:list pair, where the list contains SMARTS string for a given feature.
:return:
"""
# set instance variable
if features is None:
features = self.features
global phrase
if features == "default":
phrase = f"Default features: \n('Donor', 'Acceptor', 'Aromatic', 'Hydrophobe')"
elif features == "rdkit":
phrase = f"Default features from RDKit: \n{feature_factory.GetFeatureFamilies()}"
elif isinstance(features, dict):
phrase = f"Custom features: \n{list(features.keys())}"
return phrase
[docs]
def to_df(self, mols: list = None, mol_name: list = None, features: Optional[Union[str, dict]] = None):
"""
From a list of containing Chem.MOl and a list containing molecule names, create a dataframe displaying matching
features the molecules. Method will default to "default" settings. Users can use "rdkit" to set the model to
default RDKit features, or give a dictionary with feature:list pair, where the list contains SMARTS string for a
given feature.
:param mols: list
A list containing molecules in Chem.Mol format.
:param mol_name: list
A list containing names of molecules.
:param features: Union[str, dict]
The feature type to be used in the pharmacophore model. This will default to "default" settings. Using
"rdkit" will set the model to default RDKit features. Custom features can be used by giving a dictionary
with feature:list pair, where the list contains SMARTS string for a given feature.
:return:
"""
global feat_factory
# set instance variable
if features is None:
features = self.features
molecule_feature_frequencies = []
# use default features
if features == 'rdkit':
feat_factory = feature_factory
# get default features, count frequency, and append into list
for mol in mols:
feats = [feature.GetFamily() for feature in feat_factory.GetFeaturesForMol(mol)]
feature_frequency = collections.Counter(feats)
molecule_feature_frequencies.append(feature_frequency)
# use default features
elif features == 'default':
feature_list = []
# get features from constant file, count pharmacophore type, append to list
for mol in mols:
feats = self._calc_pharmacophore(mol)
for feat in feats:
feature_list.append(feat[0])
feature_frequency = collections.Counter(feature_list)
molecule_feature_frequencies.append(feature_frequency)
feature_list = [] # reset list to avoid double counting
# use custom features
elif isinstance(features, dict):
# reset instance variable
self.features = features
feature_list = []
# get features from constant file, count pharmacophore type, append to list
for mol in mols:
feats = self._calc_pharmacophore(mol, features)
for feat in feats:
feature_list.append(feat[0])
feature_frequency = collections.Counter(feature_list)
molecule_feature_frequencies.append(feature_frequency)
feature_list = [] # reset list to avoid double counting
elif isinstance(features, dict) is False:
raise ValueError("Only 'default', 'rdkit', or custom features as dictionary are supported!")
# check if mol_name is given
if mol_name is None:
mol_name = []
for x in range(len(mols)):
name = "Mol" + str(x)
mol_name.append(name)
# # for troubleshooting
# print(molecule_feature_frequencies)
# reformat table and rename columns to molecule list
df = pd.DataFrame(molecule_feature_frequencies)
feature_frequencies_df = df.transpose()
feature_frequencies_df = feature_frequencies_df.fillna(0).astype(int)
feature_frequencies_df = feature_frequencies_df.set_axis(mol_name, axis=1)
return feature_frequencies_df
[docs]
def calc_pharm(self, mol: Chem.Mol = None, features: Optional[Union[str, dict]] = None):
"""
Generate a list of pharmacophore features and position from a molecule.
:param mol: Chem.Mol
A molecule in ROMol format.
:param features: str
Designate the type of pharmacophore features to calculate from. Will default to "default" settings. This
param can also accept "rdkit" to use RDKit default features or give a dictionary containing feature:list
pair, where the list contains SMARTS string for a given feature.
:return: list
A list of pharmacophore features, matching atom index, and position from a molecule.
"""
global pharmacophore
# set instance variable
if features is None:
features = self.features
# calculate pharmacophore by rdkit or pharmacophore dict
if features == 'rdkit':
pharmacophore = self._calc_rdkit(mol)
elif features == 'default':
pharmacophore = self._calc_pharmacophore(mol)
elif isinstance(features, dict):
pharmacophore = self._calc_pharmacophore(mol, features)
else:
raise ValueError("Only 'default', 'rdkit', or custom features as dictionary are supported!")
# set features to self
self.features_list = pharmacophore
return pharmacophore
[docs]
def add_feats(self, mol: Chem.Mol = None, substruct: str = None, type: str = None):
"""
Optional method to add specific features if not present using default or RDKit rules.
:param mol: Chem.Mol
Main molecule in ROMol format.
:param substruct: str
A SMARTS string for specific substructure to match
:param type: str
Set the type of interaction for substructure. Only 'Donor', 'Acceptor', 'Aromatic', and 'Hydrophobe' is
allowed!
:return:
"""
# convert substruct smarts string into ROMol
query = [Chem.MolFromSmarts(substruct)]
# find matches
matches = find_matches(mol, query)
# check type
if type == 'Donor' or type == 'Acceptor' or type == 'Aromatic' or type == 'Hydrophobe':
pass
else:
raise ValueError(f"Type {type} not supported! Only 'Donor', 'Acceptor', 'Aromatic', and 'Hydrophobe' is "
f"allowed!")
# obtain specific data from each match
for match in matches:
result = [type, match[0], match[1][0], match[1][1], match[1][2]]
# add custom feats to self.features_list
self.features_list.append(result)
return self.features_list
[docs]
def output_features(self, feature_list: Optional[list] = None, savepath: str = None, type: str = "sphere",
sphere_size: float = 0.7, transparency: float = 0.2, color: dict = None):
"""
Output features as a .pml format for visualization in PyMol.
:param feature_list: Optional[list]
A list containing features, corresponding atom number, and 3D position. Preferably genreated using the
calc_pharm method.
:param savepath: str = None
Must be a file in .pml format.
:param type: str = "Sphere"
Set the pharmacophore representation. Can be "sphere" or "surface". If surface is used, transparency can be
set for each pharmacophore.
:param sphere_size: float = 0.5
Set size of spheres.
:param transparency: float = 0.2
Set teh transparency of pharmacophore. Only works when the pharmacophore type is set to "surface".
:param color: dict
Set color for the pharmacophores. Must be given as Acceptor: color where color is a tuple for RGB or a
string for a specific color to be translated into RGB format.
:return:
"""
if feature_list is None:
feature_list = self.features_list
with open(savepath, "w") as f:
# feat_factory = feature_factory
# features = feat_factory.GetFeaturesForMol(mol)
print(f"Number of features: {len(feature_list)}")
# set feature colors
if color is None:
for feat in FEATURE_COLORS:
f.write(f"set_color {feat}_color, {FEATURE_COLORS[feat]}\n")
# if custom color, convert to RGB
elif isinstance(color, dict):
for shade in color:
rgb = color_convert(color[shade])
# print(rgb) # for troubleshooting
f.write(f"set_color {shade}_color, {rgb}\n")
else:
warnings.warn("Issue with color features!")
# to give sequential numbering for each group:
feature_counts = {
"Acceptor": 0, "Donor": 0, "Hydrophobe": 0, "Aromatic": 0, "LumpedHydrophobe": 0, "PosIonizable": 0}
# get features
for feat in feature_list:
feature = feat[0] # extract feature type
feature_counts[feature] += 1 # give feature count
count = feature_counts[feature] # get current count
# get feature position
pos_x = feat[2]
pos_y = feat[3]
pos_z = feat[4]
f.write(
# f"pseudoatom {feature}_{count}, pos=[{pos_x}, {pos_y}, {pos_z}], color={color_type}\n"
f"pseudoatom {feature}_{count}, pos=[{pos_x}, {pos_y}, {pos_z}]\n"
)
#todo add mol name in front of pharmacophre
# set color and sphere size
f.write("\n")
if type == "sphere":
f.write("show spheres, Acceptor_*\n")
f.write("color acceptor_color, Acceptor_*\n")
f.write("\n")
f.write("show spheres, Donor_*\n")
f.write("color donor_color, Donor_*\n")
f.write("\n")
f.write("show spheres, Hydrophobe_*\n")
f.write("color hydrophobe_color, Hydrophobe_*\n")
f.write("\n")
f.write("show spheres, Aromatic_*\n")
f.write("color aromatic_color, Aromatic_*\n")
f.write("\n")
f.write("show spheres, LumpedHydrophobe_*\n")
f.write("color lumpedhydrophobe, LumpedHydrophobe_*\n")
f.write("\n")
f.write("show spheres, PosIonizable*\n")
f.write("color posionizable, PosIonizable*\n")
f.write("\n")
f.write(f"set sphere_scale, {sphere_size}\n") # Adjust sphere size in PyMOL
elif type == "surface":
f.write("show surface, Acceptor_*\n")
f.write("color acceptor_color, Acceptor_*\n")
f.write(f"set transparency, {transparency}, Acceptor_*")
f.write("\n")
f.write("show surface, Donor_*\n")
f.write("color donor_color, Donor_*\n")
f.write(f"set transparency, {transparency}, Donor_*\n")
f.write("\n")
f.write("show surface, Hydrophobe_*\n")
f.write("color hydrophobe_color, Hydrophobe_*\n")
f.write(f"set transparency, {transparency}, Hydrophobe_*\n")
f.write("\n")
f.write("show surface, Aromatic_*\n")
f.write("color aromatic_color, Aromatic_*\n")
f.write(f"set transparency, {transparency}, Aromatic_*\n")
f.write("\n")
f.write("show surface, LumpedHydrophobe_*\n")
f.write("color lumpedhydrophobe, LumpedHydrophobe_*\n")
f.write(f"set transparency, {transparency}, LumpedHydrophobe_*\n")
f.write("\n")
f.write("show surface, PosIonizable*\n")
f.write("color posionizable, PosIonizable*\n")
f.write(f"set transparency, {transparency}, PosIonizable*\n")
f.write("\n")
else:
raise ValueError("Issue with type {type}! Only 'sphere' or 'surface' allowed!")
print(f"Feature visualization script written to {savepath}.")
def _calc_pharmacophore(self, mol: Chem.Mol = None, features: Optional[dict] = None):
"""
Calculate pharmacophore features from a molecule using dict from constants
:param mol: Chem.Mol
Input molecule in ROMol format.
:param features: Optional[dict]
Include custom features for calculating features.
:return:
List of pharmacophore type and centroid coordinates.
"""
global pharmacophore
# read in custom feature dict
if features is None:
constant_feats = FEATURES
else:
constant_feats = features
# hold matches
matches = {}
# Identify matches to feature dict
for key, value in constant_feats.items():
try:
query = [Chem.MolFromSmarts(smarts) for smarts in value]
matches[key] = find_matches(mol, query, verbose=False)
except:
pass
# remove duplicate SMARTS matches
cleaned_matches = {}
for key, value in matches.items():
unique_lists = []
for list in value:
if list not in unique_lists:
unique_lists.append(list)
cleaned_matches[key] = unique_lists
# create list containing pharmacophore and centroid coordinates
pharmacophore = []
for key, value in cleaned_matches.items():
for match in value:
# extarct feature type, atom match, and XYZ position
p = [key, match[0], match[1][0], match[1][1], match[1][2]]
pharmacophore.append(p)
# remove duplicates for aromatic/hydrophobe set
atom_indices = set()
final_pharmacophore = []
# extract aromatic/hydrophobe matches by atom index adn remove hydrophobe
for entry in pharmacophore:
label, atom_indexes, *rest = entry
index = frozenset(atom_indexes) # Convert atom indexes to a frozenset so order does not matter
# Check if the atom indexes are already in the set
if index not in atom_indices:
final_pharmacophore.append(entry)
atom_indices.add(index)
# keep Aromatic
elif label == 'Aromatic':
final_pharmacophore = [e for e in final_pharmacophore if
not (e[0] == 'Hydrophobe' and frozenset(e[1]) == index)]
final_pharmacophore.append(entry)
atom_indices.add(index)
return final_pharmacophore
def _calc_rdkit(self, mol: Chem.Mol = None):
"""
Calculate pharmacophore features from a molecule using default RDKit methods.
:param mol: Chem.Mol
input molecule in ROMol format.
:return:
List of pharmacophore type and centroid coordinates.
"""
global pharmacophore
# load default RDKit feature factory file
feat_factory = feature_factory
# get list of features for molecule
features = feat_factory.GetFeaturesForMol(mol)
# store pharmacophore features as list
pharmacophore = []
for feature in features:
fam = feature.GetFamily()
pos = feature.GetPos()
atom_indices = feature.GetAtomIds()
pharmacophore_item = [fam, atom_indices, pos[0], pos[1], pos[2]]
pharmacophore.append(pharmacophore_item)
return pharmacophore
# Support function to fix bond order of molecule
[docs]
def fix_bond_order(mol: Chem.Mol, template_smi: str, savepath: str = None):
"""
Fix bond order for a given molecule. A template smiles string must be given.
:param mol: Chem.Mol
Molecule to fix. Must be in ROMol format.
:param template_smi: str
A smiles string for molecule to fix. Recommend canonical smiles string if possible.
:param savepath: str
Set the save path for the molecule. Output will be in sdf format.
:return:
"""
# remove hydrogen if present
mol_prep = Chem.RemoveHs(mol)
# read template
template = Chem.MolFromSmiles(template_smi)
# assign bond orders to crystal ligand
fixed_mol = AllChem.AssignBondOrdersFromTemplate(mol_prep, template)
# Map atoms from the template to the docked pose
template_mapping = mol_prep.GetSubstructMatch(template)
if not template_mapping:
raise ValueError("Failed to match template to docked pose.")
# Transfer 3D coordinates from docked_pose to fixed_mol
new_conformer = Chem.Conformer(fixed_mol.GetNumAtoms())
original_conformer = mol_prep.GetConformer()
# set coordinates of heavy atoms to fixed_mol
for new_idx, origin_idx in enumerate(template_mapping):
pos = original_conformer.GetAtomPosition(origin_idx)
new_conformer.SetAtomPosition(new_idx, pos)
# Add 3D coordinates to the molecule
fixed_mol.AddConformer(new_conformer, assignId=True)
# save file
Chem.MolToMolFile(fixed_mol, savepath)
[docs]
def find_matches(mol: Chem.Mol = None, patterns: list[Chem.Mol] = None, verbose=True):
"""
Support function to visualize matches between query molecule and features.
:param mol: Chem.Mol
Query molecule. Must be in ROMol format.
:param patterns: list[Chem.Mol]
A list of molecules to match against the query molecule.
:param verbose: Bool
Output messages for matches.
:return:
"""
matches = []
for pattern in patterns:
matched = mol.GetSubstructMatches(pattern)
# get centroid and coordinates for each match
for m in matched:
try:
centroid = _compute_match_centroid(mol, m)
matches.append([m, centroid])
except:
pass
# output statement if no matches
if verbose is True:
if len(matches) == 0:
output_message = Chem.MolToSmarts(pattern)
print(f"No Matches to {output_message}!")
return matches
return matches
def _compute_match_centroid(mol, matched_pattern):
"""
Support function to calculate centroid of matches between query molecule and features.
:param mol:
:param matched_pattern:
:return:
"""
conf = mol.GetConformer()
positions = [conf.GetAtomPosition(i) for i in matched_pattern]
center = np.mean(positions, axis=0)
# convert result to float
center = center.tolist()
return tuple(center)
if __name__ == "__main__":
import doctest
doctest.testmod()