Commit 49ac416b authored by Ninjani's avatar Ninjani
Browse files

added some comments

parent f5b23f7e
......@@ -4,7 +4,9 @@
- modeller
- RDKit
- shap
- xgboost
- XGBoost
- APBS
- PDB2PQR
- geometricus
- caretta
......
......@@ -12,6 +12,9 @@ import shap
def get_features_dict(features_dir, keys, num_models=5, sequence_only=False, model_dir=None):
"""
Load features from files generated by make_data
"""
features_dict = defaultdict(list)
features_dir = Path(features_dir)
for key in keys:
......@@ -246,7 +249,20 @@ def normalize(row):
return (row - maxv) / (maxv - minv)
def get_compound_features(compound_smiles: list):
def get_compound_features(compound_smiles: list, train_smiles: list):
"""
Get Morgan fingerprints for a set of compounds defined by their SMILES strings.
Parameters
----------
compound_smiles: list of all smiles
train_smiles: list of smiles in training set, used to decide which indices in the morgan bit vector to keep
Returns
-------
array of morgan fingerprint counts for each compound
"""
smiles_to_product_mol = {}
for s in compound_smiles:
mol = Chem.MolFromSmiles(s)
......@@ -262,7 +278,7 @@ def get_compound_features(compound_smiles: list):
useChirality=False,
useFeatures=False,
bitInfo=mol_to_bitinfo[p])
bits_product = list(frozenset().union(*(set(mol_to_bitinfo[s].keys()) for s in compound_smiles)))
bits_product = list(frozenset().union(*(set(mol_to_bitinfo[s].keys()) for s in train_smiles)))
compound_features = np.zeros((len(compound_smiles), len(bits_product)))
for c, p in enumerate(compound_smiles):
counts = mol_to_morgan_fingerprints[p].GetNonzeroElements()
......@@ -274,6 +290,10 @@ def get_compound_features(compound_smiles: list):
def make_y(protein_compound_mapping, compound_smiles, multi=False):
"""
Go from a dictionary mapping a protein to its products, to an array of 1s and 0s.
If multi=False, only considers the first compound in the mapping
"""
product_y = {}
for key in protein_compound_mapping:
if multi:
......@@ -284,6 +304,9 @@ def make_y(protein_compound_mapping, compound_smiles, multi=False):
def get_matchmaker_product_predictions(keys, prediction_product):
"""
Go from predicted probabilities to preducted compounds
"""
predictions = []
probabilities = []
for i, k in enumerate(keys):
......@@ -301,6 +324,9 @@ class Matchmaker:
self.y = None
def fit(self, aln_features, compound_features, y, **kwargs):
"""
Fit Matchmaker to training set of aligned protein features with all possible compound features
"""
self.compound_features = compound_features
X_protein = np.vstack([a for a in aln_features for _ in range(len(compound_features))])
X_compound = np.vstack([compound_features[i] for _ in range(len(aln_features)) for i in range(len(compound_features))])
......@@ -311,6 +337,9 @@ class Matchmaker:
self.clf.fit(X, y)
def predict_proba(self, aln_features):
"""
Get compatibility scores for test proteins
"""
X_protein = np.vstack([a for a in aln_features for _ in range(len(self.compound_features))])
X_compound = np.vstack([self.compound_features[i] for _ in range(len(aln_features)) for i in range(len(self.compound_features))])
return self.clf.predict_proba(np.hstack([X_protein, X_compound]))
......@@ -328,13 +357,29 @@ class Matchmaker:
class MatchmakerExplainer:
def __init__(self, trained_clf, X, y, len_alignment, num_per_feature):
"""
Make SHAP explainer from trained Matchmaker classifier.
Parameters
----------
trained_clf: trained Matchmaker classifier
X: feature matrix used for training
y: response variable used for training
len_alignment: number of alignment positions used to calculate protein feature matrix
num_per_feature: number of indices used by each feature (e.g. pssm uses 20, fluctuations use 1)
"""
self.y = y
self.explainer = shap.TreeExplainer(trained_clf)
self.shap_values = self.explainer.shap_values(X)
self.len_alignment = len_alignment
self.num_per_feature = num_per_feature
def explain(self, protein_indices, num_compounds, keep="positive"):
def explain(self, protein_indices, num_compounds, compound_index, keep="positive"):
"""
Get residue predictivity scores calculated across certain proteins for a
certain compound
"""
def sum_importances(row, keep="positive"):
per_residue = np.zeros(self.len_alignment)
start = 0
......@@ -355,7 +400,7 @@ class MatchmakerExplainer:
return per_residue
indices = []
for i in protein_indices:
indices += [i * num_compounds + x for x in np.where(self.y[i * num_compounds: (i+1) * num_compounds] == 1)[0]]
indices += [i * num_compounds + compound_index]
indices = np.array(indices)
residue_scores = np.mean([sum_importances(self.shap_values[indices])], axis=0)
return residue_scores
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment