Commit 95f8e784 authored by Patrick Scheibe

Merge branch 'fix_fpcompressed_problems'

# Conflicts:
#	scripts/run-all-cases.sh
parents 1e835bfc 7b6dfeea
@@ -14,28 +14,28 @@ from dfpl import predictions
project_directory = pathlib.Path(__file__).parent.parent.absolute()
test_train_args = options.TrainOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.csv",
inputFile=f"{project_directory}/data/Sun_etal_dataset.pkl",
outputDir=f"{project_directory}/modeltraining",
ecWeightsFile="",
type='smiles',
fpType='topological',
epochs=512,
epochs=3000,
fpSize=2048,
encFPSize=256,
enableMultiLabel=False,
testingFraction=0.2,
kFolds=5,
verbose=2,
trainAC=False,
trainAC=True,
trainFNN=True,
sampleFractionOnes=0.5,
compressFeatures=True
)
test_predict_args = options.PredictOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.cids.predictionSet.csv",
outputDir=f"{project_directory}/validation/case_01/results/",
ecWeightsFile=f"{project_directory}/validation/case_01/results/Sun_etal_dataset.AC.encoder.weights.hdf5",
ecWeightsFile=f"/home/hertelj/git-hertelj/deepFPlearn_CODE/validation/case_00/results_AC_S/ac_S.encoder.hdf5",
model=f"{project_directory}/validation/case_01/results/AR_compressed-True.full.FNN-.model.hdf5",
target="AR",
fpSize=2048,
@@ -49,17 +49,20 @@ def train(opts: options.TrainOptions):
Run the main training procedure
:param opts: Options defining the details of the training
"""
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
# Create output dir if it doesn't exist
createDirectory(opts.outputDir)
if opts.compressFeatures: # compress features
encoder = None
if opts.trainAC:
# train an autoencoder on the full feature matrix
encoder = ac.train_full_ac(df, opts)
if opts.compressFeatures:
if opts.trainAC:
# train an autoencoder on the full feature matrix
encoder = ac.train_full_ac(df, opts)
else:
if not opts.trainAC:
# load trained model for autoencoder
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
encoder.load_weights(makePathAbsolute(opts.ecWeightsFile))
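Read as the merged result, the interleaved hunks above resolve to a single control flow; a sketch, with the compress_fingerprints call beyond the hunk boundary assumed:
encoder = None
if opts.trainAC:
    # train an autoencoder on the full feature matrix and keep its encoder half
    encoder = ac.train_full_ac(df, opts)
if opts.compressFeatures:
    if not opts.trainAC:
        # no autoencoder was trained in this run, so load previously trained encoder weights
        (_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
        encoder.load_weights(makePathAbsolute(opts.ecWeightsFile))
    # assumed continuation past the hunk: replace fingerprints by their compressed form
    df = ac.compress_fingerprints(df, encoder)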
@@ -82,6 +85,7 @@ def predict(opts: options.PredictOptions) -> None:
:param opts: Options defining the details of the prediction
"""
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
# df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
# Create output dir if it doesn't exist
createDirectory(opts.outputDir)
@@ -136,10 +136,11 @@ def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
if opts.ecWeightsFile == "":
logging.info("No AC encoder weights file specified")
base_file_name = os.path.splitext(basename(opts.inputFile))[0]
logging.info(f"(auto)encoder weights will be saved in {base_file_name}.[auto]encoder.hdf5")
ac_weights_file = os.path.join(opts.outputDir, base_file_name + ".autoencoder.hdf5")
ec_weights_file = os.path.join(opts.outputDir, base_file_name + ".encoder.hdf5")
else:
logging.info(f"AC encoder will be saved")
logging.info(f"AC encoder will be saved in {opts.ecWeightsFile}")
base_file_name = os.path.splitext(basename(opts.ecWeightsFile))[0]
ac_weights_file = os.path.join(opts.outputDir, base_file_name + ".autoencoder.hdf5")
ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)
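A worked example of the two naming branches, with hypothetical values outputDir="results/" and inputFile="data/S_dataset.csv" (note that in the else branch, ec_weights_file joins outputDir with the raw ecWeightsFile value rather than the derived base name):
# ecWeightsFile == "":
#   base_file_name  -> "S_dataset"                            (from inputFile)
#   ac_weights_file -> "results/S_dataset.autoencoder.hdf5"
#   ec_weights_file -> "results/S_dataset.encoder.hdf5"
# ecWeightsFile == "ac_S.encoder.hdf5":
#   base_file_name  -> "ac_S.encoder"                         (from ecWeightsFile)
#   ac_weights_file -> "results/ac_S.encoder.autoencoder.hdf5"
#   ec_weights_file -> "results/ac_S.encoder.hdf5"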
@@ -188,10 +189,12 @@ def compress_fingerprints(dataframe: pd.DataFrame,
:return: The input dataframe extended by a column containing the compressed version of the fingerprints
"""
logging.info("Adding compressed fingerprints")
idx = dataframe[dataframe["fp"].notnull()].index
fp_matrix = np.array(dataframe[dataframe["fp"].notnull()]["fp"].to_list(),
dtype=settings.ac_fp_numpy_type,
copy=settings.numpy_copy_values)
logging.info(f"Using input matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}")
dataframe['fpcompressed'] = pd.Series([pd.Series(s) for s in encoder.predict(fp_matrix)])
logging.info("Compressed fingerprints are added to input dataframe.")
dataframe['fpcompressed'] = pd.DataFrame({'fpcompressed': [s for s in encoder.predict(fp_matrix)]}, idx)
return dataframe
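The index-aware assignment is the actual fix here: rows whose fingerprint is null leave gaps in the index, and a plain pd.Series built from the predictions realigns them positionally onto the wrong rows. A minimal sketch with hypothetical data:
import numpy as np
import pandas as pd

df = pd.DataFrame({"fp": [np.array([1, 0]), None, np.array([0, 1])]})
idx = df[df["fp"].notnull()].index               # [0, 2] -- note the gap at row 1
compressed = [np.array([0.9]), np.array([0.1])]  # stand-in for encoder.predict(...)

# old behaviour: positional index 0..1, so row 2's result lands on row 1
df["wrong"] = pd.Series([pd.Series(s) for s in compressed])
# fixed behaviour: the explicit index keeps each result on its source row
df["right"] = pd.DataFrame({"right": compressed}, idx)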
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
from dfpl import fingerprint as fp
# %matplotlib inline
sns.set(style='white', context='notebook', rc={'figure.figsize': (10, 14)})
# get penguin data for testing
penguins = pd.read_csv("https://github.com/allisonhorst/palmerpenguins/raw/5b5891f01b52ae26ad8cb9755ec93672f49328a8"
"/data/penguins_size.csv")
penguins = penguins.dropna()
penguins.species_short.value_counts()
penguins.island.value_counts()
sns.pairplot(penguins, hue='species_short')
plt.show()
reducer = umap.UMAP()
# clean up
# no NAs, only measurement columns
penguin_data = penguins[
[
"culmen_length_mm",
"culmen_depth_mm",
"flipper_length_mm",
"body_mass_g"]
].values
# convert each feature into zscores since they are on different scales
scaled_penguin_data = StandardScaler().fit_transform(penguin_data)
# embed data into two-dim space
embedding = reducer.fit_transform(scaled_penguin_data)
embedding.shape
# --> each row in the original df receives 2D coordinates
# visualize this
plt.scatter(
embedding[:, 0],
embedding[:, 1],
c=[sns.color_palette()[x] for x in penguins.species_short.map({
"Adelie": 0, "Chinstrap": 1, "Gentoo": 2
})]
)
plt.gca().set_aspect('equal', 'datalim')
plt.title("UMAP projection of the Penguin dataset", fontsize=24)
plt.show()
# let's try with some chemical fingerprints
df = fp.importDataFile("data/S_dataset_extended.pkl")
df2 = np.array(df[df['fp'].notnull()]['fp'].to_list())
er = df[df['fp'].notnull()]['ER'].fillna(-1)
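# jaccard distance on binary fingerprints is 1 - Tanimoto similarity,
# the standard way to compare molecular fingerprints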
fit = umap.UMAP(metric="jaccard")
%time u = fit.fit_transform(df2)
plt.title("UMAP projection of Sun et al dataset using jaccard metric")
plt.scatter(u[:,0], u[:,1],
c=[sns.color_palette()[x] for x in er.map({-1.0:0, 0.0:1, 1.0:2})])
# plt.legend(loc='upper right')
plt.show()
df_d = fp.importDataFile("data/dsstox_20160701.pkl")
df_d2 = np.array(df_d[df_d['fp'].notnull()]['fp'].to_list())
fit_d = umap.UMAP(metric="jaccard")
%time u_d = fit_d.fit_transform(df_d2)
plt.title("UMAP projection of Sun et al dataset using jaccard metric")
plt.scatter(u[:,0], u[:,1])
# plt.legend(loc='upper right')
plt.show()
@@ -112,13 +112,9 @@ def importDstoxTSV(tsvfilename: str) -> pd.DataFrame:
conversion_rules = {
"Sun_etal_dataset.csv": importSmilesCSV,
"SunBDBTox21.merged4training.csv": importSmilesCSV,
"dsstox_20160701.tsv": importDstoxTSV,
"01_combinedSUN-BDB.dataset.4training.csv": importSmilesCSV,
"07_BindingDB.trainingSet.csv": importSmilesCSV,
"T_tox21ChallengeData_4training.csv": importSmilesCSV,
"S_dataset_extended.csv": importSmilesCSV
"S_dataset.csv": importSmilesCSV,
"S_dataset_extended.csv": importSmilesCSV,
"D_dataset.tsv": importDstoxTSV
}
@@ -4,4 +4,4 @@ source /home/patrick/build/local/miniconda3/etc/profile.d/conda.sh
conda activate dfpl_env
conda develop dfpl
bash scripts/run-all-cases.sh
bash scripts/run-all-publication-cases.sh
@@ -23,11 +23,10 @@
#SBATCH --time=24:00:00
module purge
module load cuda
module load anaconda/3/2020.02
source $ANACONDA_HOME/etc/profile.d/conda.sh
source /u/pscheibe/conda/etc/profile.d/conda.sh
conda activate dfpl_env
conda develop dfpl
# Run the program:
srun scripts/run-all-publication-cases.sh > mpcdf_dfpl_stdout.txt
\ No newline at end of file
srun scripts/run-all-publication-cases.sh &> mpcdf_dfpl_run.log
@@ -4,49 +4,96 @@
# Importantly, the conda environment needs to be set up and activated! For certain machines/HPC,
# we have a batch-job that does exactly that and then calls this file
python -m dfpl convert -f "data"
python -m dfpl train -f "validation/case_00/train_AC_S.json"
python -m dfpl train -f "validation/case_00/train_AC_X.json"
python -m dfpl train -f "validation/case_00/train_AC_D.json"
python -m dfpl train -f "validation/case_00/train_AC_T.json"
python -m dfpl train -f "validation/case_01/train.json"
python -m dfpl train -f "validation/case_01/train_0p5.json"
python -m dfpl train -f "validation/case_01/train_0p6.json"
python -m dfpl train -f "validation/case_01/train_0p7.json"
python -m dfpl train -f "validation/case_01/train_0p8.json"
python -m dfpl train -f "validation/case_01/train_0p9.json"
python -m dfpl train -f "validation/case_01/train_1p0.json"
python -m dfpl train -f "validation/case_02/train.json"
python -m dfpl train -f "validation/case_02/train_0p5.json"
python -m dfpl train -f "validation/case_02/train_0p6.json"
python -m dfpl train -f "validation/case_02/train_0p7.json"
python -m dfpl train -f "validation/case_02/train_0p8.json"
python -m dfpl train -f "validation/case_02/train_0p9.json"
python -m dfpl train -f "validation/case_02/train_1p0.json"
python -m dfpl train -f "validation/case_03/train.json"
python -m dfpl train -f "validation/case_04/train.json"
python -m dfpl train -f "validation/case_05/train.json"
python -m dfpl train -f "validation/case_06/train.json"
python -m dfpl predict -f "validation/case_07/predict_bestER03.json"
python -m dfpl predict -f "validation/case_07/predict_bestED03.json"
python -m dfpl predict -f "validation/case_07/predict_bestAR03.json"
python -m dfpl predict -f "validation/case_07/predict_fullER03.json"
python -m dfpl train -f "validation/case_11/train.json"
python -m dfpl train -f "validation/case_12/train.json"
python -m dfpl train -f "validation/case_13/train.json"
python -m dfpl train -f "validation/case_14/train.json"
python -m dfpl train -f "validation/case_15/train.json"
python -m dfpl train -f "validation/case_16/train.json"
python -m dfpl train -f "validation/case_31/train.json"
python -m dfpl train -f "validation/case_32/train.json"
python -m dfpl train -f "validation/case_33/train.json"
python -m dfpl train -f "validation/case_41/train.json"
python -m dfpl train -f "validation/case_42/train.json"
python -m dfpl train -f "validation/case_43/train.json"
\ No newline at end of file
function log_error() {
echo "$@" 1>&2
}
function call_convert() {
if [ -d "$1" ]; then
python -m dfpl convert -f "$1"
else
log_error "Could not find directory for data conversion $1"
fi
}
function call_train() {
if [ -f "$1" ]; then
python -m dfpl train -f "$1"
else
log_error "Could not find training file $1"
fi
}
function call_predict() {
if [ -f "$1" ]; then
python -m dfpl predict -f "$1"
else
log_error "Could not find prediction file $1"
fi
}
call_convert "data"
call_train "validation/case_00/train_AC_S.json"
call_train "validation/case_00/train_AC_D.json"
call_train "validation/case_01/train.json"
call_train "validation/case_01/train_0p5.json"
call_train "validation/case_01/train_0p6.json"
call_train "validation/case_01/train_0p7.json"
call_train "validation/case_01/train_0p8.json"
call_train "validation/case_01/train_0p9.json"
call_train "validation/case_01/train_1p0.json"
call_train "validation/case_02/train.json"
call_train "validation/case_02/train_0p5.json"
call_train "validation/case_02/train_0p6.json"
call_train "validation/case_02/train_0p7.json"
call_train "validation/case_02/train_0p8.json"
call_train "validation/case_02/train_0p9.json"
call_train "validation/case_02/train_1p0.json"
call_train "validation/case_03_S/train.json"
call_train "validation/case_03_Sext/train.json"
call_predict "validation/case_07_D/predict_bestAR03.json"
call_predict "validation/case_07_D/predict_bestED03.json"
call_predict "validation/case_07_D/predict_bestER03.json"
call_predict "validation/case_07_D/predict_fullAR03.json"
call_predict "validation/case_07_D/predict_fullED03.json"
call_predict "validation/case_07_D/predict_fullER03.json"
call_predict "validation/case_07_S/predict_bestAR03.json"
call_predict "validation/case_07_S/predict_bestED03.json"
call_predict "validation/case_07_S/predict_bestER03.json"
call_predict "validation/case_07_S/predict_fullAR03.json"
call_predict "validation/case_07_S/predict_fullED03.json"
call_predict "validation/case_07_S/predict_fullER03.json"
call_predict "validation/case_07_Sext/predict_bestAR03.json"
call_predict "validation/case_07_Sext/predict_bestED03.json"
call_predict "validation/case_07_Sext/predict_bestER03.json"
call_predict "validation/case_07_Sext/predict_fullAR03.json"
call_predict "validation/case_07_Sext/predict_fullED03.json"
call_predict "validation/case_07_Sext/predict_fullER03.json"
call_predict "validation/case_08_D/predict_bestARext03.json"
call_predict "validation/case_08_D/predict_bestEDext03.json"
call_predict "validation/case_08_D/predict_bestERext03.json"
call_predict "validation/case_08_D/predict_fullARext03.json"
call_predict "validation/case_08_D/predict_fullEDext03.json"
call_predict "validation/case_08_D/predict_fullERext03.json"
call_predict "validation/case_08_S/predict_bestARext03.json"
call_predict "validation/case_08_S/predict_bestEDext03.json"
call_predict "validation/case_08_S/predict_bestERext03.json"
call_predict "validation/case_08_S/predict_fullARext03.json"
call_predict "validation/case_08_S/predict_fullEDext03.json"
call_predict "validation/case_08_S/predict_fullERext03.json"
call_predict "validation/case_08_Sext/predict_bestARext03.json"
call_predict "validation/case_08_Sext/predict_bestEDext03.json"
call_predict "validation/case_08_Sext/predict_bestERext03.json"
call_predict "validation/case_08_Sext/predict_fullARext03.json"
call_predict "validation/case_08_Sext/predict_fullEDext03.json"
call_predict "validation/case_08_Sext/predict_fullERext03.json"
@@ -4,31 +4,96 @@
# Importantly, the conda environment needs to be set up and activated! For certain machines/HPC,
# we have a batch-job that does exactly that and then calls this file
D="data"; if [ -d $D ] python -m dfpl convert -f $D; fi
function log_error() {
echo "$@" 1>&2
}
F="validation/case_00/train_AC_S.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_00/train_AC_D.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
function call_convert() {
if [ -d "$1" ]; then
python -m dfpl convert -f "$1"
else
log_error "Could not find directory for data conversion $1"
fi
}
F="validation/case_01/train.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F= "validation/case_02/train.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
function call_train() {
if [ -f "$1" ]; then
python -m dfpl train -f "$1"
else
log_error "Could not find training file $1"
fi
}
F="validation/case_03/train.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
function call_predict() {
if [ -f "$1" ]; then
python -m dfpl predict -f "$1"
else
log_error "Could not find prediction file $1"
fi
}
F="validation/case_07/predict_bestER03.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_07/predict_bestARext03.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_07/predict_bestED03.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
call_convert "data"
F="validation/case_01/train_0p5.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_01/train_0p6.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_01/train_0p7.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_01/train_0p8.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_01/train_0p9.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_01/train_1p0.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
call_train "validation/case_00/train_AC_S.json"
call_train "validation/case_00/train_AC_D.json"
F="validation/case_02/train_0p5.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_02/train_0p6.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_02/train_0p7.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_02/train_0p8.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_02/train_0p9.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
F="validation/case_02/train_1p0.json"; if [ -f $F ]; then python -m dfpl train -f $F; fi
call_train "validation/case_01/train.json"
call_train "validation/case_01/train_0p5.json"
call_train "validation/case_01/train_0p6.json"
call_train "validation/case_01/train_0p7.json"
call_train "validation/case_01/train_0p8.json"
call_train "validation/case_01/train_0p9.json"
call_train "validation/case_01/train_1p0.json"
call_train "validation/case_02/train.json"
call_train "validation/case_02/train_0p5.json"
call_train "validation/case_02/train_0p6.json"
call_train "validation/case_02/train_0p7.json"
call_train "validation/case_02/train_0p8.json"
call_train "validation/case_02/train_0p9.json"
call_train "validation/case_02/train_1p0.json"
call_train "validation/case_03_S/train.json"
call_train "validation/case_03_Sext/train.json"
call_predict "validation/case_07_D/predict_bestAR03.json"
call_predict "validation/case_07_D/predict_bestED03.json"
call_predict "validation/case_07_D/predict_bestER03.json"
call_predict "validation/case_07_D/predict_fullAR03.json"
call_predict "validation/case_07_D/predict_fullED03.json"
call_predict "validation/case_07_D/predict_fullER03.json"
call_predict "validation/case_07_S/predict_bestAR03.json"
call_predict "validation/case_07_S/predict_bestED03.json"
call_predict "validation/case_07_S/predict_bestER03.json"
call_predict "validation/case_07_S/predict_fullAR03.json"
call_predict "validation/case_07_S/predict_fullED03.json"
call_predict "validation/case_07_S/predict_fullER03.json"
call_predict "validation/case_07_Sext/predict_bestAR03.json"
call_predict "validation/case_07_Sext/predict_bestED03.json"
call_predict "validation/case_07_Sext/predict_bestER03.json"
call_predict "validation/case_07_Sext/predict_fullAR03.json"
call_predict "validation/case_07_Sext/predict_fullED03.json"
call_predict "validation/case_07_Sext/predict_fullER03.json"
call_predict "validation/case_08_D/predict_bestARext03.json"
call_predict "validation/case_08_D/predict_bestEDext03.json"
call_predict "validation/case_08_D/predict_bestERext03.json"
call_predict "validation/case_08_D/predict_fullARext03.json"
call_predict "validation/case_08_D/predict_fullEDext03.json"
call_predict "validation/case_08_D/predict_fullERext03.json"
call_predict "validation/case_08_S/predict_bestARext03.json"
call_predict "validation/case_08_S/predict_bestEDext03.json"
call_predict "validation/case_08_S/predict_bestERext03.json"
call_predict "validation/case_08_S/predict_fullARext03.json"
call_predict "validation/case_08_S/predict_fullEDext03.json"
call_predict "validation/case_08_S/predict_fullERext03.json"
call_predict "validation/case_08_Sext/predict_bestARext03.json"
call_predict "validation/case_08_Sext/predict_bestEDext03.json"
call_predict "validation/case_08_Sext/predict_bestERext03.json"
call_predict "validation/case_08_Sext/predict_fullARext03.json"
call_predict "validation/case_08_Sext/predict_fullEDext03.json"
call_predict "validation/case_08_Sext/predict_fullERext03.json"
import dfpl.fingerprint as fp
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Draw
import pandas as pd
import numpy as np
from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
from dfpl import predictions
from dfpl import options as opt
# read both datasets
dfS = fp.importDataFile("data/S_dataset_extended.pkl", import_function=fp.importSmilesCSV, fp_size=2048)
dfS.dropna(axis=0, subset=['cid'], inplace=True)
dfS['cid'] = dfS['cid'].apply(int).astype(str)
dfD = fp.importDataFile("data/dsstox_20160701.pkl", import_function=fp.importSmilesCSV, fp_size=2048)
# ids and structures of interest
cid_of_interest = ["87587", "77328", "2734118", "2736548", "154257"]
toxid_of_interest = ["DTXSID3027798", "DTXSID7041461", "DTXSID9048067", "DTXSID7049344", "DTXSID70173593"]
df = pd.DataFrame(list(zip(cid_of_interest, toxid_of_interest)), columns=["cid", "toxid"])
# add smiles
smiles_of_interest = dfS[dfS['cid'].isin(cid_of_interest)][['cid', 'smiles']]
df = df.merge(smiles_of_interest, on="cid")
# add inchi
inchi_of_interest = dfD[dfD['toxid'].isin(toxid_of_interest)][['toxid', 'inchi']]
df = df.merge(inchi_of_interest, on="toxid")
# get pre-calculated boolean fingerprints from input .pkl data
fpboolS_of_interest = dfS[dfS['cid'].isin(cid_of_interest)][['cid', 'fp']]
df = df.merge(fpboolS_of_interest, on="cid")
df.rename(columns={'fp': 'fpSbool'}, inplace=True)
fpboolD_of_interest = dfD[dfD['toxid'].isin(toxid_of_interest)][['toxid', 'fp']]
df = df.merge(fpboolD_of_interest, on="toxid")
df.rename(columns={'fp': 'fpDbool'}, inplace=True)
# check whether the boolean fingerprints are element-wise identical
df['allBoolEqual'] = [all(s == d) for s, d in zip(df['fpSbool'].to_list(), df['fpDbool'].to_list())]
# generate binary fingerprints
df['fpSbin'] = [Chem.RDKFingerprint(Chem.MolFromSmiles(x)) for x in df['smiles']]
df['fpDbin'] = [Chem.RDKFingerprint(Chem.MolFromInchi(x)) for x in df['inchi']]
# calculate Tanimoto Similarity of both compounds
df['tanimoto'] = [DataStructs.FingerprintSimilarity(s, d) for s, d in zip(df['fpSbin'], df['fpDbin'])]
# generate mol structures for drawing
df['molS'] = [Chem.MolFromSmiles(x) for x in df['smiles']]
df['molD'] = [Chem.MolFromInchi(x) for x in df['inchi']]
legend = [c + " (" + str(round(t, 2)) + ", bool: " + str(b) + ")" for c, t, b in
zip(df['cid'], df['tanimoto'], df['allBoolEqual'])] + \
[t for t in df['toxid']]
img = Draw.MolsToGridImage(df['molS'].to_list() + df['molD'].to_list(),
molsPerRow=df.shape[0],
subImgSize=(200, 200),
legends=legend)
img.save('cidVStoxid.structures.png')
img.show()
project_directory = ""
opts = opt.PredictOptions(
inputFile=f"",
outputDir=f"/home/hertelj/tmp/",
model=f"/home/hertelj/git-hertelj/deepFPlearn_CODE/validation/case_03/results/ER_compressed-True_sampled-None.best.FNN.model.hdf5",
target="ER",
fpSize=2048,
type="smiles",
fpType="topological"
)
(_, encoder) = ac.define_ac_model(input_size=2048, encoding_dim=256)
encoder.load_weights("/home/hertelj/git-hertelj/deepFPlearn_CODE/modeltraining/Sun_etal_dataset.encoder.hdf5")
data = ac.compress_fingerprints(dfS, encoder)
s_compressed = data[data['cid'].isin(cid_of_interest)]['fpcompressed']
df2 = predictions.predict_values(df=data,
opts=opts,
use_compressed=True)
s_predictions = df2[df2['cid'].isin(cid_of_interest)][['cid', 'trained']]
data2 = ac.compress_fingerprints(dfD, encoder)
d_compressed = data2[data2['toxid'].isin(toxid_of_interest)]['fpcompressed']
df3 = predictions.predict_values(df=data2,
opts=opts,
use_compressed=True)
d_predictions = df3[df3['toxid'].isin(toxid_of_interest)][['toxid', 'trained']]
fp_matrix_S = np.array(df['fpSbool'].to_list(), dtype=bool, copy=False)
predictions_S = encoder.predict(fp_matrix_S)
fp_matrix_D = np.array(df['fpDbool'].to_list(), dtype=bool, copy=False)
predictions_D = encoder.predict(fp_matrix_D)
df['fpcompressedS'] = [s for s in predictions_S]
df['fpcompressedD'] = [s for s in predictions_D]
# compressed fp equal?
df['fpcEqual'] = [all(s == d) for s, d in zip(df['fpcompressedS'].to_list(), df['fpcompressedD'].to_list())]
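# note: this is exact float equality; a np.allclose(s, d) check would be more
# forgiving if the two encodings differed only by rounding noise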
{
"py/object": "dfpl.options.TrainOptions",
"inputFile": "data/dsstox_20160701.pkl",
"inputFile": "data/D_dataset.pkl",
"outputDir": "validation/case_00/results_AC_D/",
"ecWeightsFile": "ac_D.encoder.hdf5",
"type": "smiles",
@@ -14,5 +14,5 @@
"verbose": 2,
"trainAC": true,
"trainFNN": false,
"compressFeatures": true
"compressFeatures": false
}
\ No newline at end of file
{
"py/object": "dfpl.options.TrainOptions",
"inputFile": "data/S_dataset_extended.pkl",
"inputFile": "data/S_dataset.pkl",
"outputDir": "validation/case_00/results_AC_S/",
"ecWeightsFile": "ac_S.encoder.hdf5",
"type": "smiles",
@@ -14,5 +14,5 @@
"verbose": 2,
"trainAC": true,