Commit bff43d9e authored by Jana Schor

add history module for plotting and storing training histories; adjust the AC/EC weights file input (acFile is now ecWeightsFile)

parent 5db939a4
@@ -16,7 +16,7 @@ project_directory = pathlib.Path(__file__).parent.parent.absolute()
test_train_args = options.TrainOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.csv",
outputDir=f"{project_directory}/modeltraining",
acFile="",
ecWeightsFile="",
type='smiles',
fpType='topological',
epochs=512,
@@ -35,7 +35,7 @@ test_train_args = options.TrainOptions(
test_predict_args = options.PredictOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.cids.predictionSet.csv",
outputDir=f"{project_directory}/validation/case_01/results/",
acFile=f"{project_directory}/validation/case_01/results/Sun_etal_dataset.AC.encoder.weights.hdf5",
ecWeightsFile=f"{project_directory}/validation/case_01/results/Sun_etal_dataset.AC.encoder.weights.hdf5",
model=f"{project_directory}/validation/case_01/results/AR_compressed-True.full.FNN-.model.hdf5",
target="AR",
fpSize=2048,
@@ -59,11 +59,11 @@ def train(opts: options.TrainOptions):
if opts.trainAC:
# train an autoencoder on the full feature matrix
encoder = ac.train_full_ac(df, opts)
encoder.save_weights(path.join(opts.outputDir, opts.acFile))
encoder.save_weights(path.join(opts.outputDir, opts.ecWeightsFile))
else:
# load trained model for autoencoder
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
encoder.load_weights(path.join(opts.outputDir, opts.acFile))
encoder.load_weights(path.join(opts.outputDir, opts.ecWeightsFile))
# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
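Reassembled as indented Python, the branch above reads as follows; a minimal sketch assuming ac, options, df, and opts are set up as in the surrounding file:

from os import path

if opts.trainAC:
    # train a new autoencoder on the full feature matrix and persist the encoder half
    encoder = ac.train_full_ac(df, opts)
    encoder.save_weights(path.join(opts.outputDir, opts.ecWeightsFile))
else:
    # rebuild the architecture, then load previously trained encoder weights
    _, encoder = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
    encoder.load_weights(path.join(opts.outputDir, opts.ecWeightsFile))

# either way, compress the fingerprints with the encoder
df = ac.compress_fingerprints(df, encoder)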
@@ -91,11 +91,11 @@ def predict(opts: options.PredictOptions) -> None:
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
use_compressed = False
if opts.acFile:
if opts.ecWeightsFile:
use_compressed = True
# load trained model for autoencoder
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
encoder.load_weights(opts.acFile)
encoder.load_weights(opts.ecWeightsFile)
# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
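The prediction path uses the mere presence of the weights file as the compression switch. A hedged helper capturing that gate (a sketch, not code from this commit; maybe_compress is a hypothetical name, and the module-level imports shown above are assumed):

def maybe_compress(df, opts: options.PredictOptions):
    """Return (df, used_compression): compress only if encoder weights were given."""
    if not opts.ecWeightsFile:
        return df, False
    _, encoder = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
    encoder.load_weights(opts.ecWeightsFile)
    return ac.compress_fingerprints(df, encoder), True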
import os.path
from os.path import basename
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
from keras.models import Model
@@ -12,6 +13,7 @@ from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint, EarlyStopping
import options
import history as ht
def define_ac_model(
@@ -116,7 +118,6 @@ def autoencoder_callback(checkpoint_path: str, patience: int) -> list:
return [checkpoint, early_stop]
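The body of autoencoder_callback is not part of this hunk; the following is a plausible reconstruction consistent with its signature and the best-checkpoint behaviour the new code relies on (hedged, the exact arguments may differ):

from keras.callbacks import ModelCheckpoint, EarlyStopping

def autoencoder_callback(checkpoint_path: str, patience: int) -> list:
    # keep only the weights with the best validation loss
    checkpoint = ModelCheckpoint(checkpoint_path, monitor="val_loss", verbose=1,
                                 save_best_only=True, save_weights_only=True)
    # stop training once val_loss has not improved for `patience` epochs
    early_stop = EarlyStopping(monitor="val_loss", patience=patience, verbose=1)
    return [checkpoint, early_stop]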
def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
"""
Train an autoencoder on the given feature matrix X. Response matrix is only used to
@@ -131,11 +132,19 @@ def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
(autoencoder, encoder) = define_ac_model(input_size=opts.fpSize,
encoding_dim=opts.encFPSize)
# if opts.acFile != "": # don't train, use existing weights file and load it into AC model
# encoder.load_weights(opts.acFile)
# else:
# define output file for autoencoder and encoder weights
if opts.ecWeightsFile == "":
base_file_name = os.path.splitext(basename(opts.inputFile))[0]
ac_weights_file = os.path.join(opts.outputDir, base_file_name + ".autoencoder.hdf5")
ec_weights_file = os.path.join(opts.outputDir, base_file_name + ".encoder.hdf5")
else:
base_file_name = os.path.splitext(basename(opts.ecWeightsFile))[0]
ac_weights_file = os.path.join(opts.outputDir, base_file_name + ".autoencoder.hdf5")
ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)
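A worked example of the naming rule above, with the repository's default input file and an illustrative output directory:

import os.path
from os.path import basename

base_file_name = os.path.splitext(basename("data/Sun_etal_dataset.csv"))[0]
# base_file_name == "Sun_etal_dataset"
ac_weights_file = os.path.join("modeltraining", base_file_name + ".autoencoder.hdf5")
ec_weights_file = os.path.join("modeltraining", base_file_name + ".encoder.hdf5")
# -> modeltraining/Sun_etal_dataset.autoencoder.hdf5
# -> modeltraining/Sun_etal_dataset.encoder.hdf5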
# collect the callbacks for training
callback_list = autoencoder_callback(checkpoint_path=opts.outputDir + "/autoencoder.checkpoint_path.hdf5",
callback_list = autoencoder_callback(checkpoint_path=ac_weights_file,
# opts.outputDir + "/autoencoder.checkpoint_path.hdf5",
patience=20)
# Select all fps that are valid and turn them into a numpy array
@@ -153,29 +162,14 @@ def train_full_ac(df: pd.DataFrame, opts: options.TrainOptions) -> Model:
batch_size=256,
verbose=opts.verbose,
validation_data=(x_test, x_test))
# history
ac_loss = auto_hist.history['loss']
ac_val_loss = auto_hist.history['val_loss']
ac_epochs = range(ac_loss.__len__())
pd.DataFrame(data={'loss': ac_loss,
'val_loss': ac_val_loss,
'epoch': ac_epochs}).to_csv(opts.outputDir + "/ACmodel_trainValLoss_AC.csv",
index=False)
# generate a figure of the losses for this fold
plt.figure()
plt.plot(ac_epochs, ac_loss, 'bo',
label='Training loss')
plt.plot(ac_epochs, ac_val_loss, 'b',
label='Validation loss')
plt.title('Training and validation loss of AC')
plt.legend()
plt.savefig(fname=opts.outputDir + "/ACmodel_trainValLoss_AC.svg",
format='svg')
plt.close()
# write the losses to .csv file for later data visualization
# model needs to be saved and restored when predicting new input!
# use encode() of train data as input for DL model to associate to chemical
logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")
ht.store_and_plot_history(base_file_name=os.path.join(opts.outputDir, base_file_name),
hist=auto_hist)
encoder.save_weights(ec_weights_file)
logging.info(f"Encoder weights stored in file: {ec_weights_file}")
return encoder
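Taken together, one call to train_full_ac now leaves a predictable set of artifacts in outputDir; a hedged summary for the default case of an empty ecWeightsFile:

# After encoder = train_full_ac(df, opts) with inputFile "data/Sun_etal_dataset.csv":
#   <outputDir>/Sun_etal_dataset.autoencoder.hdf5  # best checkpoint via autoencoder_callback
#   <outputDir>/Sun_etal_dataset.history.csv       # via ht.store_and_plot_history
#   <outputDir>/Sun_etal_dataset.history.svg
#   <outputDir>/Sun_etal_dataset.encoder.hdf5      # encoder.save_weights(ec_weights_file)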
@@ -26,6 +26,8 @@ from sklearn.metrics import f1_score
import options
import autoencoder as ac
import history as ht
from time import time
@@ -116,9 +118,7 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl
model_file_path_weights = str(path_prefix) + model_name + '.weights.h5'
model_file_path_json = str(path_prefix) + model_name + '.json'
model_hist_plot_path_l = str(path_prefix) + model_name + '.loss.svg'
model_hist_plot_path_a = str(path_prefix) + model_name + '.acc.svg'
model_hist_plot_path = str(path_prefix) + model_name + '.history.svg'
model_hist_path = str(path_prefix) + model_name
model_hist_csv_path = str(path_prefix) + model_name + '.history.csv'
model_validation = str(path_prefix) + model_name + '.validation.csv'
model_auc_file = str(path_prefix) + model_name + '.auc_value.svg'
@@ -128,8 +128,8 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl
model_heatmap_x = str(path_prefix) + model_name + '.heatmap.X.svg'
model_heatmap_z = str(path_prefix) + model_name + '.AC.heatmap.Z.svg'
return (model_file_path_weights, model_file_path_json, model_hist_plot_path_l, model_hist_plot_path_a,
model_hist_plot_path, model_hist_csv_path, model_validation, model_auc_file,
return (model_file_path_weights, model_file_path_json, model_hist_path,
model_validation, model_auc_file,
model_auc_file_data, out_file_path, checkpoint_path,
model_heatmap_x, model_heatmap_z)
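The return tuple shrinks from 13 to 10 entries: the separate loss/accuracy plot paths and the history CSV path collapse into the single prefix model_hist_path. A hedged sketch of the new unpacking (argument values illustrative):

(model_file_path_weights, model_file_path_json, model_hist_path,
 model_validation, model_auc_file, model_auc_file_data,
 out_file_path, checkpoint_path, model_heatmap_x,
 model_heatmap_z) = define_out_file_names(path_prefix="modeltraining/",
                                          target="AR_compressed-True")
# model_hist_path is later expanded by history.store_and_plot_history into
# <model_hist_path>.history.csv and <model_hist_path>.history.svg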
@@ -448,8 +448,8 @@ def train_nn_models(df: pd.DataFrame,
logging.info("Training of fold number:" + str(fold_no))
# define all the output file/path names
(model_file_path_weights, model_file_path_json, model_hist_plot_path_loss, model_hist_plot_path_acc,
model_hist_plot_path, model_hist_csv_path, model_validation, model_auc_file,
(model_file_path_weights, model_file_path_json, model_hist_path,
model_validation, model_auc_file,
model_auc_file_data, outfile_path, checkpoint_path,
model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
target=target + "_compressed-" + str(
@@ -478,7 +478,10 @@ def train_nn_models(df: pd.DataFrame,
if opts.verbose > 0:
logging.info("Computation time for training the single-label FNN:" + trainTime + "min")
pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
ht.store_and_plot_history(base_file_name=model_hist_path,
hist=hist)
# pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
# validate model on test data set (x_test, y_test)
scores = validate_model_on_test_data(x[test], checkpoint_path, y[test],
@@ -545,7 +548,10 @@ def train_nn_models(df: pd.DataFrame,
if opts.verbose > 0:
logging.info("Computation time for training the full classification FNN: " + trainTime + "min")
pd.DataFrame(hist.history).to_csv(full_model_file.replace(".hdf5", ".history.csv"))
ht.store_and_plot_history(base_file_name=model_hist_path,
hist=hist)
# pd.DataFrame(hist.history).to_csv(full_model_file.replace(".hdf5", ".history.csv"))
del model
# now next target
@@ -680,8 +686,8 @@ def train_nn_models_multi(df: pd.DataFrame,
# kf = kfold_c_validator.split(fpMatrix, y)
# train, test = next(kf)
(model_file_path_weights, model_file_path_json, model_hist_plot_path_loss, model_hist_plot_path_acc,
model_hist_plot_path, model_hist_csv_path, model_validation, model_auc_file,
(model_file_path_weights, model_file_path_json, model_hist_path,
model_validation, model_auc_file,
model_auc_file_data, out_file_path, checkpoint_path,
model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
target="multi" + "_compressed-" + str(
@@ -710,7 +716,9 @@ def train_nn_models_multi(df: pd.DataFrame,
if opts.verbose > 0:
logging.info("Computation time for training the multi-label FNN: " + trainTime + " min")
pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
ht.store_and_plot_history(base_file_name=model_hist_path,
hist=hist)
# pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
# validate model on test data set (fpMatrix_test, y_test)
scores = validate_multi_model_on_test_data(x_test=fpMatrix[test],
@@ -757,8 +765,8 @@ def train_nn_models_multi(df: pd.DataFrame,
# AND retrain with full data set
full_model_file = checkpoint_path.replace("Fold-" + str(fold_no) + ".checkpoint", "full.FNN-")
(model_file_path_weights, model_file_path_json, model_hist_plot_path_loss, model_hist_plot_path_acc,
model_hist_plot_path, model_hist_csv_path, model_validation, model_auc_file,
(model_file_path_weights, model_file_path_json, model_hist_path,
model_validation, model_auc_file,
model_auc_file_data, out_file_path, checkpoint_path,
model_heatmap_x, model_heatmap_z) = define_out_file_names(path_prefix=opts.outputDir,
target="multi" + "_compressed-" + str(
@@ -785,16 +793,19 @@ def train_nn_models_multi(df: pd.DataFrame,
if opts.verbose > 0:
logging.info("Computation time for training the full multi-label FNN: " + trainTime + " min")
pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
ht.store_and_plot_history(base_file_name=model_hist_path,
hist=hist)
model_name = "multi" + "_compressed-" + str(use_compressed) + '.Full'
# pd.DataFrame(hist.history).to_csv(model_hist_csv_path)
plot_history_vis(hist,
model_hist_plot_path.replace("Fold-" + str(fold_no), "full.DNN-model"),
model_hist_csv_path.replace("Fold-" + str(fold_no), "full.DNN-model"),
model_hist_plot_path_acc.replace("Fold-" + str(fold_no), "full.DNN-model"),
model_hist_plot_path_loss.replace("Fold-" + str(fold_no), "full.DNN-model"),
target=model_name)
logging.info("Full models for DNN is saved:\n" + full_model_file)
# model_name = "multi" + "_compressed-" + str(use_compressed) + '.Full'
#
# plot_history_vis(hist,
# model_hist_plot_path.replace("Fold-" + str(fold_no), "full.DNN-model"),
# model_hist_csv_path.replace("Fold-" + str(fold_no), "full.DNN-model"),
# model_hist_plot_path_acc.replace("Fold-" + str(fold_no), "full.DNN-model"),
# model_hist_plot_path_loss.replace("Fold-" + str(fold_no), "full.DNN-model"),
# target=model_name)
# logging.info("Full models for DNN is saved:\n" + full_model_file)
pd.DataFrame(hist.history).to_csv(full_model_file.replace(".hdf5", ".history.csv"))
# pd.DataFrame(hist.history).to_csv(full_model_file.replace(".hdf5", ".history.csv"))
# -*- coding: utf-8 -*-
"""Store and visualise training histories"""
import pandas as pd
import logging
from keras.callbacks import History
import matplotlib.pyplot as plt
def store_and_plot_history(base_file_name: str, hist: History) -> None:
"""
:param base_file_name:
:param hist:
:return:
"""
(ac_history_csv, ac_history_svg) = (base_file_name + ".history.csv",
base_file_name + ".history.svg")
# store history
pd.DataFrame(hist.history).to_csv(ac_history_csv)
logging.info(f"Neural network training history saved in file: {ac_history_csv}")
# plot history
ac_epochs = hist.epoch
# generate a figure of the losses for this fold
plt.figure()
for k in hist.history.keys():
plt.plot(ac_epochs, hist.history[k], label=k)
plt.title('Training and validation metrics of neural network')
plt.legend()
plt.savefig(fname=ac_history_svg,
format='svg')
plt.close()
logging.info(f"Neural network training history plotted in file: {ac_history_svg}")
@@ -14,7 +14,7 @@ class TrainOptions:
"""
inputFile: str = "data/Sun_etal_dataset.csv"
outputDir: str = "modeltraining"
acFile: str = "Sun_etal_dataset.AC.encoder.weights.hdf5"
ecWeightsFile: str = "Sun_etal_dataset.AC.encoder.weights.hdf5"
type: str = "smiles"
fpType: str = "topological" # also "MACCS", "atompairs"
epochs: int = 512
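From a caller's perspective the dataclass rename looks like this; a short sketch in which only the affected fields are set and everything else keeps its default:

from options import TrainOptions  # assuming the module is importable as "options"

opts = TrainOptions(inputFile="data/Sun_etal_dataset.csv",
                    outputDir="modeltraining",
                    ecWeightsFile="")  # empty: weight file names are derived from inputFile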
@@ -64,7 +64,7 @@ class TrainOptions:
return cls(
inputFile=args.i,
outputDir=args.o,
acFile=args.a,
ecWeightsFile=args.a,
type=args.t,
fpType=args.k,
fpSize=args.s,
@@ -137,7 +137,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None:
'details).',
default=True)
parser.add_argument('-a', type=str, metavar='FILE', default=None,
help='The .hdf5 file of a trained autoencoder (e.g. from a previous '
help='The .hdf5 file of a trained encoder (e.g. from a previous '
'training run). This avoids retraining the autoencoder on the '
'training data set (provided with -i). NOTE that the input and encoding '
'dimensions must fit your data and settings. Default: train a new autoencoder.')
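A hedged round-trip through the parser showing how -a now feeds ecWeightsFile; it assumes -i and -o are registered by the same function (as the args.i/args.o accesses above suggest) and that no other training flag is mandatory:

import argparse
import options

parser = argparse.ArgumentParser()
options.parseInputTrain(parser)  # registers -i, -o, -a, -t, -k, -s, ... as shown above
args = parser.parse_args(["-i", "data/Sun_etal_dataset.csv",
                          "-o", "modeltraining",
                          "-a", "Sun_etal_dataset.encoder.hdf5"])
print(args.a)  # handed to TrainOptions(..., ecWeightsFile=args.a, ...) by the classmethod above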
@@ -190,7 +190,7 @@ class PredictOptions:
"""
inputFile: str = ""
outputDir: str = ""
acFile: str = ""
ecWeightsFile: str = ""
model: str = ""
target: str = ""
fpSize: int = 2048
@@ -233,7 +233,7 @@ class PredictOptions:
return cls(
inputFile=args.i,
outputDir=args.o,
acFile=args.ACmodel,
ecWeightsFile=args.ECmodel,
model=args.model,
target=args.target,
fpSize=args.s,
@@ -265,8 +265,8 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None:
"numbered in the order of their appearance in the input file."
"A header is expected and respective column names are used.",
required=True)
parser.add_argument('--ACmodel', metavar='FILE', type=str,
help='The autoencoder model weights. If provided, the fingerprints are compressed prior '
parser.add_argument('--ECmodel', metavar='FILE', type=str,
help='The encoder model weights. If provided, the fingerprints are compressed prior '
'to prediction.',
required=False)
parser.add_argument('--model', metavar='FILE', type=str,
@@ -9,16 +9,18 @@ project_directory = pathlib.Path(__file__).parent.parent.absolute()
test_train_args = opt.TrainOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.csv",
outputDir=f"{project_directory}/modeltraining",
acFile="Sun_etal_encoder.weights.hdf5",
ecWeightsFile="Sun_etal_dataset.encoder.hdf5",
type='smiles',
fpType='topological',
epochs=512,
epochs=11,
fpSize=2048,
encFPSize=256,
enableMultiLabel=False,
testingFraction=0.2,
kFolds=5,
verbose=1
verbose=2,
trainFNN=False,
trainAC=True
)
@@ -10,7 +10,7 @@ project_directory = pathlib.Path(__file__).parent.parent.absolute()
test_train_args = opt.TrainOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.csv",
outputDir=f"{project_directory}/modeltraining/",
acFile="Sun_etal_encoder.weights.hdf5",
ecWeightsFile="",
type='smiles',
fpType='topological',
epochs=11,
@@ -20,7 +20,7 @@ test_train_args = opt.TrainOptions(
testingFraction=0.2,
kFolds=2,
verbose=2,
trainAC=False,
trainAC=True,
trainFNN=True
)
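With trainAC=True and an empty ecWeightsFile, this configuration makes the weight file names fall back to the input file's base name; a hedged smoke-test invocation using the run_fnn_training entry point from the next hunk:

# train autoencoder and FNN with the options defined above
run_fnn_training(test_train_args)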
@@ -30,13 +30,13 @@ def run_fnn_training(opts: opt.TrainOptions) -> None:
logging.info("Adding fingerprint to dataset")
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
t = opts.acFile
opts.acFile = opts.outputDir + t
# t = opts.ecWeightsFile
# opts.ecWeightsFile = opts.outputDir + t
if opts.trainAC:
logging.info("Training autoencoder")
encoder = ac.train_full_ac(df, opts)
encoder.save_weights(opts.acFile)
# encoder.save_weights(opts.acFile)
else:
logging.info("Using trained autoencoder")
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
@@ -61,13 +61,13 @@ def run_fnn_training_multi(opts: opt.TrainOptions) -> None:
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
t = opts.acFile
opts.acFile = opts.outputDir + t
t = opts.ecWeightsFile
opts.ecWeightsFile = opts.outputDir + t
if opts.trainAC:
logging.info("Training autoencoder")
encoder = ac.train_full_ac(df, opts)
encoder.save_weights(opts.acFile)
# encoder.save_weights(opts.acFile)
else:
logging.info("Using trained autoencoder")
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize,
@@ -11,7 +11,7 @@ project_directory = pathlib.Path(__file__).parent.parent.absolute()
test_predict_args = opt.PredictOptions(
inputFile=f"{project_directory}/data/Sun_etal_dataset.cids.predictionSet.csv",
outputDir=f"{project_directory}/validation/case_01/results/",
acFile=f"{project_directory}/validation/case_01/results/Sun_etal_dataset.AC.encoder.weights.hdf5",
ecWeightsFile=f"{project_directory}/validation/case_01/results/Sun_etal_dataset.AC.encoder.weights.hdf5",
model=f"{project_directory}/validation/case_01/results/AR_compressed-True.full.FNN-.model.hdf5",
target="AR",
fpSize=2048,
@@ -28,11 +28,11 @@ def test_predictions(opts: opt.PredictOptions) -> None:
df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
use_compressed = False
if opts.acFile:
if opts.ecWeightsFile:
use_compressed = True
# load trained model for autoencoder
(_, encoder) = ac.define_ac_model(input_size=opts.fpSize, encoding_dim=opts.encFPSize)
encoder.load_weights(opts.acFile)
encoder.load_weights(opts.ecWeightsFile)
# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
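And a matching invocation sketch for the prediction smoke test, assuming the script is executed directly:

if __name__ == "__main__":
    test_predictions(test_predict_args)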