Commit eae53086 authored by Patrick Scheibe

Fix the shit out of single_label_model.py

- Use StratifiedKFold instead of KFold so that each individual fold is stratified as well (sketched below)
- Fix a bug that would always down-sample when running the uncompressed FNN
- Fix all log outputs and string formatting
- Add a forgotten y_one_hot assignment in a conditional branch
- Make an unknown loss function fail explicitly
- Improve collecting the performance metrics for all targets and folds

God, I hope I didn't break anything. Looks good so far.
parent a0a4d014
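For context, a minimal sketch (toy data, not project code) of what the StratifiedKFold swap buys: it preserves the 0/1 ratio of the target in every fold, which plain KFold does not guarantee on unbalanced data.

```python
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

x = np.arange(20).reshape(-1, 1)  # 20 toy samples
y = np.array([0] * 16 + [1] * 4)  # unbalanced target: 16 zeros, 4 ones

for name, cv in [("KFold", KFold(n_splits=4, shuffle=True, random_state=42)),
                 ("StratifiedKFold", StratifiedKFold(n_splits=4, shuffle=True, random_state=42))]:
    # fraction of 1-labels in each test fold; stratified gives 0.2 everywhere
    print(name, [float(y[test].mean()) for _, test in cv.split(x, y)])
```

With only four positives, a plain KFold split can easily put zero or two of them into a single test fold, which skews per-fold metrics such as MCC.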
@@ -12,15 +12,15 @@ from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_curve
-from sklearn.model_selection import KFold
+from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, AlphaDropout
from tensorflow.keras.models import Model
# for NN model functions
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError
from tensorflow.keras.utils import to_categorical
from dfpl import callbacks as cb
@@ -43,23 +43,23 @@ def prepare_nn_training_data(df: pd.DataFrame, target: str, opts: options.Option
logging.info(
f" Your training data is extremely unbalanced ({target}): 0 - {vc[0]}, and 1 - {vc[1]} values.")
if opts.sampleDown:
logging.info(f" I will downsample your data")
logging.info(f" I will down-sample your data")
opts.sampleFractionOnes = allowed_imbalance
else:
logging.info(f" I will not downsample your data automatically.")
logging.info(f" I will not down-sample your data automatically.")
logging.info(f" Consider to enable down sampling of the 0 values with --sampleDown option.")
logging.info("Preparing training data matrices")
if opts.compressFeatures:
logging.info("Using compressed fingerprints")
df_fpc = df[df[target].notna() & df["fpcompressed"].notnull()]
logging.info(f"DataSet has {str(df_fpc.shape)} entries with not NA values in fpcompressed and {target}")
logging.info(f"DataSet has {df_fpc.shape[0]} valid entries in fpcompressed and {target}")
if opts.sampleDown:
assert 0.0 < opts.sampleFractionOnes < 1.0
logging.info(f"Using fractional sampling {opts.sampleFractionOnes}")
# how many ones
counts = df_fpc[target].value_counts()
logging.info(f"Number of fraction sampling values: {str(counts)}")
logging.info(f"Number of sampling values: {counts.to_dict()}")
# add sample of 0s to df of 1s
dfX = df_fpc[df_fpc[target] == 1].append(
@@ -77,11 +77,12 @@ def prepare_nn_training_data(df: pd.DataFrame, target: str, opts: options.Option
dtype=settings.nn_target_numpy_type,
copy=settings.numpy_copy_values
)
+y_one_hot = to_categorical(y)
else:
logging.info("Fraction sampling is OFF")
# how many ones, how many zeros
counts = df_fpc[target].value_counts()
logging.info(f"Number of values (total): 0 - {str(counts[0])}, 1 - {str(counts[1])}")
logging.info(f"Number of sampling values: {counts.to_dict()}")
x = np.array(
df_fpc["fpcompressed"].to_list(),
@@ -97,11 +97,11 @@ def prepare_nn_training_data(df: pd.DataFrame, target: str, opts: options.Option
else:
logging.info("Using uncompressed fingerprints")
df_fp = df[df[target].notna() & df["fp"].notnull()]
logging.info(f"DataSet has {str(df_fp.shape)} entries with not NA values in fpcompressed and {target}")
if opts.sampleFractionOnes:
logging.info(f"DataSet has {df_fp.shape[0]} valid entries in fpcompressed and {target}")
if opts.sampleDown:
logging.info(f"Using fractional sampling {opts.sampleFractionOnes}")
counts = df_fp[target].value_counts()
logging.info(f"Number of fraction sampling values: {str(counts)}")
logging.info(f"Number of sampling values: {counts.to_dict()}")
dfX = df_fp[df_fp[target] == 1.0].append(
df_fp[df_fp[target] == 0.0].sample(
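This hunk is the down-sampling bug from the commit message: `sampleFractionOnes` is a float, so `if opts.sampleFractionOnes:` is true for any non-zero value and the uncompressed branch always down-sampled; the guard has to be the boolean `sampleDown` flag. A self-contained illustration (the `Opts` dataclass below is a hypothetical stand-in, not the project's options class):

```python
from dataclasses import dataclass

@dataclass
class Opts:
    sampleDown: bool = False         # the user did NOT request down-sampling
    sampleFractionOnes: float = 0.5  # but the fraction still has a value

opts = Opts()
if opts.sampleFractionOnes:   # buggy guard: any non-zero float is truthy
    print("down-sampling although sampleDown is False")
if opts.sampleDown:           # fixed guard: respects the actual flag
    print("down-sampling requested")
```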
@@ -114,7 +115,7 @@ def prepare_nn_training_data(df: pd.DataFrame, target: str, opts: options.Option
logging.info("Fraction sampling is OFF")
# how many ones, how many zeros
counts = df_fp[target].value_counts()
logging.info(f"Number of values (total): 0 - {str(counts[0])}, 1 - {str(counts[1])}")
logging.info(f"Number of sampling values: {counts.to_dict()}")
x = np.array(df_fp["fp"].to_list(), dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values)
y = np.array(df_fp[target].to_list(), copy=settings.numpy_copy_values)
@@ -187,16 +188,21 @@ def build_snn_network(input_size: int, opts: options.Options) -> Model:
def define_single_label_model(input_size: int,
opts: options.Options) -> Model:
lf = dict({"mse": "mean_squared_error",
"bce": "binary_crossentropy"})
if opts.lossFunction == "bce":
loss_function = BinaryCrossentropy()
elif opts.lossFunction == "mse":
loss_function = MeanSquaredError()
else:
logging.error(f"Your selected loss is not supported: {opts.lossFunction}.")
sys.exit("Unsupported loss function")
if opts.optimizer == 'Adam':
my_optimizer = optimizers.Adam(learning_rate=opts.learningRate)
elif opts.optimizer == 'SGD':
my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9)
else:
logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.")
sys.exit("Unsupported optimizer.")
logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.")
sys.exit("Unsupported optimizer")
if opts.fnnType == "FNN":
model = build_fnn_network(input_size, opts)
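The loss-function change above replaces a plain dict lookup with an explicit branch: with the old `lf[opts.lossFunction]`, an unsupported name only surfaced as a bare `KeyError` inside `model.compile(...)`, while the new code logs a readable message and exits before any model is built. A condensed sketch of the two behaviors (standalone, with hypothetical helper names):

```python
import logging
import sys

SUPPORTED = {"bce": "binary_crossentropy", "mse": "mean_squared_error"}

def old_style(name: str) -> str:
    # unknown name -> bare KeyError, raised deep inside model.compile()
    return SUPPORTED[name]

def new_style(name: str) -> str:
    # fail fast with an explicit, logged error instead
    if name not in SUPPORTED:
        logging.error(f"Your selected loss is not supported: {name}.")
        sys.exit("Unsupported loss function")
    return SUPPORTED[name]
```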
@@ -204,11 +210,10 @@ def define_single_label_model(input_size: int,
model = build_snn_network(input_size, opts)
else:
raise ValueError(f"Option FNN Type is not \"FNN\" or \"SNN\", but {opts.fnnType}.")
logging.info(f"Network type: {opts.fnnType}")
model.summary(print_fn=logging.info)
# compile model
-model.compile(loss=lf[opts.lossFunction],
+model.compile(loss=loss_function,
optimizer=my_optimizer,
metrics=['accuracy',
metrics.AUC(),
@@ -221,29 +226,43 @@ def define_single_label_model(input_size: int,
def evaluate_model(x_test: np.ndarray, y_test: np.ndarray, file_prefix: str, model: Model,
target: str, fold: int) -> pd.DataFrame:
name = path.basename(file_prefix).replace("_", " ")
-performance = pd.DataFrame()
logging.info(f"Evaluating trained model '{name}' on test data")
# predict test set to compute MCC, AUC, ROC curve, etc. (see below)
-y_predict = model.predict(x_test).flatten()
-y_predict_int = [int(round(y)) for y in y_predict]
-y_test_int = [int(y) for y in y_test]
-pd.DataFrame({"y_true": y_test, "y_predicted": y_predict,
-              "y_true_int": y_test_int, "y_predicted_int": y_predict_int,
-              "target": target, "fold": fold}). \
-    to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.csv")
+# TODO: Introduce thresholds different from 0.5!
+y_predict = model.predict(x_test)
+y_predict_int = np.argmax(y_predict, axis=1)
+y_test_int = np.argmax(y_test, axis=1)
+(pd
+ .DataFrame({
+     "y_true": y_test_int,
+     "y_predicted": y_predict[:, 1],
+     "y_predicted_int": y_predict_int,
+     "target": target,
+     "fold": fold})
+ .to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.csv")
+)
# compute the metrics that depend on the class label
precision_recall = classification_report(y_test_int, y_predict_int, output_dict=True)
prf = pd.DataFrame.from_dict(precision_recall)[['0', '1']]
prf.to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.prec_rec_f1.csv")
-# validation loss, accuracy
-loss, acc = tuple(model.evaluate(x=x_test, y=y_test)[:2])
-# MCC
+# validation loss, accuracy, auc, precision, recall
+loss, acc, auc_value, precision, recall = tuple(model.evaluate(x=x_test, y=y_test))
+logging.info(f"Loss: {round(loss, 4)}")
+logging.info(f"Accuracy: {round(acc, 4)}")
+logging.info(f"AUC: {round(auc_value, 4)}")
+logging.info(f"Precision: {round(precision, 4)}")
+logging.info(f"Recall: {round(recall, 4)}")
MCC = matthews_corrcoef(y_test_int, y_predict_int)
logging.info(f"MCC: {round(MCC, 4)}")
# generate the AUC-ROC curve data from the validation data
FPR, TPR, thresholds_keras = roc_curve(y_true=y_test_int,
-y_score=y_predict_int,
+y_score=y_predict[:, 1],
drop_intermediate=False)
AUC = auc(FPR, TPR)
# save AUC data to csv
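The rewritten evaluation assumes one-hot encoded targets (see the added `y_one_hot = to_categorical(y)` above): hard labels are recovered with `argmax`, and the ROC curve now gets the positive-class probability `y_predict[:, 1]` instead of the rounded labels. A toy round-trip, independent of any trained model:

```python
import numpy as np
from sklearn.metrics import auc, roc_curve

# stand-ins for one-hot y_test and the softmax output of model.predict(x_test)
y_test = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
y_predict = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])

y_test_int = np.argmax(y_test, axis=1)        # -> [0, 1, 1, 0]
y_predict_int = np.argmax(y_predict, axis=1)  # hard labels at threshold 0.5
# ROC needs a continuous score for class 1, not the hard labels:
fpr, tpr, _ = roc_curve(y_true=y_test_int, y_score=y_predict[:, 1],
                        drop_intermediate=False)
print(auc(fpr, tpr))  # 1.0 on this perfectly ranked toy data
```

Feeding `y_predict_int` to `roc_curve`, as the old code did, collapses the curve to a single threshold and systematically underestimates the AUC.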
@@ -259,12 +278,8 @@ def evaluate_model(x_test: np.ndarray, y_test: np.ndarray, file_prefix: str, mod
cfm = confusion_matrix(y_true=y_test_int,
y_pred=y_predict_int)
logging.info(f"Model '{name}' (trained) evaluation on test data")
logging.info(f"[loss, accuracy, precision, recall]: "
f"{[round(i, ndigits=4) for i in performance]}")
logging.info('MCC: ' + str(MCC.__round__(2)))
logging.info('CFM: \n\tTN=' + str(cfm[0][0]) + '\tFP=' + str(cfm[0][1]) + '\n\tFN=' + str(cfm[1][0]) +
'\tTP=' + str(cfm[1][1]))
logging.info(f"TP/FP: {cfm[1][1]}/{cfm[0][1]}")
logging.info(f"TN/FN: {cfm[0][0]}/{cfm[1][0]}")
return pd.DataFrame.from_dict({'p_0': prf['0']['precision'],
'r_0': prf['0']['recall'],
@@ -282,10 +297,9 @@ def evaluate_model(x_test: np.ndarray, y_test: np.ndarray, file_prefix: str, mod
def fit_and_evaluate_model(x_train: np.ndarray, x_test: np.ndarray, y_train: np.ndarray, y_test: np.ndarray,
fold: int, target: str, opts: options.Options) -> pd.DataFrame:
logging.info("Training of fold number:" + str(fold))
logging.info(f"The distribution of 0 and 1 values is:")
logging.info(f"\ttrain data:\t{pd.DataFrame(y_train)[0].value_counts().to_list()}")
logging.info(f"\ttest data:\t{pd.DataFrame(y_test)[0].value_counts().to_list()}")
logging.info(f"Training of fold number: {fold}")
logging.info(f"Training sample distribution: train data: {pd.DataFrame(y_train)[0].value_counts().to_dict()}"
f"test data: {pd.DataFrame(y_test)[0].value_counts().to_dict()}")
model_file_prefix = path.join(opts.outputDir, f"{target}_single-labeled_Fold-{fold}")
@@ -303,7 +317,6 @@ def fit_and_evaluate_model(x_train: np.ndarray, x_test: np.ndarray, y_train: np.
epochs=opts.epochs,
batch_size=opts.batchSize,
verbose=opts.verbose,
-# validation_split=opts.testSize
validation_data=(x_test, y_test)
)
trainTime = str(round((time() - start) / 60, ndigits=2))
@@ -337,17 +350,16 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
names_y = [c for c in df.columns if c not in ['cid', 'ID', 'id', 'mol_id', 'smiles', 'fp', 'inchi', 'fpcompressed']]
if opts.wabTracking:
# For W&B tracking, we only train one target.
# We love to have the ER target, but in case it's not there, we use the first one available
# For W&B tracking, we only train one target that's specified as wabTarget "ER".
# In case it's not there, we use the first one available
if opts.wabTarget in names_y:
names_y = [opts.wabTarget]
else:
logging.error(f"The specified wabTarget for Weights & Biases tracking does not exist: {opts.wabTarget}")
names_y = [names_y[0]]
-model_evaluation = pd.DataFrame(
-    columns=["p_0", "r_0", "f1_0", "p_1", "r_1", "f1_1",  # class label dependent metrics
-             "loss", "accuracy", "MCC", "AUC", "target", "fold"])  # total metrics
+# Collect metrics for each fold and target
+performance_list = []
# For each individual target train a model
for target in names_y: # [:1]:
@@ -366,6 +378,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
test_size=opts.testSize,
stratify=y,
random_state=1)
logging.info(f"Splitting train/test data with fixed random initializer")
else:
x_train, x_test, y_train, y_test = train_test_split(x, y,
test_size=opts.testSize,
@@ -373,18 +386,16 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
performance = fit_and_evaluate_model(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test,
fold=0, target=target, opts=opts)
-model_evaluation = model_evaluation.append(performance, ignore_index=True)
+performance_list.append(performance)
# save complete model
-trained_model = define_single_label_model(input_size=x[0].__len__(),
-                                          opts=opts)
-trained_model.load_weights(path.join(opts.outputDir,
-                                     f"{target}_single-labeled_Fold-0.model.weights.hdf5"))
-# create output directory and store complete model
-trained_model.save(filepath=path.join(opts.outputDir,
-                                      f"{target}_saved_model"))
+trained_model = define_single_label_model(input_size=len(x[0]), opts=opts)
+trained_model.load_weights(path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5"))
+trained_model.save(filepath=path.join(opts.outputDir, f"{target}_saved_model"))
elif 1 < opts.kFolds < int(x.shape[0] / 100):
# do a kfold cross-validation
-kfold_c_validator = KFold(n_splits=opts.kFolds, shuffle=True, random_state=42)
+kfold_c_validator = StratifiedKFold(n_splits=opts.kFolds, shuffle=True, random_state=42)
fold_no = 1
# split the data
for train, test in kfold_c_validator.split(x, y):
@@ -394,31 +405,37 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
performance = fit_and_evaluate_model(x_train=x[train], x_test=x[test],
y_train=y[train], y_test=y[test],
fold=fold_no, target=target, opts=opts)
-model_evaluation = model_evaluation.append(performance, ignore_index=True)
+performance_list.append(performance)
fold_no += 1
# now next fold
# select and copy best model - how to define the best model?
-best_fold = model_evaluation.sort_values(by=['p_1', 'r_1', 'MCC'],
-                                         ascending=False,
-                                         ignore_index=True)['fold'][0]
+best_fold = (
+    pd
+    .concat(performance_list, ignore_index=True)
+    .sort_values(
+        by=['p_1', 'r_1', 'MCC'],
+        ascending=False,
+        ignore_index=True)['fold'][0]
+)
# copy checkpoint model weights
shutil.copy(
src=path.join(opts.outputDir, f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5"),
dst=path.join(opts.outputDir, f"{target}_single-labeled_Fold-{best_fold}.best.model.weights.hdf5"))
# save complete model
-best_model = define_single_label_model(input_size=x[0].__len__(),
-                                       opts=opts)
+best_model = define_single_label_model(input_size=len(x[0]), opts=opts)
best_model.load_weights(path.join(opts.outputDir,
f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5"))
-# create output directory and store complete model
-best_model.save(filepath=path.join(opts.outputDir,
-                                   f"{target}_saved_model"))
+best_model.save(filepath=path.join(opts.outputDir, f"{target}_saved_model"))
else:
logging.info("Your selected number of folds for Cross validation is out of range. "
"It must be 1 or smaller than 1 hundredth of the number of samples.")
-exit(1)
+sys.exit("Number of folds out of range")
# now next target
# store the evaluation data of all trained models (all targets, all folds)
-model_evaluation.to_csv(path_or_buf=path.join(opts.outputDir, 'single_label_model.evaluation.csv'))
+(pd
+ .concat(performance_list, ignore_index=True)
+ .to_csv(path_or_buf=path.join(opts.outputDir, 'single_label_model.evaluation.csv'))
+)
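The collection pattern introduced across the last hunks, in isolation: each fold contributes one metrics DataFrame to a plain Python list, and a single `pd.concat` at the end replaces the repeated `DataFrame.append` calls (which pandas has since deprecated). A minimal sketch with made-up metric values:

```python
import pandas as pd

performance_list = []
for fold in range(3):
    # one single-row frame of metrics per fold (toy values)
    performance_list.append(pd.DataFrame({"MCC": [0.5 + 0.1 * fold],
                                          "fold": [fold]}))

model_evaluation = pd.concat(performance_list, ignore_index=True)
best_fold = model_evaluation.sort_values(by=["MCC"], ascending=False,
                                         ignore_index=True)["fold"][0]
print(best_fold)  # -> 2, the fold with the highest MCC
```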