# dfplmodule.py
# Python module for deepFPlearn tools

import argparse
import csv
import math
import re
import shutil
from time import time

import numpy as np
import pandas as pd

import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.colors import LinearSegmentedColormap
#%matplotlib inline
# for drawing the heatmaps
import seaborn as sns

# for fingerprint generation
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.AtomPairs import Torsions

# for NN model functions
from keras.models import Sequential, Model, model_from_json
from keras.layers import Input, Dense, Dropout
from keras import regularizers
from keras import optimizers
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# for data splitting and metrics
import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_curve, confusion_matrix, auc, matthews_corrcoef

###############################################################################
# GENERAL FUNCTIONS --------------------------------------------------------- #

def gather(df, key, value, cols):
    """
    Simple emulation of R's gather() function using pandas' melt() in python.

    :param df: The data frame
    :param key: The key column name
    :param value: The value column name
    :param cols: List of column names to gather
    :return: The molten data frame in long format
    """
    id_vars = [col for col in df.columns if col not in cols]
    return pd.melt(df, id_vars=id_vars, value_vars=cols, var_name=key, value_name=value)
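
# Illustrative usage of gather (a sketch with toy data; 'AR'/'ER' are
# hypothetical target columns):
#
#   df = pd.DataFrame({'id': [1, 2], 'AR': [0, 1], 'ER': [1, 1]})
#   long = gather(df, key='target', value='association', cols=['AR', 'ER'])
#   # -> 4 rows with columns: id, target, association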

# ------------------------------------------------------------------------------------- #

def shuffleDataPriorToTraining(x, y):
    """
    Returns a row-shuffled variant of the input x and y matrices, with rows
    containing NAs removed.

    :param x: Feature matrix
    :param y: Outcome matrix
    :return: Tuple of the shuffled (x, y)
    """

    # merge x and y
    df0 = pd.concat([x.reset_index(drop=True), y.reset_index(drop=True)], axis=1)
    # shuffle rows, drop NAs, reset index
    df1 = df0.sample(frac=1).dropna(axis=0).reset_index(drop=True)

    return (df1.iloc[:, 0:x.shape[1]], df1.iloc[:, x.shape[1]:])

    #return gather(df0, key="target", value="association",
    #             cols=y.columns)
# ------------------------------------------------------------------------------------- #

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
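
# Illustrative usage of str2bool with argparse (a sketch; the parser and the
# '--shuffle' flag are hypothetical):
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--shuffle', type=str2bool, nargs='?', const=True, default=False)
#   args = parser.parse_args(['--shuffle', 'yes'])   # args.shuffle == True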

# ------------------------------------------------------------------------------------- #

def smi2fp(smile, fptype, size=2048):
    """
    Convert a SMILES string to a fingerprint object of a certain type using functions
    from the RDKIT python library.

    :param smile: A single SMILES string
    :param fptype: The type of fingerprint to which the SMILES should be converted. Valid
                   values are: 'topological' (default), 'MACCS'
Jana Schor's avatar
Jana Schor committed
111
112
    :return: A fingerprint object or None, if fingerprint object could not be created
    (Respective error message is provided in STDERR).
113
114
    """
    # generate a mol object from the SMILES string

    print(smile)
    cs = None
    # first transform to canonical SMILES
    try:
        cs = Chem.CanonSmiles(smile)
    except Exception:
        print(f'[WARNING]: Not able to transform your SMILES to its canonical version: {smile}')
    if not cs:
        return None

    mol = None
    try:
        mol = Chem.MolFromSmiles(cs)
    except Exception:
        print(
            f'[WARNING]: Not able to extract molecule from (canonically transformed) SMILES: {cs}\n'
            f'          Original SMILES: {smile}')
    if not mol:
        return None

    if fptype == 'topological':  # 2048 bits
        # Topological Fingerprints:
        # The fingerprinting algorithm used is similar to that used in the Daylight
        # fingerprinter: it identifies and hashes topological paths (e.g. along bonds)
        # in the molecule and then uses them to set bits in a fingerprint of user-specified
        # lengths. After all paths have been identified, the fingerprint is typically
        # folded down until a particular density of set bits is obtained.
        try:
            return Chem.RDKFingerprint(mol, fpSize=size)
        except Exception:
            print('SMILES not convertible to topological fingerprint:')
            print('SMILES: ' + smile)
            return None

    elif fptype == 'MACCS':
        # MACCS Keys:
        # There is a SMARTS-based implementation of the 166 public MACCS keys.
        # The MACCS keys were critically evaluated and compared to other MACCS
        # implementations in Q3 2008. In cases where the public keys are fully defined,
        # things looked pretty good.

        try:
            return MACCSkeys.GenMACCSKeys(mol)
        except Exception:
            print('SMILES not convertible to MACCS fingerprint:')
            print('SMILES: ' + smile)
            return None

    elif fptype == 'atompairs':
        # Atom Pairs:
        # Atom-pair descriptors [3] are available in several different forms.
        # The standard form is as fingerprint including counts for each bit instead
        # of just zeros and ones. Nevertheless we use the BitVect variant here.

        try:
            return Pairs.GetAtomPairFingerprintAsBitVect(mol)
            # counts of features are also possible here! needs to be parsed differently
            # fps.update({i:Pairs.GetAtomPairFingerprintAsIntVect(mols[i])})
        except Exception:
            print('SMILES not convertible to atompairs fingerprint:')
            print('SMILES: ' + smile)
            return None

    else:
        # Topological Torsions:
        # At the time of this writing, topological torsion fingerprints have too
        # many bits to be encodeable using the BitVector machinery, so there is no
        # GetTopologicalTorsionFingerprintAsBitVect function.

        try:
            return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
        except Exception:
            print('SMILES not convertible to torsions fingerprint:')
            print('SMILES: ' + smile)
            return None
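
# Illustrative usage of smi2fp (a sketch; 'CCO' is just ethanol, standing in
# for real input data):
#
#   fp = smi2fp('CCO', fptype='MACCS')
#   if fp is not None:
#       print(fp.ToBitString())   # string of 0/1 characters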

# ------------------------------------------------------------------------------------- #

def XfromInput(csvfilename, rtype, fptype, printfp=False, retNames=False, size=2048, verbose=2):
    """
    Return the matrix of features for training and testing NN models (X) as numpy array.
    Provided SMILES are transformed to fingerprints, fingerprint strings are then split
    into vectors and added as row to the array which is returned.

    :param csvfilename: Filename of CSV files containing the training data. The
    SMILES/Fingerprints are stored 1st column
212
    :param rtype: Type of structure represetation. Valid values are: 'fp' and 'smile'
213
214
215
    :param fptype: Type of fingerprint to be generated out
    :param printfp: Print generated fingerprints to file, namely the input file with the
    file ending '.fingerprints.csv'. Default:False
216
217
    :return: A pandas dataframe containing the X matrix for training a NN model,
    rownames/numbers of rows, colnames are the positions of the fp vector.
218
219
220
221
    """

    # dict to store the fingerprints
    fps = {}
    rows = {}  # remove this from memory!
    rnames = []

    # read csv and generate/add fingerprints to dict
    with open(csvfilename, 'r') as f:
        reader = csv.DictReader(f, delimiter=',')
        names = reader.fieldnames
        #print(names)
        feature = names[names.index(rtype)]  # rtype column ('smiles' or 'fp')

        if 'id' in names:
            rnameIDX = names[names.index('id')]
        else:
            rnameIDX = None

        i = 0

        for row in reader:
            #if i==5:
            #    break

            #print(f'i={i}: {row}')
            if printfp:
                rows.update({i:row})
            #print(rnames[i] + ' ' + row[feature])

            # add fp or smile
            if rtype == 'fp':
                # type == fp, fine - add it
                # add rowname or number
                if rnameIDX is None:
                    rnames.append(str(i))
                else:
                    rnames.append(row['id'])
                # add fingerprint to dictionary
                fps.update({i: row[feature]})
                i = i + 1
            else:
                # smiles, need to be converted to fp first
                fp = None
                fptmp = smi2fp(smile=row[feature], fptype=fptype, size=size)
                if fptmp:
                    fp = fptmp.ToBitString()

                if fp:
                    # add rowname or number
                    if rnameIDX is None:
                        rnames.append(str(i))
                    else:
                        rnames.append(row['id'])

                    # add fingerprint to dictionary
                    fps.update({i: fp})
                    i = i + 1
            #print(f' .. Row done.\n')

    # split all fingerprints into vectors
    Nrows = len(fps)
    Ncols = len(fps[0])
    #Ncols = len(DataStructs.BitVectToText(fps[0]))
    if verbose > 0:
        print(f'[INFO] Returned # of fingerprints: {Nrows}')

    # Store all fingerprints in numpy array
    x = np.empty((Nrows, Ncols), int)

    if printfp:
        csvoutfilename = csvfilename.replace(".csv", "." + str(size) + ".fingerprints.csv")

        fnames = names.copy()
        fnames.append('fp')
        with open(csvoutfilename, 'w') as fout:
            writer = csv.DictWriter(fout, fieldnames=fnames)
            writer.writeheader()

            for i in fps:
                #fp=DataStructs.BitVectToText(fps[i])
                fp = fps[i]
                rows[i]['fp'] = fp
                writer.writerow(rows[i])
                x[i] = list(map(int, [char for char in fp]))

        del rows
    else:
        for i in fps:
            # get fingerprint as string
            #fp=DataStructs.BitVectToText(fps[i])
            fp = fps[i]
            # split fp into list of integers
            x[i] = list(map(int, [char for char in fp]))

    pdx = pd.DataFrame(data=x, index=rnames)

    return pdx
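
# Illustrative usage of XfromInput (a sketch; 'train.csv' is a hypothetical
# input file with a 'smiles' column followed by outcome columns):
#
#   X = XfromInput('train.csv', rtype='smiles', fptype='topological', size=2048)
#   print(X.shape)   # (number of valid compounds, 2048)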


# ------------------------------------------------------------------------------------- #

def YfromInput(csvfilename):
    """
    Extract the matrix of outcomes for training/testing NN models that belongs to the
    feature matrix.

    :param csvfilename: Filename of the comma separated CSV file containing the training
    data. Target associations start in the 2nd column.
    :return: A pandas dataframe containing the Y matrix for training a NN model including
    the names of the targets (each column is a different target)
    """

    df = pd.read_csv(csvfilename)
    y = df[df.columns[1:]]

    return y
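
# Illustrative usage of YfromInput on the same hypothetical file:
#
#   Y = YfromInput('train.csv')
#   print(Y.columns)   # names of the targets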

# ------------------------------------------------------------------------------------- #

def TrainingDataHeatmap(x, y):
    """
    (Work in progress) Visualize the training data as a clustered heatmap.

    :param x: Feature matrix
    :param y: Outcome matrix
    :return: 1
    """

    x['ID'] = x.index
    y['ID'] = y.index

    #xy = pd.merge(x,y,on="ID")

    # clustermap dies because of too many iterations..
    #sns.clustermap(x, metric="correlation", method="single", cmap="Blues", standard_scale=1) #, row_colors=row_colors)

    # try to cluster prior to viz
    # check this out
    # https://stackoverflow.com/questions/2455761/reordering-matrix-elements-to-reflect-column-and-row-clustering-in-naiive-python

    # viz using matplotlib heatmap https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html

    return 1

# ------------------------------------------------------------------------------------- #
def removeDuplicates(x, y):
    """
    Remove duplicated feature - outcome pairs from feature matrix and outcome vector combination.
    :param x: Feature matrix
    :param y: Outcome vector

    :return: Tuple of unique feature matrix and outcome vector combinations
    """

    # add y as additional column to x
    joined = np.append(x, [[v] for v in y], axis=1)
    # merge all columns into one string per row

    fpstrings = []
    for i in range(0, len(y)):
        fpstrings.append(["".join([str(int(c)) for c in joined[i]])])

    fpstrings_unique = np.unique(fpstrings, return_index=True)

    return (x[fpstrings_unique[1]], y[fpstrings_unique[1]])
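
# Illustrative usage of removeDuplicates (a sketch with toy data):
#
#   X = np.array([[0, 1], [0, 1], [1, 1]])
#   y = np.array([1, 1, 0])
#   Xu, yu = removeDuplicates(X, y)   # drops the duplicated ([0, 1], 1) pair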

# ------------------------------------------------------------------------------------- #

def defineCallbacks(checkpointpath, patience, rlrop=False, rlropfactor=0.1, rlroppatience=50):
    """

382
383
384
385
386
387
    :param checkpointpath:
    :param patience:
    :param rlrop:
    :param rlropfactor:
    :param rlroppatience:
    :return:
388
389
    """

    # enable this checkpoint to restore the weights of the best performing model
    checkpoint = ModelCheckpoint(checkpointpath, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min', save_weights_only=True)

    # enable early stopping if val_loss is not improving anymore
    earlystop = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=patience,
                              verbose=1,
                              restore_best_weights=True)

    if rlrop:
        rlrop_cb = ReduceLROnPlateau(monitor='val_loss', factor=rlropfactor, patience=rlroppatience)
        callbacks = [checkpoint, earlystop, rlrop_cb]
    else:
        callbacks = [checkpoint, earlystop]

    # return the list of callbacks collected for training
    return callbacks
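
# Illustrative usage of defineCallbacks (a sketch; the checkpoint path is
# hypothetical):
#
#   cb = defineCallbacks('/tmp/model.checkpoint.hdf5', patience=20,
#                        rlrop=True, rlropfactor=0.2, rlroppatience=30)
#   # then: model.fit(..., callbacks=cb)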

# ------------------------------------------------------------------------------------- #

def defineNNmodelMulti(inputSize=2048, outputSize=None, l2reg=0.001, dropout=0.2,
                       activation='relu', optimizer='Adam', lr=0.001, decay=0.01):
    """
    Define the multi-label classification DNN: dense hidden layers of halving size
    with dropout, and one sigmoid output unit per target.

    :param inputSize: Size of the input layer (length of the fingerprint vector)
    :param outputSize: Number of output units (targets)
    :param l2reg: L2 regularization factor
    :param dropout: Dropout fraction after the first hidden layer
    :param activation: Activation function of the hidden layers
    :param optimizer: 'Adam', 'SGD', or a Keras optimizer instance
    :param lr: Learning rate
    :param decay: Learning rate decay
    :return: The compiled Keras model
    """

    if optimizer == 'Adam':
        myoptimizer = optimizers.Adam(learning_rate=lr, decay=decay)  #, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif optimizer == 'SGD':
        myoptimizer = SGD(lr=lr, momentum=0.9, decay=decay)
    else:
        myoptimizer = optimizer

    nhl = int(math.log2(inputSize) / 2 - 1)

    model = Sequential()
    # From input to 1st hidden layer
    model.add(Dense(units=int(inputSize / 2), input_dim=inputSize,
                    activation=activation,
                    kernel_regularizer=regularizers.l2(l2reg)))
    model.add(Dropout(dropout))
    # next hidden layers
    for i in range(1, nhl):
        factorunits = 2 ** (i + 1)
        factordropout = 2 * i
        model.add(Dense(units=int(inputSize / factorunits),
                        activation=activation,
                        kernel_regularizer=regularizers.l2(l2reg)))
        model.add(Dropout(dropout / factordropout))
    # multi-class output layer
    # use sigmoid to get independent probabilities for each output node
    # (need not add up to one, as they would using softmax)
    # https://www.depends-on-the-definition.com/guide-to-multi-label-classification-with-neural-networks/
    model.add(Dense(units=outputSize, activation='sigmoid'))

    model.summary()

    # compile model
    model.compile(loss="binary_crossentropy", optimizer=myoptimizer, metrics=['accuracy'])

    return model
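
# Worked example of the layer sizing above (assuming inputSize=2048):
# nhl = int(log2(2048)/2 - 1) = int(4.5) = 4 hidden layers with
# 1024, 512, 256 and 128 units, followed by the sigmoid output layer.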

# ------------------------------------------------------------------------------------- #

def defineNNmodel(inputSize=2048, l2reg=0.001, dropout=0.2, activation='relu', optimizer='Adam', lr=0.001, decay=0.01):
    """
    Define the single-target classification DNN: dense hidden layers of halving
    size with dropout, and a single sigmoid output unit.

    :param inputSize: Size of the input layer (length of the fingerprint vector)
    :param l2reg: L2 regularization factor
    :param dropout: Dropout fraction after the first hidden layer
    :param activation: Activation function of the hidden layers
    :param optimizer: 'Adam', 'SGD', or a Keras optimizer instance
    :param lr: Learning rate
    :param decay: Learning rate decay
    :return: The compiled Keras model, or None if inputSize is not supported
    """

    if optimizer == 'Adam':
        myoptimizer = optimizers.Adam(learning_rate=lr, decay=decay)  #, beta_1=0.9, beta_2=0.999, amsgrad=False)
    elif optimizer == 'SGD':
        myoptimizer = SGD(lr=lr, momentum=0.9, decay=decay)
    else:
        myoptimizer = optimizer

    myhiddenlayers = {"2048":6, "1024":5, "999":5, "512":4, "256":3}

    if not str(inputSize) in myhiddenlayers.keys():
        print("Wrong inputSize. Must be one of {2048, 1024, 999, 512, 256}.")
        return None

    nhl = int(math.log2(inputSize)/2 - 1)

    model = Sequential()
    # From input to 1st hidden layer
    model.add(Dense(units=int(inputSize/2), input_dim=inputSize,
                    activation=activation,
                    kernel_regularizer=regularizers.l2(l2reg)))
    model.add(Dropout(dropout))
    # next hidden layers
    for i in range(1,nhl):
        factorunits = 2**(i+1)
        factordropout = 2*i
        model.add(Dense(units=int(inputSize/factorunits),
                        activation=activation,
                        kernel_regularizer=regularizers.l2(l2reg)))
        model.add(Dropout(dropout/factordropout))
    # output layer
    model.add(Dense(units=1, activation='sigmoid'))

    model.summary()

    # compile model
    model.compile(loss="mse", optimizer=myoptimizer, metrics=['accuracy'])

    return model
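
# Illustrative usage of defineNNmodel (a sketch; X_train/y_train are assumed
# to exist):
#
#   model = defineNNmodel(inputSize=2048)
#   # model.fit(X_train, y_train, epochs=50, batch_size=128)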

# ------------------------------------------------------------------------------------- #

def autoencoderModel(input_size=2048, encoding_dim=256, myactivation='relu', myloss='binary_crossentropy', myregularization=1e-3, mylr=0.001, mydecay=0.01):
    """
    This function provides an autoencoder model to reduce a certain input to a compressed version.

    :param input_size: Size of the input. Default: 2048
    :param encoding_dim: Size of the compressed representation. Default: 256
    :param myactivation: Activation function, see Keras activation functions for potential values. Default: relu
    :param myloss: Loss function, see Keras loss functions for potential values. Default: binary_crossentropy
    :param myregularization: Regularization factor (currently unused). Default: 1e-3
    :param mylr: Learning rate of the Adam optimizer. Default: 0.001
    :param mydecay: Learning rate decay of the Adam optimizer. Default: 0.01
    :return: a tuple of autoencoder and encoder models
    """

    myoptimizer = optimizers.Adam(learning_rate=mylr, decay=mydecay)  # , beta_1=0.9, beta_2=0.999, amsgrad=False)

    # get the number of meaningful hidden layers (latent space included)
    nhl = round(math.log2(input_size/encoding_dim))

    # the input placeholder
    input_vec = Input(shape=(input_size,))

    # 1st hidden layer, that receives weights from input layer
    # equals the bottleneck layer, if nhl==1!
    encoded = Dense(units=int(input_size/2), activation=myactivation)(input_vec)

    if nhl > 1:
        # encoding layers, incl. bottleneck
        for i in range(1, nhl):
            factorunits = 2 ** (i + 1)
            #print(f'{factorunits}: {int(input_size / factorunits)}')
            encoded = Dense(units=int(input_size / factorunits), activation=myactivation)(encoded)

#        encoding_dim = int(input_size/factorunits)

        # 1st decoding layer
        factorunits = 2 ** (nhl - 1)
        decoded = Dense(units=int(input_size/factorunits), activation=myactivation)(encoded)

        # decoding layers
        for i in range(nhl-2, 0, -1):
            factorunits = 2 ** i
            #print(f'{factorunits}: {int(input_size/factorunits)}')
            decoded = Dense(units=int(input_size/factorunits), activation=myactivation)(decoded)

        # output layer
        # The output layer needs to predict the probability of an output which needs to be either 0 or 1, hence we use the sigmoid activation function.
        decoded = Dense(units=input_size, activation='sigmoid')(decoded)

    else:
        # output layer
        decoded = Dense(units=input_size, activation='sigmoid')(encoded)

    autoencoder = Model(input_vec, decoded)
    encoder = Model(input_vec, encoded)

    autoencoder.summary()
    encoder.summary()

    # We compile the autoencoder model with adam optimizer.
    # As fingerprint positions have a value of 0 or 1 we use binary_crossentropy as the loss function
    autoencoder.compile(optimizer=myoptimizer, loss=myloss)

    return (autoencoder, encoder)
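
# Worked example of the autoencoder layout above (assuming input_size=2048,
# encoding_dim=256): nhl = round(log2(2048/256)) = 3, giving encoder layers
# of 1024, 512 and 256 (bottleneck) units and decoder layers of 512, 1024
# and 2048 (output) units.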

# ------------------------------------------------------------------------------------- #

def predictValues(acmodelfilepath, modelfilepath, pdx):
    """
Jana Schor's avatar
Jana Schor committed
575
    Predict a set of chemicals using a selected model.
576

Jana Schor's avatar
Jana Schor committed
577
    :param modelfilepath: Path to the model weights of the prediction DNN
Jana Schor's avatar
Jana Schor committed
578
579
580
581
    :param pdx: A matrix containing the fingerprints of the chemicals, generated via XfromInput function
    :return: A dataframe of 2 columns: random - predicted values using random model, trained - predicted values
    using trained model. Rownames are consecutive numbers of the input rows, or if provided in the input file
    the values of the id column
582
    """

    #acmodelfilepath="/data/bioinf/projects/data/2019_IDA-chem/deepFPlearn/modeltraining/model.2048.256.ER.checkpoint.AC-model.hdf5"
    #modelfilepath  ="/data/bioinf/projects/data/2019_IDA-chem/deepFPlearn/modeltraining/model.2048.256.ER.checkpoint.model.hdf5"

    print(f"[INFO:] Loaded model weights for autoencoder: '{modelfilepath}'")
    print(f"[INFO:] Loaded model weights for prediction DNN: '{modelfilepath}'")

    start = time()
    # create autoencoder
    (autoencoder, encoder) = autoencoderModel(input_size=pdx.shape[1], encoding_dim=256)
    # load AC weights
    autoencoder.load_weights(acmodelfilepath)
    # encode input
    encodedX = encoder.predict(pdx)

    trainTime = str(round((time() - start) / 60, ndigits=2))
    print(f"[INFO:] Computation time used for encoding: {trainTime} min")

    start = time()
    # create DNN
    predictor = defineNNmodel(inputSize=encodedX.shape[1])
    # predict with random weights
    predictions_random = predictor.predict(encodedX)
    # load DNN weights
    predictor.load_weights(modelfilepath)
    # predict encoded input
    predictions = predictor.predict(encodedX)

    trainTime = str(round((time() - start) / 60, ndigits=2))
    print(f"[INFO:] Computation time used for the predictions: {trainTime} min")

    df = pd.DataFrame(data={'random':predictions_random.flatten(),
                            'trained':predictions.flatten()},
                      columns=['random','trained'],
                      index=pdx.index)
    print(df)
    return df

# ------------------------------------------------------------------------------------- #

#def trainAutoencoder(checkpointpath, X_train, X_test, y_train, y_test, epochs, enc_dim, split, verbose, kfold):
def trainAutoencoder(checkpointpath, Xtrain, Xtest, epochs, enc_dim, verbose, fold):
    """

    :param checkpointpath:
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :param epochs:
    :return:
    """

    #checkpointpath = checkpointpath.replace(".hdf5", "_Fold-" + str(fold) + ".hdf5")

    # collect the callbacks for training
    callback_list = defineCallbacks(checkpointpath=checkpointpath, patience=20, rlrop=False)
    # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!)
    (autoencoder, encoder) = autoencoderModel(input_size=Xtrain.shape[1], encoding_dim=enc_dim)

    # Fit the AC
    autohist = autoencoder.fit(Xtrain, Xtrain,
                               callbacks=callback_list,
                               epochs=epochs, batch_size=128, shuffle=True, verbose=verbose,
                               validation_data=(Xtest, Xtest))
    # history
    ac_loss = autohist.history['loss']
    ac_val_loss = autohist.history['val_loss']
    ac_epochs = range(len(ac_loss))
    # generate a figure of the losses for this fold
    plt.figure()
    plt.plot(ac_epochs, ac_loss, 'bo', label='Training loss')
    plt.plot(ac_epochs, ac_val_loss, 'b', label='Validation loss')
    plt.title(f'Training and validation loss of AC (Fold = {fold})')
    plt.legend()
    plt.savefig(fname=checkpointpath.replace(".hdf5",
                                             "_trainValLoss_AC_Fold-" + str(fold) + ".svg"),
                format='svg')
    plt.close()
    # write the losses to .csv file for later data visualization
    df = pd.DataFrame(data={'loss': ac_loss, 'val_loss': ac_val_loss, 'epoch': ac_epochs})
    df.to_csv(checkpointpath.replace(".hdf5",
                                     "_trainValLoss_AC_Fold-" + str(fold) + ".csv"))

    # model needs to be saved and restored when predicting new input!
    # use encode() of train data as input for DL model to associate to chemical
    return (encoder.predict(Xtrain), encoder.predict(Xtest), ac_loss[-1], ac_val_loss[-1])
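
# Illustrative usage of trainAutoencoder (a sketch; X_train/X_test are assumed
# to be fingerprint matrices from XfromInput, split beforehand):
#
#   Ztrain, Ztest, loss, val_loss = trainAutoencoder(
#       'model.checkpoint.AC.hdf5', X_train, X_test,
#       epochs=100, enc_dim=256, verbose=2, fold=1)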

# ------------------------------------------------------------------------------------- #

def plotTrainHistory(hist, target, fileAccuracy, fileLoss):
    """
    Plot the training performance in terms of accuracy and loss values for each epoch.
    :param hist: The history returned by model.fit function
    :param target: The name of the target of the model
    :param fileAccuracy: The filename for plotting accuracy values
    :param fileLoss: The filename for plotting loss values
    :return: none
    """

    # plot accuracy
    plt.figure()
    plt.plot(hist.history['accuracy'])
    if 'val_accuracy' in hist.history.keys():
        plt.plot(hist.history['val_accuracy'])
    plt.title('Model accuracy - ' + target)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    if 'val_accuracy' in hist.history.keys():
        plt.legend(['Train', 'Test'], loc='upper left')
    else:
        plt.legend(['Train'], loc='upper left')
    plt.savefig(fname=fileAccuracy, format='svg')

    # Plot training & validation loss values
    plt.figure()
    plt.plot(hist.history['loss'])
    plt.plot(hist.history['val_loss'])
    plt.title('Model loss - ' + target)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    #        plt.show()
    plt.savefig(fname=fileLoss, format='svg')
    plt.close()

# ------------------------------------------------------------------------------------- #

def plotAUC(fpr, tpr, auc, target, filename, title=""):

    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='Keras (area = {:.3f})'.format(auc))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve {target}')
    plt.legend(loc='best')
    plt.savefig(fname=filename, format='svg')
    plt.close()

# ------------------------------------------------------------------------------------- #

def plotHeatmap(matrix, filename, title=""):

    plt.figure()
    plt.imshow(matrix, cmap='Greys', interpolation='nearest')
    plt.title(title)
    plt.savefig(fname=filename, format='svg')
    plt.close()

# ------------------------------------------------------------------------------------- #

# plot model history more easily, see whyboris@github: https://gist.github.com/whyboris/91ee793ddc92cf1e824978cf31bb790c
# when plotting, smooth out the points by some factor (0.5 = rough, 0.99 = smooth)
# method taken from `Deep Learning with Python` by François Chollet

# 01 --------------------------------------------------------------------------------- #

def smooth_curve(points, factor=0.75):
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            smoothed_points.append(point)
    return smoothed_points
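
# Worked example of the smoothing above: with factor=0.75 each point becomes
# 0.75 * previous_smoothed + 0.25 * current, i.e. an exponential moving
# average; smooth_curve([0, 1, 1]) -> [0, 0.25, 0.4375].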

# 02 ---------------------------------------------------------------------------------- #

def set_plot_history_data(ax, history, which_graph):

    if which_graph == 'acc':
        train = smooth_curve(history.history['accuracy'])
        valid = smooth_curve(history.history['val_accuracy'])

    if which_graph == 'loss':
        train = smooth_curve(history.history['loss'])
        valid = smooth_curve(history.history['val_loss'])

    #plt.xkcd() # make plots look like xkcd

    epochs = range(1, len(train) + 1)

    trim = 0  # increase this to drop the first few epochs
    # when graphing loss the first few epochs may skew the (loss) graph

    ax.plot(epochs[trim:], train[trim:], 'dodgerblue', linewidth=15, alpha=0.1)
    ax.plot(epochs[trim:], train[trim:], 'dodgerblue', label=('Training'))

    ax.plot(epochs[trim:], valid[trim:], 'g', linewidth=15, alpha=0.1)
    ax.plot(epochs[trim:], valid[trim:], 'g', label=('Validation'))

# 03 ---------------------------------------------------------------------------------- #

def get_max_validation_accuracy(history):
    validation = smooth_curve(history.history['val_accuracy'])
    ymax = max(validation)
    return 'Max validation accuracy ≈ ' + str(round(ymax * 100, 1)) + '%'

def get_max_training_accuracy(history):
    training = smooth_curve(history.history['accuracy'])
    ymax = max(training)
    return 'Max training accuracy ≈ ' + str(round(ymax * 100, 1)) + '%'

# 04 ---------------------------------------------------------------------------------- #

def plot_history(history, file):
    fig, (ax1, ax2) = plt.subplots(nrows=2,
                                   ncols=1,
                                   figsize=(10, 6),
                                   sharex=True,
                                   gridspec_kw={'height_ratios': [5, 2]})

    set_plot_history_data(ax1, history, 'acc')

    set_plot_history_data(ax2, history, 'loss')

    # Accuracy graph
    ax1.set_ylabel('Accuracy')
    ax1.set_ylim(bottom=0.5, top=1)
    ax1.legend(loc="lower right")
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.xaxis.set_ticks_position('none')
    ax1.spines['bottom'].set_visible(False)

    # max accuracy text
    plt.text(0.5,
             0.6,
             get_max_validation_accuracy(history),
             horizontalalignment='right',
             verticalalignment='top',
             transform=ax1.transAxes,
             fontsize=12)
    plt.text(0.5,
             0.8,
             get_max_training_accuracy(history),
             horizontalalignment='right',
             verticalalignment='top',
             transform=ax1.transAxes,
             fontsize=12)

    # Loss graph
    ax2.set_ylabel('Loss')
    ax2.set_yticks([])
    ax2.set_xlabel('Epochs')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)

    plt.tight_layout()
    plt.savefig(fname=file, format='svg')
    plt.close()

# ------------------------------------------------------------------------------------- #

def drawHeatmap(data, anno):

    #(data=pd.DataFrame(Xt), anno = pd.DataFrame(Yt.astype(int)))

    # annotation bar colors
    my_ann_colors = dict(zip(anno[0].unique(), ["blue", "red"]))
    row_colors = anno[0].map(my_ann_colors)


    cl = sns.clustermap(data, metric="euclidean", method="single", z_score=None, standard_scale=None,
                        col_cluster=False, cmap="Greys", row_colors=row_colors, yticklabels=False)
    cl.fig.suptitle('Distributions of [1,0] in fingerprints of target: AR')

    # reference example adapted from https://python-graph-gallery.com:
    url = 'https://python-graph-gallery.com/wp-content/uploads/mtcars.csv'
    df = pd.read_csv(url)
    # set df index using existing columns
    df = df.set_index('model')
    # remove name of index column; numbered rows have been the index before, they are gone now
    df.index.name = None
    my_palette = dict(zip(df.cyl.unique(), ["orange", "yellow", "brown"]))
    row_colors = df.cyl.map(my_palette)

    # Default plot
    #sns.clustermap(df)

    # Clustermethods
    my_palette = dict()
    sns.clustermap(df, metric="correlation", standard_scale=1, method="single", cmap="Blues", row_colors=row_colors)


###############################################################################
# TRAIN FUNCTIONS --------------------------------------------------------- #

def parseInputTrain(parser):
    """
    Add the input arguments for training to the given parser.

    :param parser: The argparse parser to which the arguments are added
    :return: None. The parser is modified in place.
    """
#    parser = argparse.ArgumentParser(description='Train a DNN to associate chemical fingerprints with a (set of)'
#                                     'target(s). Trained models are saved to disk including fitted weights and '
#                                     'can be used in the deepFPlearn-Predict.py tool to make predictions.')
    parser.add_argument('-i', metavar='FILE', type=str,
                        help="The file containing the data for training in (unquoted) "
                             "comma separated CSV format. The first column contains the feature string in "
                             "form of a fingerprint or a SMILES (see -t option). "
                             "The remaining columns contain the outcome(s) (Y matrix). "
                             "A header is expected and respective column names are used "
                             "to refer to outcome(s) (target(s)).", required=True)
    parser.add_argument('-o', metavar='FILE', type=str,
                        help='Prefix of output file name. Trained model(s) and '
                             'respective stats will be returned in 2 output files '
                             'with this prefix. Default: prefix of input file name.')
    parser.add_argument('-t', metavar='STR', type=str, choices=['fp', 'smiles'],
                        help="Type of the chemical representation. Choices: 'fp', 'smiles'.",
                        default='smiles')
    parser.add_argument('-k', metavar='STR', type=str,
                        choices=['topological', 'MACCS'],  # , 'atompairs', 'torsions'],
                        help='The type of fingerprint to be generated/used in input file.',
                        default='topological')
    parser.add_argument('-s', type=int,
                        help='Size of fingerprint that should be generated.',
                        default=2048)
    parser.add_argument('-a', action='store_true',
                        help='Use autoencoder to reduce dimensionality of fingerprint. Default: not set.')
    parser.add_argument('-d', metavar='INT', type=int,
                        help='Size of encoded fingerprint (z-layer of autoencoder).',
                        default=256)
    parser.add_argument('-e', metavar='INT', type=int,
                        help='Number of epochs that should be trained.',
                        default=50)
    parser.add_argument('-p', metavar='FILE', type=str,
                        help="CSV file containing the parameters for the epochs per target model. "
                             "The target abbreviation should be the same as in the input file and "
                             "the columns/parameters are: \n\ttarget,batch_size,epochs,optimizer,activation. "
                             "Note that values from this file overwrite the -e option!")
    parser.add_argument('-m', action='store_true',
                        help='Train multi-label classification model in addition to the individual models.')
    parser.add_argument('-l', metavar='FLOAT', type=float,
                        help='Fraction of the dataset that should be used for testing. Value in [0,1].',
                        default=0.2)
    parser.add_argument('-K', metavar='INT', type=int,
                        help='K that is used for K-fold cross validation in the training procedure.',
                        default=5)
    parser.add_argument('-v', metavar='INT', type=int, choices=[0, 1, 2],
                        help='Verbosity level. 0: No additional output, 1: Some additional output, 2: full additional output',
                        default=2
                        )

#    return parser.parse_args()
#return parser
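
# Illustrative usage of parseInputTrain (a sketch; 'train.csv' is hypothetical):
#
#   parser = argparse.ArgumentParser(prog='deepFPlearn-Train')
#   parseInputTrain(parser)
#   args = parser.parse_args(['-i', 'train.csv', '-t', 'smiles', '-e', '100'])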

# ------------------------------------------------------------------------------------- #

def defineOutfileNames(pathprefix, mtype, target, fold):
    """
    This function returns the required paths for output files or directories.

    :param pathprefix: A file path prefix for all files.
    :param mtype: The model type. It is set by the trainNNmodels function with information on autoencoder or not,
    and if AC is used, then with its parameters.
    :param target: The name of the target.
    :param fold: Number of the current fold (used in the model name).

    :return: A tuple of 14 output file names.
    """

    modelname = mtype + '.' + target + '.Fold-' + str(fold)

    modelfilepathW = str(pathprefix) + '/model.' + modelname + '.weights.h5'
    modelfilepathM = str(pathprefix) + '/model.' + modelname + '.json'
    modelhistplotpathL = str(pathprefix) + '/model.' + modelname + '.loss.svg'
    modelhistplotpathA = str(pathprefix) + '/model.' + modelname + '.acc.svg'
    modelhistplotpath = str(pathprefix) + '/model.' + modelname + '.history.svg'
    modelhistcsvpath = str(pathprefix) + '/model.' + modelname + '.history.csv'
    modelvalidation = str(pathprefix) + '/model.' + modelname + '.validation.csv'
    modelAUCfile = str(pathprefix) + '/model.' + modelname + '.auc.svg'
    modelAUCfiledata = str(pathprefix) + '/model.' + modelname + '.auc.data.csv'
    outfilepath = str(pathprefix) + '/model.' + modelname + '.trainingResults.txt'
    checkpointpath = str(pathprefix) + '/model.' + modelname + '.checkpoint.model.hdf5'
    checkpointpathAC = str(pathprefix) + '/model.' + modelname + '.checkpoint.AC-model.hdf5'
    modelheatmapX = str(pathprefix) + '/model.' + modelname + '.heatmap.X.svg'
    modelheatmapZ = str(pathprefix) + '/model.' + modelname + '.AC.heatmap.Z.svg'

    return (modelfilepathW, modelfilepathM, modelhistplotpathL, modelhistplotpathA,
            modelhistplotpath, modelhistcsvpath, modelvalidation, modelAUCfile,
            modelAUCfiledata, outfilepath, checkpointpath, checkpointpathAC,
            modelheatmapX, modelheatmapZ)

# ------------------------------------------------------------------------------------- #

def eval01Distributions(Xt, Yt, y_train, y_test, verbosity=0):
    """
    Evaluate the percentage of '1' values in the outcome variable of the whole dataset and the split (train, test)
    datasets, and the percentage of 0 values in the feature matrix.

    :param Xt: The whole feature matrix
    :param Yt: The whole outcome vector
    :param y_train: The outcome vector of the training set
    :param y_test: The outcome vector of the test set
    :param verbosity: The verbosity level. Info is only printed if verbosity is not 0.

    :return: Nothing is returned.
    """

    if verbosity == 0:
        return
    else:
        unique, counts = np.unique(Yt, return_counts=True)
        perc = round(100 / len(Yt) * counts[1])
        print(f"[INFO:] Percentage of '1' values in outcome variable (whole dataset): {perc}\n")

        uniqueRtr, countsRtr = np.unique(y_train, return_counts=True)
        uniqueRte, countsRte = np.unique(y_test, return_counts=True)
        perc = round(100 / len(y_train) * countsRtr[1])
        print(f"[INFO:] Percentage of '1' values in training outcomes: {perc}\n")
        perc = round(100 / len(y_test) * countsRte[1])
        print(f"[INFO:] Percentage of '1' values in test outcomes: {perc}\n")