Commit 5f7d7f4

Optimizers
1 parent e3a4d62 commit 5f7d7f4

9 files changed: +51 -84 lines changed

mlfromscratch/deep_learning/layers.py
Lines changed: 10 additions & 1 deletion

@@ -11,21 +11,31 @@
 class Layer(object):
 
     def set_input_shape(self, shape):
+        """ Sets the shape that the layer expects of the input in the forward
+        pass method """
         self.input_shape = shape
 
     def layer_name(self):
+        """ The name of the layer. Used in model summary. """
         return self.__class__.__name__
 
     def parameters(self):
+        """ The number of trainable parameters used by the layer """
         return 0
 
     def forward_pass(self, X, training):
+        """ Propagates the signal forward in the network """
         raise NotImplementedError()
 
     def backward_pass(self, acc_grad):
+        """ Propagates the accumulated gradient backwards in the network.
+        If the layer has trainable weights, these weights are also tuned in this method.
+        As input (acc_grad) it receives the gradient with respect to the output of the layer
+        and returns the gradient with respect to the output of the previous layer. """
         raise NotImplementedError()
 
     def output_shape(self):
+        """ The shape of the output produced by forward_pass """
         raise NotImplementedError()
 
 
@@ -65,7 +75,6 @@ def forward_pass(self, X, training=True):
         return X.dot(self.W) + self.w0
 
     def backward_pass(self, acc_grad):
-
         # Save weights used during forward pass
         W = self.W
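
Illustration (not part of this commit): a minimal, hypothetical layer that follows the interface documented above. The class name and details are made up.

import numpy as np

class ExampleReLU(Layer):
    """ Hypothetical layer used only to illustrate the interface above """
    def forward_pass(self, X, training=True):
        # Cache the input so backward_pass can use it
        self.layer_input = X
        return np.maximum(0, X)

    def backward_pass(self, acc_grad):
        # No trainable weights; pass the gradient through where the input was positive
        return acc_grad * (self.layer_input > 0)

    def output_shape(self):
        # Element-wise operation, so the shape is unchanged
        return self.input_shape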

mlfromscratch/deep_learning/neural_network.py
Lines changed: 1 addition & 7 deletions

@@ -1,12 +1,6 @@
 from __future__ import print_function
-from sklearn import datasets
 from terminaltables import AsciiTable
-import sys
-import os
-import math
 import copy
-import pandas as pd
-import matplotlib.pyplot as plt
 import numpy as np
 import progressbar
 
@@ -134,7 +128,7 @@ def summary(self, name="Model Summary"):
 
         # Print network configuration table
         print (AsciiTable(table_data).table)
-
+
         print ("Total Parameters: %d\n" % tot_params)
 
     def predict(self, X):

mlfromscratch/deep_learning/optimizers.py
Lines changed: 27 additions & 40 deletions

@@ -1,22 +1,23 @@
 import numpy as np
 from mlfromscratch.utils.data_manipulation import make_diagonal, normalize
 
-# Optimizers for models that use gradient methods for finding the
+# Optimizers for models that use gradient based methods for finding the
 # weights that minimize the loss.
-# A good resource:
+# A great resource for understanding these methods:
 # http://sebastianruder.com/optimizing-gradient-descent/index.html
 
-class GradientDescent():
+class StochasticGradientDescent():
     def __init__(self, learning_rate=0.01, momentum=0):
         self.learning_rate = learning_rate
         self.momentum = momentum
-        self.w_updt = np.array([])
+        self.w_updt = None
 
     def update(self, w, grad_wrt_w):
-        if not self.w_updt.any():
+        # If not initialized
+        if self.w_updt is None:
             self.w_updt = np.zeros(np.shape(w))
         # Use momentum if set
-        self.w_updt = self.momentum * self.w_updt + grad_wrt_w
+        self.w_updt = self.momentum * self.w_updt + (1 - self.momentum) * grad_wrt_w
         # Move against the gradient to minimize loss
         return w - self.learning_rate * self.w_updt
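
As a rough standalone sketch of the new momentum rule above (an exponential moving average of gradients; the array values are made up):

import numpy as np

w = np.array([0.5, -0.3])            # current weights (illustrative)
grad_wrt_w = np.array([0.2, 0.1])    # gradient of the loss w.r.t. w
w_updt = np.zeros(np.shape(w))       # previous update, zero on the first step
momentum, learning_rate = 0.9, 0.01

# update = momentum * previous_update + (1 - momentum) * gradient
w_updt = momentum * w_updt + (1 - momentum) * grad_wrt_w
w = w - learning_rate * w_updt       # move against the gradient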

@@ -40,43 +41,35 @@ def update(self, w, grad_func):
 class Adagrad():
     def __init__(self, learning_rate=0.01):
         self.learning_rate = learning_rate
-        self.G = np.array([]) # Sum of squares of the gradients
+        self.G = None # Sum of squares of the gradients
         self.eps = 1e-8
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
         # If not initialized
-        if not self.G.any():
+        if self.G is None:
             self.G = np.zeros(np.shape(w))
         # Add the square of the gradient of the loss function at w
-        self.G += np.power(grad_at_w, 2)
+        self.G += np.power(grad_wrt_w, 2)
         # Adaptive gradient with higher learning rate for sparse data
-        w_updt = self.learning_rate / np.sqrt(self.G + self.eps).T * grad_at_w
-
-        return w - w_updt
-
+        return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.eps)
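
Adagrad scales each weight's step by the square root of its accumulated squared gradients, so frequently updated weights take smaller steps. A rough element-wise sketch of the rule above (values are illustrative):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
G = np.zeros(np.shape(w))            # running sum of squared gradients
learning_rate, eps = 0.01, 1e-8

G += np.power(grad_wrt_w, 2)
# Per-weight step size: learning_rate / sqrt(G + eps), applied element-wise
w = w - learning_rate * grad_wrt_w / np.sqrt(G + eps)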

 class Adadelta():
     def __init__(self, rho=0.95, eps=1e-6):
-        self.E_w_updt = np.array([]) # Running average of squared parameter updates
-        self.E_grad = np.array([])   # Running average of the squared gradient of w
-        self.w_updt = np.array([])   # Parameter update
+        self.E_w_updt = None # Running average of squared parameter updates
+        self.E_grad = None   # Running average of the squared gradient of w
+        self.w_updt = None   # Parameter update
         self.eps = eps
         self.rho = rho
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
-
         # If not initialized
-        if not self.w_updt.any():
+        if self.w_updt is None:
             self.w_updt = np.zeros(np.shape(w))
             self.E_w_updt = np.zeros(np.shape(w))
-            self.E_grad = np.zeros(np.shape(grad_at_w))
+            self.E_grad = np.zeros(np.shape(grad_wrt_w))
 
         # Update average of gradients at w
-        self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_at_w, 2)
+        self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2)
 
         RMS_delta_w = np.sqrt(self.E_w_updt + self.eps)
         RMS_grad = np.sqrt(self.E_grad + self.eps)
@@ -85,7 +78,7 @@ def update(self, w, grad_wrt_w):
         adaptive_lr = RMS_delta_w / RMS_grad
 
         # Calculate the update
-        self.w_updt = adaptive_lr * grad_at_w
+        self.w_updt = adaptive_lr * grad_wrt_w
 
         # Update the running average of w updates
         self.E_w_updt = self.rho * self.E_w_updt + (1 - self.rho) * np.power(self.w_updt, 2)
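
Adadelta replaces the global learning rate with the ratio of two running RMS estimates: RMS of past parameter updates over RMS of recent gradients. A compact sketch of one step, assuming the final w - w_updt return that the other optimizers use (it falls outside the hunk shown):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
E_grad = np.zeros(np.shape(w))       # running average of squared gradients
E_w_updt = np.zeros(np.shape(w))     # running average of squared updates
rho, eps = 0.95, 1e-6

E_grad = rho * E_grad + (1 - rho) * np.power(grad_wrt_w, 2)
RMS_delta_w = np.sqrt(E_w_updt + eps)
RMS_grad = np.sqrt(E_grad + eps)
w_updt = (RMS_delta_w / RMS_grad) * grad_wrt_w
E_w_updt = rho * E_w_updt + (1 - rho) * np.power(w_updt, 2)
w = w - w_updt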
@@ -95,40 +88,34 @@ def update(self, w, grad_wrt_w):
 class RMSprop():
     def __init__(self, learning_rate=0.01, rho=0.9):
         self.learning_rate = learning_rate
-        self.Eg = np.array([]) # Running average of the square gradients at w
+        self.Eg = None # Running average of the square gradients at w
         self.eps = 1e-8
         self.rho = rho
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
-
         # If not initialized
-        if not self.Eg.any():
-            self.Eg = np.zeros(np.shape(grad_at_w))
+        if self.Eg is None:
+            self.Eg = np.zeros(np.shape(grad_wrt_w))
 
-        self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_at_w, 2)
+        self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_wrt_w, 2)
 
         # Divide the learning rate for a weight by a running average of the magnitudes of recent
         # gradients for that weight
-        self.w_updt = self.learning_rate * np.linalg.pinv(np.sqrt(self.Eg + self.eps)).T * grad_at_w
-
-        return w - self.w_updt
+        return w - self.learning_rate * grad_wrt_w / np.sqrt(self.Eg + self.eps)
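
RMSprop keeps a decaying average of squared gradients rather than Adagrad's unbounded sum, and the update is now a plain element-wise division instead of the earlier np.linalg.pinv expression. A short sketch of the rule (values are illustrative):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
Eg = np.zeros(np.shape(w))           # running average of squared gradients
learning_rate, rho, eps = 0.01, 0.9, 1e-8

Eg = rho * Eg + (1 - rho) * np.power(grad_wrt_w, 2)
w = w - learning_rate * grad_wrt_w / np.sqrt(Eg + eps)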

 class Adam():
     def __init__(self, learning_rate=0.001, b1=0.9, b2=0.999):
         self.learning_rate = learning_rate
         self.eps = 1e-8
-        self.m = np.array([])
-        self.v = np.array([])
+        self.m = None
+        self.v = None
         # Decay rates
         self.b1 = b1
         self.b2 = b2
 
     def update(self, w, grad_wrt_w):
-
         # If not initialized
-        if not self.m.any():
+        if self.m is None:
             self.m = np.zeros(np.shape(grad_wrt_w))
             self.v = np.zeros(np.shape(grad_wrt_w))

@@ -138,7 +125,7 @@ def update(self, w, grad_wrt_w):
         m_hat = self.m / (1 - self.b1)
         v_hat = self.v / (1 - self.b2)
 
-        self.w_updt = self.learning_rate / (np.sqrt(v_hat) + self.eps) * m_hat
+        self.w_updt = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)
 
         return w - self.w_updt
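
Adam combines a momentum-like first moment (m) with an RMSprop-like second moment (v). A rough sketch of one step as written in this file, assuming the usual exponential moving averages for m and v computed just above the hunk; note the bias correction here uses constant factors (1 - b1) and (1 - b2) rather than the per-step powers from the original paper:

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
m = np.zeros(np.shape(w))            # first moment estimate
v = np.zeros(np.shape(w))            # second moment estimate
learning_rate, b1, b2, eps = 0.001, 0.9, 0.999, 1e-8

m = b1 * m + (1 - b1) * grad_wrt_w
v = b2 * v + (1 - b2) * np.power(grad_wrt_w, 2)
m_hat = m / (1 - b1)
v_hat = v / (1 - b2)
w = w - learning_rate * m_hat / (np.sqrt(v_hat) + eps)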

mlfromscratch/examples/convolutional_neural_network.py
Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
 from mlfromscratch.utils.data_operation import accuracy_score
-from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
+from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy
 from mlfromscratch.utils.misc import bar_widgets
 from mlfromscratch.utils import Plot

@@ -25,7 +25,7 @@ def main():
     # Conv Net
     #----------
 
-    optimizer = Adam()
+    optimizer = Adadelta()
 
     data = datasets.load_digits()
     X = data.data

mlfromscratch/examples/multilayer_perceptron.py
Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
 from mlfromscratch.utils.data_operation import accuracy_score
-from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
+from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy
 from mlfromscratch.utils.misc import bar_widgets
 from mlfromscratch.utils import Plot

@@ -19,7 +19,7 @@
 
 def main():
 
-    optimizer = Adam()
+    optimizer = Adadelta()
 
     #-----
     # MLP

mlfromscratch/supervised_learning/logistic_regression.py
Lines changed: 2 additions & 11 deletions

@@ -1,18 +1,9 @@
 from __future__ import print_function
-import sys
-import os
-import math
-from sklearn import datasets
 import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
 
 # Import helper functions
-from mlfromscratch.utils.data_manipulation import make_diagonal, normalize, train_test_split
-from mlfromscratch.utils.data_operation import accuracy_score
+from mlfromscratch.utils.data_manipulation import make_diagonal
 from mlfromscratch.utils.activation_functions import Sigmoid
-from mlfromscratch.utils.optimizers import GradientDescent
-from mlfromscratch.unsupervised_learning import PCA
 from mlfromscratch.utils import Plot
 
 
@@ -42,7 +33,7 @@ def fit(self, X, y, n_iterations=4000):
         # Initialize parameters between [-1/sqrt(N), 1/sqrt(N)]
         limit = 1 / math.sqrt(n_features)
         self.param = np.random.uniform(-limit, limit, (n_features,))
-
+
         # Tune parameters for n iterations
         for i in range(n_iterations):
            # Make a new prediction
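
Note that fit (shown above) still uses math.sqrt while this diff removes import math from the file, so the code as committed appears to rely on that import staying or would need an equivalent such as np.sqrt. A small numpy-only sketch of the same initialization (n_features value is made up):

import numpy as np

n_features = 4                                   # illustrative value
limit = 1 / np.sqrt(n_features)                  # same bound without the math module
param = np.random.uniform(-limit, limit, (n_features,))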

mlfromscratch/supervised_learning/naive_bayes.py
Lines changed: 7 additions & 11 deletions

@@ -37,7 +37,7 @@ def fit(self, X, y):
             parameters = {"mean": col.mean(), "var": col.var()}
             self.parameters[i].append(parameters)
 
-    def _calculate_probability(self, mean, var, x):
+    def _calculate_likelihood(self, mean, var, x):
         """ Gaussian likelihood of the data x given mean and var """
         coeff = (1.0 / (math.sqrt((2.0 * math.pi) * var)))
         exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var)))

@@ -54,7 +54,7 @@ def _calculate_prior(self, c):
 
     def _classify(self, sample):
         """ Classify using Bayes Rule, P(Y|X) = P(X|Y)*P(Y)/P(X)
-        P(X|Y) - Probability. Gaussian distribution (given by _calculate_probability)
+        P(X|Y) - Likelihood. Gaussian distribution (given by _calculate_likelihood)
         P(Y) - Prior (given by _calculate_prior)
         P(X) - Scales the posterior to make it a proper probability distribution.
                This term is ignored in this implementation since it doesn't affect

@@ -65,24 +65,20 @@ def _classify(self, sample):
         # Go through list of classes
         for i in range(len(self.classes)):
             c = self.classes[i]
-            prior = self._calculate_prior(c)
-            posterior = prior
-            # multiply with the additional probabilties
+            posterior = self._calculate_prior(c)
             # Naive assumption (independence):
             # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
+            # Multiply with the class likelihoods
             for j, params in enumerate(self.parameters[i]):
                 sample_feature = sample[j]
                 # Determine P(x|Y)
-                likelihood = self._calculate_probability(params["mean"], params["var"], sample_feature)
-                # Multiply with the rest
+                likelihood = self._calculate_likelihood(params["mean"], params["var"], sample_feature)
+                # Multiply with the accumulated probability
                 posterior *= likelihood
             # Total posterior = P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
             posteriors.append(posterior)
-        # Get the largest probability and return the class corresponding
-        # to that probability
+        # Return the class with the largest posterior probability
         index_of_max = np.argmax(posteriors)
-        max_value = posteriors[index_of_max]
-
         return self.classes[index_of_max]
 
     def predict(self, X):
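
The classification rule documented above multiplies the class prior by the Gaussian likelihood of each feature and picks the class with the largest unnormalized posterior. A small self-contained sketch of that computation, with made-up class statistics:

import numpy as np

def gaussian_likelihood(mean, var, x, eps=1e-8):
    # P(x | Y): Gaussian density of feature values x for one class
    coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
    return coeff * np.exp(-((x - mean) ** 2) / (2 * var + eps))

sample = np.array([1.2, 0.4])
priors = [0.5, 0.5]                                   # P(Y) per class (illustrative)
means = [np.array([1.0, 0.5]), np.array([2.0, -0.3])]
variances = [np.array([0.2, 0.1]), np.array([0.3, 0.2])]

posteriors = []
for prior, mean, var in zip(priors, means, variances):
    # Naive assumption: features are independent given the class
    posterior = prior * np.prod(gaussian_likelihood(mean, var, sample))
    posteriors.append(posterior)

predicted_class = int(np.argmax(posteriors))          # class with largest posterior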

mlfromscratch/supervised_learning/perceptron.py
Lines changed: 0 additions & 4 deletions

@@ -10,7 +10,6 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_operation import accuracy_score
 from mlfromscratch.deep_learning.activation_functions import Sigmoid, ReLU, SoftPlus, LeakyReLU, TanH, ELU
-from mlfromscratch.deep_learning.optimizers import GradientDescent
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy, SquareLoss
 from mlfromscratch.utils import Plot
 

@@ -52,14 +51,11 @@ def fit(self, X, y):
             # Calculate outputs
             linear_output = X.dot(self.W) + self.w0
             y_pred = self.activation.function(linear_output)
-
             # Calculate the loss gradient w.r.t the input of the activation function
             error_gradient = self.loss.gradient(y, y_pred) * self.activation.gradient(linear_output)
-
             # Calculate the gradient of the loss with respect to each weight
             grad_wrt_w = X.T.dot(error_gradient)
             grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)
-
             # Update weights
             self.W -= self.learning_rate * grad_wrt_w
             self.w0 -= self.learning_rate * grad_wrt_w0
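
For reference, the update above is plain gradient descent on a single-layer network: the error gradient at the activation input is pushed back to the weights with the chain rule. A compact sketch with a sigmoid activation and squared loss (shapes and values are illustrative):

import numpy as np

X = np.array([[0.0, 1.0], [1.0, 0.0]])        # 2 samples, 2 features
y = np.array([[1.0], [0.0]])                  # targets
W = np.random.randn(2, 1) * 0.1               # weights
w0 = np.zeros((1, 1))                         # bias
learning_rate = 0.01

sigmoid = lambda z: 1 / (1 + np.exp(-z))

linear_output = X.dot(W) + w0
y_pred = sigmoid(linear_output)

# dL/dz = dL/dy_pred * dy_pred/dz  (squared loss and sigmoid gradient)
error_gradient = (y_pred - y) * y_pred * (1 - y_pred)

grad_wrt_w = X.T.dot(error_gradient)
grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)

W -= learning_rate * grad_wrt_w
w0 -= learning_rate * grad_wrt_w0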

mlfromscratch/supervised_learning/regression.py
Lines changed: 0 additions & 6 deletions

@@ -1,11 +1,5 @@
 from __future__ import print_function
-import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
-from sklearn import datasets
-import sys
-import os
-import math
 # Import helper functions
 from mlfromscratch.utils.data_manipulation import normalize
 from mlfromscratch.utils.data_manipulation import polynomial_features
