Commit 5f7d7f4

Optimizers
1 parent e3a4d62 commit 5f7d7f4

9 files changed: +51 -84 lines changed

mlfromscratch/deep_learning/layers.py
Lines changed: 10 additions & 1 deletion

@@ -11,21 +11,31 @@
 class Layer(object):
 
     def set_input_shape(self, shape):
+        """ Sets the shape that the layer expects of the input in the forward
+        pass method """
         self.input_shape = shape
 
     def layer_name(self):
+        """ The name of the layer. Used in model summary. """
         return self.__class__.__name__
 
     def parameters(self):
+        """ The number of trainable parameters used by the layer """
         return 0
 
     def forward_pass(self, X, training):
+        """ Propagates the signal forward in the network """
         raise NotImplementedError()
 
     def backward_pass(self, acc_grad):
+        """ Propagates the accumulated gradient backwards in the network.
+        If the layer has trainable weights, these weights are also tuned in this method.
+        As input (acc_grad) it receives the gradient with respect to the output of the layer
+        and returns the gradient with respect to the output of the previous layer. """
         raise NotImplementedError()
 
     def output_shape(self):
+        """ The shape of the output produced by forward_pass """
         raise NotImplementedError()
 
 
@@ -65,7 +75,6 @@ def forward_pass(self, X, training=True):
         return X.dot(self.W) + self.w0
 
     def backward_pass(self, acc_grad):
-
         # Save weights used during forward pass
         W = self.W
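
Illustration (not part of this commit): a minimal, hypothetical layer that follows the interface documented above. The class name and details are made up.

import numpy as np

class ExampleReLU(Layer):
    """ Hypothetical layer used only to illustrate the interface above """
    def forward_pass(self, X, training=True):
        # Cache the input so backward_pass can use it
        self.layer_input = X
        return np.maximum(0, X)

    def backward_pass(self, acc_grad):
        # No trainable weights; pass the gradient through where the input was positive
        return acc_grad * (self.layer_input > 0)

    def output_shape(self):
        # Element-wise operation, so the shape is unchanged
        return self.input_shape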

mlfromscratch/deep_learning/neural_network.py
Lines changed: 1 addition & 7 deletions

@@ -1,12 +1,6 @@
 from __future__ import print_function
-from sklearn import datasets
 from terminaltables import AsciiTable
-import sys
-import os
-import math
 import copy
-import pandas as pd
-import matplotlib.pyplot as plt
 import numpy as np
 import progressbar
 
@@ -134,7 +128,7 @@ def summary(self, name="Model Summary"):
 
         # Print network configuration table
         print (AsciiTable(table_data).table)
-
+
         print ("Total Parameters: %d\n" % tot_params)
 
     def predict(self, X):

mlfromscratch/deep_learning/optimizers.py
Lines changed: 27 additions & 40 deletions

@@ -1,22 +1,23 @@
 import numpy as np
 from mlfromscratch.utils.data_manipulation import make_diagonal, normalize
 
-# Optimizers for models that use gradient methods for finding the
+# Optimizers for models that use gradient based methods for finding the
 # weights that minimize the loss.
-# A good resource:
+# A great resource for understanding these methods:
 # http://sebastianruder.com/optimizing-gradient-descent/index.html
 
-class GradientDescent():
+class StochasticGradientDescent():
     def __init__(self, learning_rate=0.01, momentum=0):
         self.learning_rate = learning_rate
         self.momentum = momentum
-        self.w_updt = np.array([])
+        self.w_updt = None
 
     def update(self, w, grad_wrt_w):
-        if not self.w_updt.any():
+        # If not initialized
+        if self.w_updt is None:
             self.w_updt = np.zeros(np.shape(w))
         # Use momentum if set
-        self.w_updt = self.momentum * self.w_updt + grad_wrt_w
+        self.w_updt = self.momentum * self.w_updt + (1 - self.momentum) * grad_wrt_w
         # Move against the gradient to minimize loss
         return w - self.learning_rate * self.w_updt
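
As a rough standalone sketch of the new momentum rule above (an exponential moving average of gradients; the array values are made up):

import numpy as np

w = np.array([0.5, -0.3])            # current weights (illustrative)
grad_wrt_w = np.array([0.2, 0.1])    # gradient of the loss w.r.t. w
w_updt = np.zeros(np.shape(w))       # previous update, zero on the first step
momentum, learning_rate = 0.9, 0.01

# update = momentum * previous_update + (1 - momentum) * gradient
w_updt = momentum * w_updt + (1 - momentum) * grad_wrt_w
w = w - learning_rate * w_updt       # move against the gradient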

@@ -40,43 +41,35 @@ def update(self, w, grad_func):
 class Adagrad():
     def __init__(self, learning_rate=0.01):
         self.learning_rate = learning_rate
-        self.G = np.array([]) # Sum of squares of the gradients
+        self.G = None # Sum of squares of the gradients
         self.eps = 1e-8
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
         # If not initialized
-        if not self.G.any():
+        if self.G is None:
             self.G = np.zeros(np.shape(w))
         # Add the square of the gradient of the loss function at w
-        self.G += np.power(grad_at_w, 2)
+        self.G += np.power(grad_wrt_w, 2)
         # Adaptive gradient with higher learning rate for sparse data
-        w_updt = self.learning_rate / np.sqrt(self.G + self.eps).T * grad_at_w
-
-        return w - w_updt
-
+        return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.eps)
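
Adagrad scales each weight's step by the square root of its accumulated squared gradients, so frequently updated weights take smaller steps. A rough element-wise sketch of the rule above (values are illustrative):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
G = np.zeros(np.shape(w))            # running sum of squared gradients
learning_rate, eps = 0.01, 1e-8

G += np.power(grad_wrt_w, 2)
# Per-weight step size: learning_rate / sqrt(G + eps), applied element-wise
w = w - learning_rate * grad_wrt_w / np.sqrt(G + eps)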

 class Adadelta():
     def __init__(self, rho=0.95, eps=1e-6):
-        self.E_w_updt = np.array([]) # Running average of squared parameter updates
-        self.E_grad = np.array([])   # Running average of the squared gradient of w
-        self.w_updt = np.array([])   # Parameter update
+        self.E_w_updt = None # Running average of squared parameter updates
+        self.E_grad = None   # Running average of the squared gradient of w
+        self.w_updt = None   # Parameter update
         self.eps = eps
         self.rho = rho
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
-
         # If not initialized
-        if not self.w_updt.any():
+        if self.w_updt is None:
             self.w_updt = np.zeros(np.shape(w))
             self.E_w_updt = np.zeros(np.shape(w))
-            self.E_grad = np.zeros(np.shape(grad_at_w))
+            self.E_grad = np.zeros(np.shape(grad_wrt_w))
 
         # Update average of gradients at w
-        self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_at_w, 2)
+        self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2)
 
         RMS_delta_w = np.sqrt(self.E_w_updt + self.eps)
         RMS_grad = np.sqrt(self.E_grad + self.eps)
@@ -85,7 +78,7 @@ def update(self, w, grad_wrt_w):
         adaptive_lr = RMS_delta_w / RMS_grad
 
         # Calculate the update
-        self.w_updt = adaptive_lr * grad_at_w
+        self.w_updt = adaptive_lr * grad_wrt_w
 
         # Update the running average of w updates
         self.E_w_updt = self.rho * self.E_w_updt + (1 - self.rho) * np.power(self.w_updt, 2)
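
Adadelta replaces the global learning rate with the ratio of two running RMS estimates: RMS of past parameter updates over RMS of recent gradients. A compact sketch of one step, assuming the final w - w_updt return that the other optimizers use (it falls outside the hunk shown):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
E_grad = np.zeros(np.shape(w))       # running average of squared gradients
E_w_updt = np.zeros(np.shape(w))     # running average of squared updates
rho, eps = 0.95, 1e-6

E_grad = rho * E_grad + (1 - rho) * np.power(grad_wrt_w, 2)
RMS_delta_w = np.sqrt(E_w_updt + eps)
RMS_grad = np.sqrt(E_grad + eps)
w_updt = (RMS_delta_w / RMS_grad) * grad_wrt_w
E_w_updt = rho * E_w_updt + (1 - rho) * np.power(w_updt, 2)
w = w - w_updt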
@@ -95,40 +88,34 @@ def update(self, w, grad_wrt_w):
 class RMSprop():
     def __init__(self, learning_rate=0.01, rho=0.9):
         self.learning_rate = learning_rate
-        self.Eg = np.array([]) # Running average of the square gradients at w
+        self.Eg = None # Running average of the square gradients at w
         self.eps = 1e-8
         self.rho = rho
 
     def update(self, w, grad_wrt_w):
-        # Gradient clipping to avoid exploding grads
-        grad_at_w = np.clip(grad_wrt_w, -1, 1)
-
         # If not initialized
-        if not self.Eg.any():
-            self.Eg = np.zeros(np.shape(grad_at_w))
+        if self.Eg is None:
+            self.Eg = np.zeros(np.shape(grad_wrt_w))
 
-        self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_at_w, 2)
+        self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_wrt_w, 2)
 
         # Divide the learning rate for a weight by a running average of the magnitudes of recent
         # gradients for that weight
-        self.w_updt = self.learning_rate * np.linalg.pinv(np.sqrt(self.Eg + self.eps)).T * grad_at_w
-
-        return w - self.w_updt
+        return w - self.learning_rate * grad_wrt_w / np.sqrt(self.Eg + self.eps)
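
RMSprop keeps a decaying average of squared gradients rather than Adagrad's unbounded sum, and the update is now a plain element-wise division instead of the earlier np.linalg.pinv expression. A short sketch of the rule (values are illustrative):

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
Eg = np.zeros(np.shape(w))           # running average of squared gradients
learning_rate, rho, eps = 0.01, 0.9, 1e-8

Eg = rho * Eg + (1 - rho) * np.power(grad_wrt_w, 2)
w = w - learning_rate * grad_wrt_w / np.sqrt(Eg + eps)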

 class Adam():
     def __init__(self, learning_rate=0.001, b1=0.9, b2=0.999):
         self.learning_rate = learning_rate
         self.eps = 1e-8
-        self.m = np.array([])
-        self.v = np.array([])
+        self.m = None
+        self.v = None
         # Decay rates
         self.b1 = b1
         self.b2 = b2
 
     def update(self, w, grad_wrt_w):
-
         # If not initialized
-        if not self.m.any():
+        if self.m is None:
             self.m = np.zeros(np.shape(grad_wrt_w))
             self.v = np.zeros(np.shape(grad_wrt_w))

@@ -138,7 +125,7 @@ def update(self, w, grad_wrt_w):
         m_hat = self.m / (1 - self.b1)
         v_hat = self.v / (1 - self.b2)
 
-        self.w_updt = self.learning_rate / (np.sqrt(v_hat) + self.eps) * m_hat
+        self.w_updt = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)
 
         return w - self.w_updt
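
Adam combines a momentum-like first moment (m) with an RMSprop-like second moment (v). A rough sketch of one step as written in this file, assuming the usual exponential moving averages for m and v computed just above the hunk; note the bias correction here uses constant factors (1 - b1) and (1 - b2) rather than the per-step powers from the original paper:

import numpy as np

w = np.array([0.5, -0.3])
grad_wrt_w = np.array([0.2, 0.1])
m = np.zeros(np.shape(w))            # first moment estimate
v = np.zeros(np.shape(w))            # second moment estimate
learning_rate, b1, b2, eps = 0.001, 0.9, 0.999, 1e-8

m = b1 * m + (1 - b1) * grad_wrt_w
v = b2 * v + (1 - b2) * np.power(grad_wrt_w, 2)
m_hat = m / (1 - b1)
v_hat = v / (1 - b2)
w = w - learning_rate * m_hat / (np.sqrt(v_hat) + eps)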

mlfromscratch/examples/convolutional_neural_network.py
Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
 from mlfromscratch.utils.data_operation import accuracy_score
-from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
+from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy
 from mlfromscratch.utils.misc import bar_widgets
 from mlfromscratch.utils import Plot

@@ -25,7 +25,7 @@ def main():
     # Conv Net
     #----------
 
-    optimizer = Adam()
+    optimizer = Adadelta()
 
     data = datasets.load_digits()
     X = data.data

mlfromscratch/examples/multilayer_perceptron.py
Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
 from mlfromscratch.utils.data_operation import accuracy_score
-from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
+from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy
 from mlfromscratch.utils.misc import bar_widgets
 from mlfromscratch.utils import Plot

@@ -19,7 +19,7 @@
 
 def main():
 
-    optimizer = Adam()
+    optimizer = Adadelta()
 
     #-----
     # MLP

mlfromscratch/supervised_learning/logistic_regression.py
Lines changed: 2 additions & 11 deletions

@@ -1,18 +1,9 @@
 from __future__ import print_function
-import sys
-import os
-import math
-from sklearn import datasets
 import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
 
 # Import helper functions
-from mlfromscratch.utils.data_manipulation import make_diagonal, normalize, train_test_split
-from mlfromscratch.utils.data_operation import accuracy_score
+from mlfromscratch.utils.data_manipulation import make_diagonal
 from mlfromscratch.utils.activation_functions import Sigmoid
-from mlfromscratch.utils.optimizers import GradientDescent
-from mlfromscratch.unsupervised_learning import PCA
 from mlfromscratch.utils import Plot
 
 
@@ -42,7 +33,7 @@ def fit(self, X, y, n_iterations=4000):
         # Initialize parameters between [-1/sqrt(N), 1/sqrt(N)]
         limit = 1 / math.sqrt(n_features)
         self.param = np.random.uniform(-limit, limit, (n_features,))
-
+
         # Tune parameters for n iterations
         for i in range(n_iterations):
            # Make a new prediction
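
Note that fit (shown above) still uses math.sqrt while this diff removes import math from the file, so the code as committed appears to rely on that import staying or would need an equivalent such as np.sqrt. A small numpy-only sketch of the same initialization (n_features value is made up):

import numpy as np

n_features = 4                                   # illustrative value
limit = 1 / np.sqrt(n_features)                  # same bound without the math module
param = np.random.uniform(-limit, limit, (n_features,))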

mlfromscratch/supervised_learning/naive_bayes.py
Lines changed: 7 additions & 11 deletions

@@ -37,7 +37,7 @@ def fit(self, X, y):
             parameters = {"mean": col.mean(), "var": col.var()}
             self.parameters[i].append(parameters)
 
-    def _calculate_probability(self, mean, var, x):
+    def _calculate_likelihood(self, mean, var, x):
         """ Gaussian likelihood of the data x given mean and var """
         coeff = (1.0 / (math.sqrt((2.0 * math.pi) * var)))
         exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var)))

@@ -54,7 +54,7 @@ def _calculate_prior(self, c):
 
     def _classify(self, sample):
         """ Classify using Bayes Rule, P(Y|X) = P(X|Y)*P(Y)/P(X)
-        P(X|Y) - Probability. Gaussian distribution (given by _calculate_probability)
+        P(X|Y) - Likelihood. Gaussian distribution (given by _calculate_likelihood)
         P(Y) - Prior (given by _calculate_prior)
         P(X) - Scales the posterior to make it a proper probability distribution.
                This term is ignored in this implementation since it doesn't affect

@@ -65,24 +65,20 @@ def _classify(self, sample):
         # Go through list of classes
         for i in range(len(self.classes)):
             c = self.classes[i]
-            prior = self._calculate_prior(c)
-            posterior = prior
-            # multiply with the additional probabilties
+            posterior = self._calculate_prior(c)
             # Naive assumption (independence):
             # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
+            # Multiply with the class likelihoods
             for j, params in enumerate(self.parameters[i]):
                 sample_feature = sample[j]
                 # Determine P(x|Y)
-                likelihood = self._calculate_probability(params["mean"], params["var"], sample_feature)
-                # Multiply with the rest
+                likelihood = self._calculate_likelihood(params["mean"], params["var"], sample_feature)
+                # Multiply with the accumulated probability
                 posterior *= likelihood
             # Total posterior = P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
             posteriors.append(posterior)
-        # Get the largest probability and return the class corresponding
-        # to that probability
+        # Return the class with the largest posterior probability
         index_of_max = np.argmax(posteriors)
-        max_value = posteriors[index_of_max]
-
         return self.classes[index_of_max]
 
     def predict(self, X):
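
The classification rule documented above multiplies the class prior by the Gaussian likelihood of each feature and picks the class with the largest unnormalized posterior. A small self-contained sketch of that computation, with made-up class statistics:

import numpy as np

def gaussian_likelihood(mean, var, x, eps=1e-8):
    # P(x | Y): Gaussian density of feature values x for one class
    coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
    return coeff * np.exp(-((x - mean) ** 2) / (2 * var + eps))

sample = np.array([1.2, 0.4])
priors = [0.5, 0.5]                                   # P(Y) per class (illustrative)
means = [np.array([1.0, 0.5]), np.array([2.0, -0.3])]
variances = [np.array([0.2, 0.1]), np.array([0.3, 0.2])]

posteriors = []
for prior, mean, var in zip(priors, means, variances):
    # Naive assumption: features are independent given the class
    posterior = prior * np.prod(gaussian_likelihood(mean, var, sample))
    posteriors.append(posterior)

predicted_class = int(np.argmax(posteriors))          # class with largest posterior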

mlfromscratch/supervised_learning/perceptron.py
Lines changed: 0 additions & 4 deletions

@@ -10,7 +10,6 @@
 from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
 from mlfromscratch.utils.data_operation import accuracy_score
 from mlfromscratch.deep_learning.activation_functions import Sigmoid, ReLU, SoftPlus, LeakyReLU, TanH, ELU
-from mlfromscratch.deep_learning.optimizers import GradientDescent
 from mlfromscratch.deep_learning.loss_functions import CrossEntropy, SquareLoss
 from mlfromscratch.utils import Plot
 

@@ -52,14 +51,11 @@ def fit(self, X, y):
             # Calculate outputs
             linear_output = X.dot(self.W) + self.w0
             y_pred = self.activation.function(linear_output)
-
             # Calculate the loss gradient w.r.t the input of the activation function
             error_gradient = self.loss.gradient(y, y_pred) * self.activation.gradient(linear_output)
-
             # Calculate the gradient of the loss with respect to each weight
             grad_wrt_w = X.T.dot(error_gradient)
             grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)
-
             # Update weights
             self.W -= self.learning_rate * grad_wrt_w
             self.w0 -= self.learning_rate * grad_wrt_w0
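
For reference, the update above is plain gradient descent on a single-layer network: the error gradient at the activation input is pushed back to the weights with the chain rule. A compact sketch with a sigmoid activation and squared loss (shapes and values are illustrative):

import numpy as np

X = np.array([[0.0, 1.0], [1.0, 0.0]])        # 2 samples, 2 features
y = np.array([[1.0], [0.0]])                  # targets
W = np.random.randn(2, 1) * 0.1               # weights
w0 = np.zeros((1, 1))                         # bias
learning_rate = 0.01

sigmoid = lambda z: 1 / (1 + np.exp(-z))

linear_output = X.dot(W) + w0
y_pred = sigmoid(linear_output)

# dL/dz = dL/dy_pred * dy_pred/dz  (squared loss and sigmoid gradient)
error_gradient = (y_pred - y) * y_pred * (1 - y_pred)

grad_wrt_w = X.T.dot(error_gradient)
grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)

W -= learning_rate * grad_wrt_w
w0 -= learning_rate * grad_wrt_w0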

mlfromscratch/supervised_learning/regression.py
Lines changed: 0 additions & 6 deletions

@@ -1,11 +1,5 @@
 from __future__ import print_function
-import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
-from sklearn import datasets
-import sys
-import os
-import math
 # Import helper functions
 from mlfromscratch.utils.data_manipulation import normalize
 from mlfromscratch.utils.data_manipulation import polynomial_features
