Added mnist.py

Hvass-Labs · Hvass-Labs · commit 60c5c46d8223 · 2018-07-16T15:14:02.000+02:00
diff --git a/mnist.py b/mnist.py
@@ -0,0 +1,186 @@
+########################################################################
+#
+# Downloads the MNIST data-set for recognizing hand-written digits.
+#
+# Implemented in Python 3.6
+#
+# Usage:
+# 1) Create a new object instance: data = MNIST(data_dir="data/MNIST/")
+#    This automatically downloads the files to the given dir.
+# 2) Use the training-set as data.x_train, data.y_train and data.y_train_cls
+# 3) Get random batches of training data using data.random_batch()
+# 4) Use the test-set as data.x_test, data.y_test and data.y_test_cls
+#
+########################################################################
+#
+# This file is part of the TensorFlow Tutorials available at:
+#
+# https://github.com/Hvass-Labs/TensorFlow-Tutorials
+#
+# Published under the MIT License. See the file LICENSE for details.
+#
+# Copyright 2016-18 by Magnus Erik Hvass Pedersen
+#
+########################################################################
+
+import numpy as np
+import gzip
+import os
+from dataset import one_hot_encoded
+from download import download
+
+########################################################################
+
+# Base URL for downloading the data-files from the internet.
+base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
+
+# Filenames for the data-set.
+filename_x_train = "train-images-idx3-ubyte.gz"
+filename_y_train = "train-labels-idx1-ubyte.gz"
+filename_x_test = "t10k-images-idx3-ubyte.gz"
+filename_y_test = "t10k-labels-idx1-ubyte.gz"
+
+########################################################################
+
+
+class MNIST:
+    """
+    The MNIST data-set for recognizing hand-written digits.
+    This automatically downloads the data-files if they do
+    not already exist in the local data_dir.
+
+    Note: Pixel-values are floats between 0.0 and 1.0.
+    """
+
+    # The images are 28 pixels in each dimension.
+    img_size = 28
+
+    # The images are stored in one-dimensional arrays of this length.
+    img_size_flat = img_size * img_size
+
+    # Tuple with height and width of images used to reshape arrays.
+    img_shape = (img_size, img_size)
+
+    # Number of colour channels for the images: 1 channel for gray-scale.
+    num_channels = 1
+
+    # Tuple with height, width and depth used to reshape arrays.
+    # This is used for reshaping in Keras.
+    img_shape_full = (img_size, img_size, num_channels)
+
+    # Number of classes, one class for each of 10 digits.
+    num_classes = 10
+
+    def __init__(self, data_dir="data/MNIST/"):
+        """
+        Load the MNIST data-set. Automatically downloads the files
+        if they do not already exist locally.
+
+        :param data_dir: Base-directory for downloading files.
+        """
+
+        # Copy args to self.
+        self.data_dir = data_dir
+
+        # Number of images in each sub-set.
+        self.num_train = 55000
+        self.num_val = 5000
+        self.num_test = 10000
+
+        # Download / load the training-set.
+        x_train = self._load_images(filename=filename_x_train)
+        y_train_cls = self._load_cls(filename=filename_y_train)
+
+        # Split the training-set into train / validation.
+        # Pixel-values are converted from ints between 0 and 255
+        # to floats between 0.0 and 1.0.
+        self.x_train = x_train[0:self.num_train] / 255.0
+        self.x_val = x_train[self.num_train:] / 255.0
+        self.y_train_cls = y_train_cls[0:self.num_train]
+        self.y_val_cls = y_train_cls[self.num_train:]
+
+        # Download / load the test-set.
+        self.x_test = self._load_images(filename=filename_x_test) / 255.0
+        self.y_test_cls = self._load_cls(filename=filename_y_test)
+
+        # Convert the class-numbers from bytes to ints as that is needed
+        # some places in TensorFlow.
+        self.y_train_cls = self.y_train_cls.astype(np.int)
+        self.y_val_cls = self.y_val_cls.astype(np.int)
+        self.y_test_cls = self.y_test_cls.astype(np.int)
+
+        # Convert the integer class-numbers into one-hot encoded arrays.
+        self.y_train = one_hot_encoded(class_numbers=self.y_train_cls,
+                                       num_classes=self.num_classes)
+        self.y_val = one_hot_encoded(class_numbers=self.y_val_cls,
+                                     num_classes=self.num_classes)
+        self.y_test = one_hot_encoded(class_numbers=self.y_test_cls,
+                                      num_classes=self.num_classes)
+
+    def _load_data(self, filename, offset):
+        """
+        Load the data in the given file. Automatically downloads the file
+        if it does not already exist in the data_dir.
+
+        :param filename: Name of the data-file.
+        :param offset: Start offset in bytes when reading the data-file.
+        :return: The data as a numpy array.
+        """
+
+        # Download the file from the internet if it does not exist locally.
+        download(base_url=base_url, filename=filename, download_dir=self.data_dir)
+
+        # Read the data-file.
+        path = os.path.join(self.data_dir, filename)
+        with gzip.open(path, 'rb') as f:
+            data = np.frombuffer(f.read(), np.uint8, offset=offset)
+
+        return data
+
+    def _load_images(self, filename):
+        """
+        Load image-data from the given file.
+        Automatically downloads the file if it does not exist locally.
+
+        :param filename: Name of the data-file.
+        :return: Numpy array.
+        """
+
+        # Read the data as one long array of bytes.
+        data = self._load_data(filename=filename, offset=16)
+
+        # Reshape to 2-dim array with shape (num_images, img_size_flat).
+        images_flat = data.reshape(-1, self.img_size_flat)
+
+        return images_flat
+
+    def _load_cls(self, filename):
+        """
+        Load class-numbers from the given file.
+        Automatically downloads the file if it does not exist locally.
+
+        :param filename: Name of the data-file.
+        :return: Numpy array.
+        """
+        return self._load_data(filename=filename, offset=8)
+
+    def random_batch(self, batch_size=32):
+        """
+        Create a random batch of training-data.
+
+        :param batch_size: Number of images in the batch.
+        :return: 3 numpy arrays (x, y, y_cls)
+        """
+
+        # Create a random index into the training-set.
+        idx = np.random.randint(low=0, high=self.num_train, size=batch_size)
+
+        # Use the index to lookup random training-data.
+        x_batch = self.x_train[idx]
+        y_batch = self.y_train[idx]
+        y_batch_cls = self.y_train_cls[idx]
+
+        return x_batch, y_batch, y_batch_cls
+
+
+########################################################################