Add user test code

c21 · c21 · commit 69387dd5df49 · 2023-11-03T16:28:40.000-07:00
Signed-off-by: Cheng Su &lt;scnju13@gmail.com&gt;
diff --git a/.config.py b/.config.py
@@ -0,0 +1,59 @@
+from torchvision.transforms import Compose, Normalize, ToTensor
+
+import ray.train
+from ray.train import ScalingConfig
+from ray.train.torch import TorchTrainer
+
+from util import prepare_model
+
+
+# [1] Read the training data (Parquet files under the S3 prefix below) into `ds`.
+path = "s3://air-example-data/data-cuj/train"
+ds = ray.data.read_parquet(path)
+
+# [2] Preprocess the training data: ToTensor, then Normalize with mean 0.5 and std 0.5.
+transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
+# Per-row preprocessing: rewrites only the "image" entry and returns the same row object.
+def transform_image(row):
+    row["image"] = transform(row["image"])
+    return row
+# Register the per-row function on the dataset; `ds` is rebound to the mapped dataset.
+ds = ds.map(transform_image)
+
+# [3] Ingest the training data into the model on each training worker.
+def train_func_per_worker():
+    """Train the model from prepare_model() for 2 epochs on the "train" dataset shard."""
+    from tqdm import tqdm
+    model, loss_fn, optimizer = prepare_model()
+
+    # Model training loop for 2 epochs.
+    for epoch in range(2):
+        train_data_iterator = ray.train.get_dataset_shard("train")  # key matches `datasets=`
+        train_data_iterator = train_data_iterator.iter_torch_batches(batch_size=32)
+
+        model.train()
+        for batch in tqdm(train_data_iterator, desc=f"Train Epoch {epoch}"):
+            images = batch["image"]  # each batch is indexed by column name
+            labels = batch["label"]
+            pred = model(images)
+            loss = loss_fn(pred, labels)
+
+            optimizer.zero_grad()  # clear gradients left over from the previous step
+            loss.backward()
+            optimizer.step()
+
+        model.eval()  # switch to eval mode at epoch end; no evaluation pass follows here
+
+# [4] Start distributed training.
+if __name__ == "__main__":
+    # Build a TorchTrainer that runs `train_func_per_worker` with `ds` registered as "train".
+    trainer = TorchTrainer(
+        train_loop_per_worker=train_func_per_worker,
+        scaling_config=ScalingConfig(num_workers=1, use_gpu=True),
+        datasets={"train": ds}
+    )
+
+    # Run `train_func_per_worker` on all workers (one GPU worker as configured above).
+    # NOTE(review): the notebook's Task 3 title says 4 GPU workers — confirm num_workers=1.
+    result = trainer.fit()
+    print(f"Training result: {result}")
diff --git a/user_test.ipynb b/user_test.ipynb
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Ray Data User Testing\n",
+    "\n",
+    "In this notebook, you will learn how to use Ray Data for distributed model training. Ray Data is used for data loading for model training. You are asked to fill in the missing code to finish the following 3 tasks:\n",
+    "\n",
+    "- Task 1: Read training data from S3\n",
+    "- Task 2: Preprocess training data\n",
+    "- Task 3: Run distributed training with 4 GPU workers\n",
+    "\n",
+    "You can refer to the Ray Data Documentation for user guides and APIs: https://docs.ray.io/en/master/data/data.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Task 1: Read training data from S3\n",
+    "\n",
+    "In this section, you will read training data from AWS S3 (https://aws.amazon.com/s3/). The training data consists of a list of files in Parquet format (https://parquet.apache.org/). The training data contains images and their labels.\n",
+    "\n",
+    "Success Criteria for this section:\n",
+    "- Successfully create a Ray Dataset to read Parquet files from S3.\n",
+    "- Successfully inspect the metadata of the Ray Dataset to get the column names and the number of images."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Instruction: just run this cell.\n",
+    "# Import required dependencies.\n",
+    "import numpy as np\n",
+    "from torchvision.transforms import Compose, Normalize, ToTensor\n",
+    "from tqdm import tqdm\n",
+    "import ray.train\n",
+    "from ray.train import ScalingConfig\n",
+    "from ray.train.torch import TorchTrainer\n",
+    "from util import prepare_model\n",
+    "\n",
+    "path = \"s3://air-example-data/data-cuj/train\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Instruction: add your code here.\n",
+    "# Read from S3 bucket above: s3://air-example-data/data-cuj/train\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Please fill in the blanks below:\n",
+    "\n",
+    "|                     |                      |\n",
+    "|---------------------|----------------------|\n",
+    "| Column names        | ___x___              |\n",
+    "| Number of images    | ___x___              |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Task 2: Preprocess images\n",
+    "\n",
+    "In this section, we'll preprocess images to normalize them for training. You are given the preprocessing code below, which normalizes a single image with a TorchVision transform. You are expected to use the Ray Data API to parallelize the preprocessing logic across all images.\n",
+    "\n",
+    "Success Criteria for this section:\n",
+    "- Successfully use the correct Ray Data API to run preprocessing for all images."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Instruction: just run this cell.\n",
+    "# The preprocessing function to be applied to every image.\n",
+    "transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])\n",
+    "\n",
+    "# Example of running the preprocessing on a toy image.\n",
+    "example_image = np.array([[1, 0, 1], [1, 1, 0.5]], np.double)\n",
+    "transform(example_image)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Instruction: add your code here.\n",
+    "# Use Ray Data to parallelize the preprocessing logic above.\n",
+    "# NOTE: Do not drop labels during preprocessing!\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 3: Run distributed training with 4 GPU workers\n",
+    "\n",
+    "In this section, let's do distributed model training on the preprocessed images. You are given a script in which everything works except the data loading part, which is missing. You are expected to fill in the data loading code and start training!\n",
+    "\n",
+    "Success Criteria for this section:\n",
+    "- Successfully use the correct Ray Data API to get a shard of the Dataset.\n",
+    "- Successfully use the correct Ray Data API to iterate over the Dataset and pass batches of (images, labels) to the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# The function executed on each training worker.\n",
+    "def train_func_per_worker():\n",
+    "    # Prepare model.\n",
+    "    model, loss_fn, optimizer = prepare_model()\n",
+    "\n",
+    "    # Model training for 2 epochs.\n",
+    "    for epoch in range(2):\n",
+    "        # Instruction: add your code here to get Dataset.\n",
+    "        # NOTE: each training data batch should have 32 images and labels.\n",
+    "        # ...\n",
+    "        # train_data_iterator = ...\n",
+    "\n",
+    "        model.train()\n",
+    "        # Model training for each batch.\n",
+    "        for batch in tqdm(train_data_iterator, desc=f\"Train Epoch {epoch}\"):\n",
+    "            images = batch[\"image\"]\n",
+    "            labels = batch[\"label\"]\n",
+    "            pred = model(images)\n",
+    "            loss = loss_fn(pred, labels)\n",
+    "\n",
+    "            optimizer.zero_grad()\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "\n",
+    "        model.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Initialize a Ray TorchTrainer\n",
+    "trainer = TorchTrainer(\n",
+    "    train_loop_per_worker=train_func_per_worker,\n",
+    "    scaling_config=ScalingConfig(num_workers=1, use_gpu=True),\n",
+    "    # Instruction: add your code here to pass the Dataset variable.\n",
+    "    datasets={\"train\": ...}\n",
+    ")\n",
+    "\n",
+    "# Start model training!\n",
+    "result = trainer.fit()\n",
+    "print(f\"Training result: {result}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Please fill in the blanks below:\n",
+    "\n",
+    "|                            |                      |\n",
+    "|----------------------------|----------------------|\n",
+    "| Number of epochs trained   | ___x___ (epochs)     |\n",
+    "| End-to-end training time   | ___x___ (seconds)    |"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "1a1af0ee75eeea9e2e1ee996c87e7a2b11a0bebd85af04bb136d915cefc0abce"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/util.py b/util.py
@@ -0,0 +1,34 @@
+import torch
+from torch import nn
+
+import ray.train
+
+# Model Definition
+class NeuralNetwork(nn.Module):
+    def __init__(self):
+        super(NeuralNetwork, self).__init__()
+        self.flatten = nn.Flatten()
+        self.linear_relu_stack = nn.Sequential(
+            nn.Linear(28 * 28, 512),
+            nn.ReLU(),
+            nn.Dropout(0.25),
+            nn.Linear(512, 512),
+            nn.ReLU(),
+            nn.Dropout(0.25),
+            nn.Linear(512, 10),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        x = self.flatten(x)
+        logits = self.linear_relu_stack(x)
+        return logits
+
+# Create model, loss funtion and optimizer.
+def prepare_model():
+    lr = 1e-3
+    model = NeuralNetwork()
+    model = ray.train.torch.prepare_model(model)
+    loss_fn = nn.CrossEntropyLoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
+    return (model, loss_fn, optimizer)