diff --git a/notebooks/ProjectMockup.ipynb b/notebooks/ProjectMockup.ipynb
new file mode 100644
index 00000000..e1070353
--- /dev/null
+++ b/notebooks/ProjectMockup.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1c4fd5b5-1b9e-4a8e-82b3-6ab214c203ea",
+   "metadata": {},
+   "source": [
+    "## Project Mockup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "54a52e27-3d9d-4574-990b-329a47d4bf13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "struct S { double val = 1.0; };"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b9e80eda-8924-4f3c-bbc0-7b74ba9b9a36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "python_vec = cppyy.gbl.std.vector(cppyy.gbl.S)(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "697760dd-200e-4fa6-85b2-f1786215eda3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "print(python_vec[3].val)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d581f6cc-2737-4c11-b37a-25bb5296936a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<__main__.Derived object at 0x7fffdf639e40>\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<string>:1: RuntimeWarning: class \"S\" has no virtual destructor\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "class Derived(cppyy.gbl.S):\n",
+    "  def __init__(self):\n",
+    "    val = 0\n",
+    "res = Derived()\n",
+    "print(res)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f20feaa6-62a2-4805-853b-268b1c1832ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void arr_sum(int n, double *x, double *sum) {\n",
+    "    for(int i = 0; i < n; i++)\n",
+    "        *sum +=x[i];\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "cfd92edb-41e0-4bb8-b7a1-852782964423",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int n = 5;\n",
+    "double h_sum;\n",
+    "double *x = new double[n];"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "90480051-7a57-4145-85c4-ca3bdc8e101f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void setData(const std::vector<S>& a) {\n",
+    "    int i = 0;\n",
+    "    for(auto &s : a) {\n",
+    "        x[i] = s.val;\n",
+    "        i++;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9d9913b7-db39-4c76-8939-efbca9256143",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "data_list = [1.0, 2.0, 3.0, 4.0, 5.0]\n",
+    "for c, i in enumerate(data_list):\n",
+    "    python_vec[c].val = i\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "846901c7-0cbb-4978-9efe-7f5870b939a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "cppyy.gbl.setData(python_vec)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f57c46fa-addb-4d80-8cc3-29a464911fdc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "double *d_x, *d_sum;\n",
+    "cudaMalloc((void **)&d_x, n * sizeof(double));\n",
+    "cudaMalloc((void **)&d_sum, sizeof(double));"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d617ee2c-d9b7-4c4f-8ef2-051df37b2737",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cudaMemcpy(d_x, x, n * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8417b211-e7f9-448d-969a-27fc0eb32697",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arr_sum<<<1, 1>>>(n, d_x, d_sum);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "4374b687-31a5-4b85-8d66-e738956814b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "8198bfdf-0ff7-4e27-ba6b-087833055794",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sum: 15\n"
+     ]
+    }
+   ],
+   "source": [
+    "std::cout << \"Sum: \" << h_sum << std::endl;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "33ffcf05-58cc-4212-aef7-4a5813fdc178",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "delete[] x;\n",
+    "cudaFree(d_x);\n",
+    "cudaFree(d_sum);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "add835ef-6196-47d8-be75-14b872438de1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CUDA (C++17)",
+   "language": "CUDA",
+   "name": "cuda-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/Python-Cpp-Integration-Demo.ipynb b/notebooks/Python-Cpp-Integration-Demo.ipynb
new file mode 100644
index 00000000..41ae6eab
--- /dev/null
+++ b/notebooks/Python-Cpp-Integration-Demo.ipynb
@@ -0,0 +1,273 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ca6bccaa",
+   "metadata": {},
+   "source": [
+    "## Declaring variables in C++ "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "879a6503",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extern \"C\" int printf(const char*,...);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2063086d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int new_var1 = 12;\n",
+    "int new_var2 = 25;\n",
+    "int new_var3 = 64;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6e42ad3d",
+   "metadata": {},
+   "source": [
+    "## Running Python with C++ variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7edde9fb",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "This is printed from Python: Today is Tue Oct 25 11:38:08 2022\n",
+      "[1, 2, 12, 25, 64]\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "from time import time,ctime\n",
+    "print('This is printed from Python: Today is', ctime(time()))\n",
+    "python_array = [1, 2, new_var1, new_var2, new_var3]\n",
+    "print(python_array)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "579f1d82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python \n",
+    "\n",
+    "new_python_var = 1327\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "43391a7e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "new_python_var = 1327\n"
+     ]
+    }
+   ],
+   "source": [
+    "auto k = printf(\"new_python_var = %d\\n\", new_python_var);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9030df20",
+   "metadata": {},
+   "source": [
+    "## Working with a vector type"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "400e42d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <iostream>\n",
+    "#include <vector>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "92b2bd05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<int> fill_vector(std::vector<int> g) { for (int i = 1; i <= 5; i++){ g.push_back(i);} return g; }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ea7f2d92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int size_of_vector (std::vector<int> g) { std::cout << \"Size : \" << g.size(); return g.size(); }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "760237a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int capacity_of_vector (std::vector<int> g) { std::cout << \"Capacity : \" << g.capacity(); return g.capacity(); }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "1c02ee6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<int> cpp_vector = fill_vector(cpp_vector);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "a7fb4cb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size : 5"
+     ]
+    }
+   ],
+   "source": [
+    "int size = size_of_vector(cpp_vector);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "95f8e6f7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Capacity : 5"
+     ]
+    }
+   ],
+   "source": [
+    "int capacity = capacity_of_vector(cpp_vector);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "781feea0",
+   "metadata": {},
+   "source": [
+    "## Plotting data from C++ and Python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "7ccb6fe7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1, 2, 3, 4, 5]\n",
+      "[1, 2, 12, 25, 64]\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "print(cpp_vector)\n",
+    "print(python_array)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "531e00ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "plt.plot(python_array, cpp_vector,linewidth=3)\n",
+    "plt.xlabel('Python Array')\n",
+    "plt.ylabel('Cpp Array')\n",
+    "plt.savefig('python_cpp_plot.png')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b1842530",
+   "metadata": {},
+   "source": [
+    "<img src=\"python_cpp_plot.png\" align=left width=\"400\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bf12bfb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++11",
+   "language": "C++11",
+   "name": "xcpp11"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/audio/audio.wav b/notebooks/audio/audio.wav
new file mode 100644
index 00000000..ad425399
Binary files /dev/null and b/notebooks/audio/audio.wav differ
diff --git a/notebooks/clad-demo.ipynb b/notebooks/clad-demo.ipynb
new file mode 100644
index 00000000..34684c2c
--- /dev/null
+++ b/notebooks/clad-demo.ipynb
@@ -0,0 +1,113 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "74f52cc2-bfda-4fee-9cd9-94d683a8b7e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include \"clad/Differentiator/Differentiator.h\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "55453d37-e8af-4cb7-8f82-550e9cf88b3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "// Rosenbrock function declaration\n",
+    "double rosenbrock_func(double x, double y) {\n",
+    "return (x - 1) * (x - 1) + 100 * (y - x * x) * (y - x * x);\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9098b6e4-87a7-497d-9a1b-dddeb813dc56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "double rosenbrock_forward(double x[], int size) {\n",
+    "    double sum = 0;\n",
+    "    auto rosenbrockX = clad::differentiate(rosenbrock_func, 0);\n",
+    "    auto rosenbrockY = clad::differentiate(rosenbrock_func, 1);\n",
+    "    for (int i = 0; i < size-1; i++) {\n",
+    "        double one = rosenbrockX.execute(x[i], x[i + 1]);\n",
+    "        double two = rosenbrockY.execute(x[i], x[i + 1]);\n",
+    "            sum = sum + one + two;\n",
+    "    }\n",
+    "    return sum;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e771bbdc-3979-43df-9ed9-3429ee713b5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "const int size = 100000000;\n",
+    "double Xarray[size];\n",
+    "for(int i=0;i<size;i++)\n",
+    "  Xarray[i]=((double)rand()/RAND_MAX);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e74f85c2-a3bc-4562-a9ae-321eb306e8ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "double r = rosenbrock_forward(Xarray, size);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "184cebf3-a98f-4c07-97c6-c878be3740c8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3232947666.004954\n"
+     ]
+    }
+   ],
+   "source": [
+    "printf(\"%f\\n\", r);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f68f1fe-24ca-40fb-997b-6f47a6b81ee7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++17 (+Clad)",
+   "language": "C++17",
+   "name": "clad-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "C++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+}
\ No newline at end of file
diff --git a/notebooks/clad-smallpt.ipynb b/notebooks/clad-smallpt.ipynb
new file mode 100644
index 00000000..3f23ce66
--- /dev/null
+++ b/notebooks/clad-smallpt.ipynb
@@ -0,0 +1,729 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "18c0527f-84c8-4fd7-9503-0790bb44ab4f",
+   "metadata": {},
+   "source": [
+    "# SmallPT with Clad + OpenMP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "561cd184-24ca-45be-9534-0172454d4398",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//--------------------------------------------------------------------*- C++ -*-\n",
+    "// clad - The C++ Clang-based Automatic Differentiator\n",
+    "//\n",
+    "// A demo, describing how to use clad in simple AD-aware Path tracer.\n",
+    "//\n",
+    "// author:  Alexander Penev <alexander.p.penev-at-gmail.com>\n",
+    "//          Based on smallpt, a Path Tracer by Kevin Beason, 2008\n",
+    "//----------------------------------------------------------------------------//\n",
+    "\n",
+    "// To compile the demo please type:\n",
+    "// path/to/clang -O3 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang \\\n",
+    "// path/to/libclad.so -I../../include/ -x c++ -std=c++11 -lstdc++ -lm SmallPT.cpp \\\n",
+    "// -fopenmp=libiomp5 -o SmallPT\n",
+    "//\n",
+    "// To run the demo please type:\n",
+    "// ./SmallPT 500 && xv image.ppm\n",
+    "\n",
+    "// A typical invocation would be:\n",
+    "// ../../../../../obj/Debug+Asserts/bin/clang -O3 -Xclang -add-plugin -Xclang clad \\\n",
+    "// -Xclang -load -Xclang ../../../../../obj/Debug+Asserts/lib/libclad.dylib \\\n",
+    "// -I../../include/ -x c++ -std=c++11 -lstdc++ -lm SmallPT.cpp -fopenmp=libiomp5 -o SmallPT\n",
+    "// ./SmallPT 500 && xv image.ppm\n",
+    "\n",
+    "// Necessary for clad to work include\n",
+    "#include \"clad/Differentiator/Differentiator.h\"\n",
+    "\n",
+    "#include <math.h>\n",
+    "#include <stdlib.h>\n",
+    "#include <stdio.h>\n",
+    "#include <iostream>\n",
+    "\n",
+    "#include <omp.h>\n",
+    "#include <clang/Interpreter/CppInterOp.h>\n",
+    "\n",
+    "Cpp::LoadLibrary(\"libomp\");\n",
+    "\n",
+    "//TODO: Try to fix single precision issues\n",
+    "//#define double float\n",
+    "\n",
+    "// Test types (default is TEST_TYPE_BY_CLAD)\n",
+    "#define TEST_TYPE_BY_HAND\n",
+    "//#define TEST_TYPE_BY_CLAD\n",
+    "//#define TEST_TYPE_BY_NUM\n",
+    "\n",
+    "\n",
+    "struct Vec {\n",
+    "  double x, y, z; // position, also color (r,g,b)\n",
+    "\n",
+    "  Vec(double x_=0, double y_=0, double z_=0) { x=x_; y=y_; z=z_; }\n",
+    "\n",
+    "  Vec operator+(const Vec &b) const { return Vec(x+b.x, y+b.y, z+b.z); }\n",
+    "  Vec operator-(const Vec &b) const { return Vec(x-b.x, y-b.y, z-b.z); }\n",
+    "  Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }\n",
+    "  double operator*(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // dot\n",
+    "  Vec operator%(const Vec &b) const { return Vec(y*b.z-z*b.y, z*b.x-x*b.z, x*b.y-y*b.x); } // cross\n",
+    "  Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }\n",
+    "  Vec& norm() { return *this = *this * (1/sqrt(x*x+y*y+z*z)); }\n",
+    "};\n",
+    "\n",
+    "struct Ray {\n",
+    "  Vec o, d; // Origin and direction\n",
+    "  Ray() : o(), d() {}\n",
+    "  Ray(Vec o_, Vec d_): o(o_), d(d_) {}\n",
+    "};\n",
+    "\n",
+    "enum Refl_t { DIFF, SPEC, REFR };  // material types, used in radiance()\n",
+    "\n",
+    "#define inf 1e6\n",
+    "#define eps 1e-6"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "56215b0d-23f8-4866-b745-ee6b3b881eb2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "/*\n",
+    "Copyright (C) 2017 Milo Yip. All rights reserved.\n",
+    "\n",
+    "Redistribution and use in source and binary forms, with or without\n",
+    "modification, are permitted provided that the following conditions are met:\n",
+    "\n",
+    "* Redistributions of source code must retain the above copyright notice, this\n",
+    "  list of conditions and the following disclaimer.\n",
+    "\n",
+    "* Redistributions in binary form must reproduce the above copyright notice,\n",
+    "  this list of conditions and the following disclaimer in the documentation\n",
+    "  and/or other materials provided with the distribution.\n",
+    "\n",
+    "* Neither the name of pngout nor the names of its\n",
+    "  contributors may be used to endorse or promote products derived from\n",
+    "  this software without specific prior written permission.\n",
+    "\n",
+    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n",
+    "AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n",
+    "IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n",
+    "DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n",
+    "FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n",
+    "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n",
+    "SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n",
+    "CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n",
+    "OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n",
+    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n",
+    "*/\n",
+    "\n",
+    "/*! \\file\n",
+    "    \\brief      svpng() is a minimalistic C function for saving RGB/RGBA image into uncompressed PNG.\n",
+    "    \\author     Milo Yip\n",
+    "    \\version    0.1.1\n",
+    "    \\copyright  MIT license\n",
+    "    \\sa         http://github.com/miloyip/svpng\n",
+    "*/\n",
+    "\n",
+    "#define SVPNG_LINKAGE\n",
+    "\n",
+    "#include <stdio.h>\n",
+    "#define SVPNG_OUTPUT FILE* fp\n",
+    "\n",
+    "#define SVPNG_PUT(u) fputc(u, fp)\n",
+    "\n",
+    "/*!\n",
+    "    \\brief Save a RGB/RGBA image in PNG format.\n",
+    "    \\param SVPNG_OUTPUT Output stream (by default using file descriptor).\n",
+    "    \\param w Width of the image. (<16383)\n",
+    "    \\param h Height of the image.\n",
+    "    \\param img Image pixel data in 24-bit RGB or 32-bit RGBA format.\n",
+    "    \\param alpha Whether the image contains alpha channel.\n",
+    "*/\n",
+    "SVPNG_LINKAGE void svpng(SVPNG_OUTPUT, unsigned w, unsigned h, const unsigned char* img, int alpha) {\n",
+    "    static const unsigned t[] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, \n",
+    "    /* CRC32 Table */    0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };\n",
+    "    unsigned a = 1, b = 0, c, p = w * (alpha ? 4 : 3) + 1, x, y, i;   /* ADLER-a, ADLER-b, CRC, pitch */\n",
+    "#define SVPNG_U8A(ua, l) for (i = 0; i < l; i++) SVPNG_PUT((ua)[i]);\n",
+    "#define SVPNG_U32(u) do { SVPNG_PUT((u) >> 24); SVPNG_PUT(((u) >> 16) & 255); SVPNG_PUT(((u) >> 8) & 255); SVPNG_PUT((u) & 255); } while(0)\n",
+    "#define SVPNG_U8C(u) do { SVPNG_PUT(u); c ^= (u); c = (c >> 4) ^ t[c & 15]; c = (c >> 4) ^ t[c & 15]; } while(0)\n",
+    "#define SVPNG_U8AC(ua, l) for (i = 0; i < l; i++) SVPNG_U8C((ua)[i])\n",
+    "#define SVPNG_U16LC(u) do { SVPNG_U8C((u) & 255); SVPNG_U8C(((u) >> 8) & 255); } while(0)\n",
+    "#define SVPNG_U32C(u) do { SVPNG_U8C((u) >> 24); SVPNG_U8C(((u) >> 16) & 255); SVPNG_U8C(((u) >> 8) & 255); SVPNG_U8C((u) & 255); } while(0)\n",
+    "#define SVPNG_U8ADLER(u) do { SVPNG_U8C(u); a = (a + (u)) % 65521; b = (b + a) % 65521; } while(0)\n",
+    "#define SVPNG_BEGIN(s, l) do { SVPNG_U32(l); c = ~0U; SVPNG_U8AC(s, 4); } while(0)\n",
+    "#define SVPNG_END() SVPNG_U32(~c)\n",
+    "    SVPNG_U8A(\"\\x89PNG\\r\\n\\32\\n\", 8);           /* Magic */\n",
+    "    SVPNG_BEGIN(\"IHDR\", 13);                    /* IHDR chunk { */\n",
+    "    SVPNG_U32C(w); SVPNG_U32C(h);               /*   Width & Height (8 bytes) */\n",
+    "    SVPNG_U8C(8); SVPNG_U8C(alpha ? 6 : 2);     /*   Depth=8, Color=True color with/without alpha (2 bytes) */\n",
+    "    SVPNG_U8AC(\"\\0\\0\\0\", 3);                    /*   Compression=Deflate, Filter=No, Interlace=No (3 bytes) */\n",
+    "    SVPNG_END();                                /* } */\n",
+    "    SVPNG_BEGIN(\"IDAT\", 2 + h * (5 + p) + 4);   /* IDAT chunk { */\n",
+    "    SVPNG_U8AC(\"\\x78\\1\", 2);                    /*   Deflate block begin (2 bytes) */\n",
+    "    for (y = 0; y < h; y++) {                   /*   Each horizontal line makes a block for simplicity */\n",
+    "        SVPNG_U8C(y == h - 1);                  /*   1 for the last block, 0 for others (1 byte) */\n",
+    "        SVPNG_U16LC(p); SVPNG_U16LC(~p);        /*   Size of block in little endian and its 1's complement (4 bytes) */\n",
+    "        SVPNG_U8ADLER(0);                       /*   No filter prefix (1 byte) */\n",
+    "        for (x = 0; x < p - 1; x++, img++)\n",
+    "            SVPNG_U8ADLER(*img);                /*   Image pixel data */\n",
+    "    }\n",
+    "    SVPNG_U32C((b << 16) | a);                  /*   Deflate block end with adler (4 bytes) */\n",
+    "    SVPNG_END();                                /* } */\n",
+    "    SVPNG_BEGIN(\"IEND\", 0); SVPNG_END();        /* IEND chunk {} */\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e489bc-5bec-4b62-a0b2-d7f7916e1ecf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "#include \"xeus-clang-repl/xinterpreter.hpp\"\n",
+    "#include \"xtl/xbase64.hpp\"\n",
+    "#include \"nlohmann/json.hpp\"\n",
+    "#include <string>\n",
+    "#include <fstream>\n",
+    "#include <sstream>\n",
+    "\n",
+    "inline double Clamp(double v, double low = 0.0, double high = 1.0) noexcept\n",
+    "{\n",
+    "    return fmin(fmax(v, low), high);\n",
+    "}\n",
+    "\n",
+    "int toInt(float x)\n",
+    "{\n",
+    "    return (int) (pow(Clamp(x, 0.0f, 1.0f), 1.0f / 2.2f) * 255 + 0.5f);\n",
+    "}\n",
+    "\n",
+    "void save(const char* fileName, int width, int height, Vec* data)\n",
+    "{\n",
+    "    FILE *fp = fopen(fileName, \"wb\");\n",
+    "\n",
+    "    // Convert from Vector3 array to uchar array\n",
+    "    unsigned char* output = new unsigned char[width * height * 3];\n",
+    "\n",
+    "    for(int i = 0; i < width * height; i++)\n",
+    "    {\n",
+    "        //printf_s(\"%f %f %f \\n\", data[i].x, data[i].y, data[i].z);\n",
+    "        output[i * 3 + 0] = toInt(data[i].x);\n",
+    "        output[i * 3 + 1] = toInt(data[i].y);\n",
+    "        output[i * 3 + 2] = toInt(data[i].z);\n",
+    "    }\n",
+    "\n",
+    "    svpng(fp, width, height, output, 0);\n",
+    "\n",
+    "    fclose(fp);\n",
+    "    delete[] output;\n",
+    "}\n",
+    "\n",
+    "namespace nl = nlohmann;\n",
+    "\n",
+    "struct image {   \n",
+    "    inline image(const std::string& filename)\n",
+    "    {\n",
+    "        std::ifstream fin(filename, std::ios::binary);   \n",
+    "        m_buffer << fin.rdbuf();\n",
+    "    }\n",
+    "        \n",
+    "    std::stringstream m_buffer;\n",
+    "};\n",
+    "    \n",
+    "nl::json mime_bundle_repr(const image& i)\n",
+    "{\n",
+    "    auto bundle = nl::json::object();\n",
+    "    bundle[\"image/png\"] = xtl::base64encode(i.m_buffer.str());\n",
+    "    return bundle;\n",
+    "}\n",
+    "\n",
+    "template <class T> void display(const T &t) {\n",
+    "  using ::mime_bundle_repr;\n",
+    "  xeus::get_interpreter().display_data(mime_bundle_repr(t), nl::json::object(),\n",
+    "                                       nl::json::object());\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e6c43ec-7aab-4488-a506-6ea0f720eb70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "// Abstract Solid\n",
+    "\n",
+    "class Solid {\n",
+    "public:\n",
+    "  Vec e, c; // emission, color\n",
+    "  Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)\n",
+    "\n",
+    "  Solid() : e(), c(), refl() {}\n",
+    "  Solid(Vec e_, Vec c_, Refl_t refl_): e(e_), c(c_), refl(refl_) {}\n",
+    "\n",
+    "  // returns distance, 0 if nohit\n",
+    "  virtual double intersect(const Ray &r) const { return 0; };\n",
+    "\n",
+    "  // returns normal vector to surface in point pt\n",
+    "  virtual Vec normal(const Vec &pt) const { return Vec(1,0,0); };\n",
+    "};\n",
+    "\n",
+    "// Abstract Implicit Solid\n",
+    "\n",
+    "class ImplicitSolid : public Solid {\n",
+    "public:\n",
+    "\n",
+    "  ImplicitSolid() {}\n",
+    "  ImplicitSolid(Vec e_, Vec c_, Refl_t refl_): Solid(e_, c_, refl_) {}\n",
+    "\n",
+    "  // Return signed distance to nearest point on solid surface\n",
+    "  virtual double distance_func(double x, double y, double z) const {\n",
+    "    return 0;\n",
+    "  }\n",
+    "\n",
+    "  // TODO: Remove when forward for virtual diff methods is not need\n",
+    "  virtual double distance_func_darg0(double x, double y, double z) const;\n",
+    "  virtual double distance_func_darg1(double x, double y, double z) const;\n",
+    "  virtual double distance_func_darg2(double x, double y, double z) const;\n",
+    "\n",
+    "  // implicit surface intersection\n",
+    "  // returns distance, 0 if nohit\n",
+    "  double intersect(const Ray &r) const override {\n",
+    "    double t=2*eps, t1, f;\n",
+    "    Vec pt;\n",
+    "    do {\n",
+    "      pt=r.o+r.d*t;\n",
+    "      f=fabs(distance_func(pt.x, pt.y, pt.z));\n",
+    "      t1=t;\n",
+    "      t+=f;\n",
+    "      if (f<eps || t==t1) return t;\n",
+    "    } while (t<inf);\n",
+    "    return 0;\n",
+    "  }\n",
+    "\n",
+    "  // returns normal vector to surface in point pt\n",
+    "  // by clad\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    // FIXME: Replace the calls to `clad::differentiate` when we fix\n",
+    "    // CladFunction::execute being able to pass the derived class (without\n",
+    "    // it being sliced.\n",
+    "    auto distance_func_dx = clad::differentiate(&ImplicitSolid::distance_func, 0);\n",
+    "    auto distance_func_dy = clad::differentiate(&ImplicitSolid::distance_func, 1);\n",
+    "    auto distance_func_dz = clad::differentiate(&ImplicitSolid::distance_func, 2);\n",
+    "    // FIXME: Uncomment, see above.\n",
+    "    //double Nx = distance_func_dx.execute(*this, pt.x, pt.y, pt.z);\n",
+    "    //double Ny = distance_func_dy.execute(*this, pt.x, pt.y, pt.z);\n",
+    "    //double Nz = distance_func_dz.execute(*this, pt.x, pt.y, pt.z);\n",
+    "    double Nx = distance_func_darg0(pt.x, pt.y, pt.z);\n",
+    "    double Ny = distance_func_darg1(pt.x, pt.y, pt.z);\n",
+    "    double Nz = distance_func_darg2(pt.x, pt.y, pt.z);\n",
+    "\n",
+    "    return Vec(Nx, Ny, Nz).norm();\n",
+    "  }\n",
+    "};\n",
+    "\n",
+    "\n",
+    "// Sphere Solid\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_HAND\n",
+    "// by hand\n",
+    "double sphere_func_dx(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return 2*(x-p.x);\n",
+    "}\n",
+    "\n",
+    "double sphere_func_dy(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return 2*(y-p.y);\n",
+    "}\n",
+    "\n",
+    "double sphere_func_dz(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return 2*(z-p.z);\n",
+    "}\n",
+    "#endif\n",
+    "\n",
+    "double sphere_distance_func(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return sqrt((x-p.x)*(x-p.x) + (y-p.y)*(y-p.y) + (z-p.z)*(z-p.z)) - r;\n",
+    "}\n",
+    "\n",
+    "double sphere_implicit_func(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return (x-p.x)*(x-p.x) + (y-p.y)*(y-p.y) + (z-p.z)*(z-p.z) - r*r;\n",
+    "}\n",
+    "\n",
+    "//\n",
+    "\n",
+    "class Sphere : public ImplicitSolid {\n",
+    "public:\n",
+    "  double r; // radius\n",
+    "  Vec p; // position\n",
+    "\n",
+    "  Sphere() : r(), p() {}\n",
+    "  Sphere(double r_, Vec p_, Vec e_, Vec c_, Refl_t refl_):\n",
+    "    r(r_), p(p_), ImplicitSolid(e_, c_, refl_) {\n",
+    "    // Move to Normal when execute can call polymorphic diffs\n",
+    "    auto sphere_func_dx = clad::differentiate(&Sphere::distance_func, 0);\n",
+    "    auto sphere_func_dy = clad::differentiate(&Sphere::distance_func, 1);\n",
+    "    auto sphere_func_dz = clad::differentiate(&Sphere::distance_func, 2);\n",
+    "  }\n",
+    "\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_HAND\n",
+    "  // by hand\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    double Nx = sphere_func_dx(pt.x, pt.y, pt.z, p, r);\n",
+    "    double Ny = sphere_func_dy(pt.x, pt.y, pt.z, p, r);\n",
+    "    double Nz = sphere_func_dz(pt.x, pt.y, pt.z, p, r);\n",
+    "\n",
+    "    return Vec(Nx, Ny, Nz).norm();\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_CLAD\n",
+    "  // by clad\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    //auto sphere_func_dx = clad::differentiate(&Sphere::distance_func, 0);\n",
+    "    //auto sphere_func_dy = clad::differentiate(&Sphere::distance_func, 1);\n",
+    "    //auto sphere_func_dz = clad::differentiate(&Sphere::distance_func, 2);\n",
+    "\n",
+    "    //double Nx = sphere_func_dx.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "    //double Ny = sphere_func_dy.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "    //double Nz = sphere_func_dz.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "\n",
+    "    double Nx = distance_func_darg0(pt.x, pt.y, pt.z);\n",
+    "    double Ny = distance_func_darg0(pt.x, pt.y, pt.z);\n",
+    "    double Nz = distance_func_darg0(pt.x, pt.y, pt.z);\n",
+    "\n",
+    "    return Vec(Nx, Ny, Nz).norm();\n",
+    "    //return Vec(Nx, Ny, Nz); // nabla f of signed distance functions is always unit vector\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_NUM\n",
+    "  // by numeric approximation\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    double f =  sphere_implicit_func(pt.x, pt.y, pt.z, p, r);\n",
+    "    double fx = sphere_implicit_func(pt.x+eps, pt.y, pt.z, p, r);\n",
+    "    double fy = sphere_implicit_func(pt.x, pt.y+eps, pt.z, p, r);\n",
+    "    double fz = sphere_implicit_func(pt.x, pt.y, pt.z+eps, p, r);\n",
+    "\n",
+    "    return Vec((fx-f)/eps, (fy-f)/eps, (fz-f)/eps).norm();\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "  //TODO: Override distance func method when parent method is virtual\n",
+    "  double distance_func(double x, double y, double z) const override {\n",
+    "    return sqrt((x-p.x)*(x-p.x) + (y-p.y)*(y-p.y) + (z-p.z)*(z-p.z)) - r;\n",
+    "  }\n",
+    "};\n",
+    "\n",
+    "\n",
+    "// Hyperbolic Solid\n",
+    "\n",
+    "#define sin_a 0.965925826289068\n",
+    "#define cos_a 0.258819045102521\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_HAND\n",
+    "// by hand\n",
+    "double h_func_dx(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return (2./3.*cos_a)/pow(((x-p.x)*cos_a+(z-p.z)*sin_a),1./3.) - (2./3.*sin_a)/pow(((z-p.z)*cos_a-(x-p.x)*sin_a),1./3.);\n",
+    "}\n",
+    "\n",
+    "double h_func_dy(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return (2./3.)/pow(y-p.y,1./3.);\n",
+    "}\n",
+    "\n",
+    "double h_func_dz(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return (2./3.*sin_a)/pow((x-p.x)*cos_a+(z-p.z)*sin_a,1./3.) + (2./3.*cos_a)/pow((z-p.z)*cos_a-(x-p.x)*sin_a,1./3.);\n",
+    "}\n",
+    "#endif\n",
+    "\n",
+    "//TODO: Check this distance func. Visualized \"octahedron\" do not like as octahedron.\n",
+    "double hyperbolic_func(double x, double y, double z, const Vec &p, double r) {\n",
+    "  return pow((x-p.x)*cos_a+(z-p.z)*sin_a, 2./3.) + pow(y-p.y, 2./3.) + pow((x-p.x)*-sin_a+(z-p.z)*cos_a, 2./3.) - pow(r, 2./3.);\n",
+    "}\n",
+    "\n",
+    "class HyperbolicSolid : public ImplicitSolid {\n",
+    "public:\n",
+    "  double r; // radius\n",
+    "  Vec p; // position\n",
+    "  HyperbolicSolid() : r(), p() {}\n",
+    "  HyperbolicSolid(double r_, Vec p_, Vec e_, Vec c_, Refl_t refl_)\n",
+    "      : r(r_), p(p_), ImplicitSolid(e_, c_, refl_) {\n",
+    "    // FIXME: Move to Normal when execute can call polymorphic diffs.\n",
+    "    auto hyperbolic_func_dx = clad::differentiate(&HyperbolicSolid::distance_func, 0);\n",
+    "    auto hyperbolic_func_dy = clad::differentiate(&HyperbolicSolid::distance_func, 1);\n",
+    "    auto hyperbolic_func_dz = clad::differentiate(&HyperbolicSolid::distance_func, 2);\n",
+    "  }\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_HAND\n",
+    "  // by hand\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    double Nx = h_func_dx(pt.x, pt.y, pt.z, p, r);\n",
+    "    double Ny = h_func_dy(pt.x, pt.y, pt.z, p, r);\n",
+    "    double Nz = h_func_dz(pt.x, pt.y, pt.z, p, r);\n",
+    "\n",
+    "    return Vec(Nx, Ny, Nz).norm();\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_CLAD\n",
+    "  // by clad\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    //auto hyperbolic_func_dx = clad::differentiate(&HyperbolicSolid::distance_func, 0);\n",
+    "    //auto hyperbolic_func_dy = clad::differentiate(&HyperbolicSolid::distance_func, 1);\n",
+    "    //auto hyperbolic_func_dz = clad::differentiate(&HyperbolicSolid::distance_func, 2);\n",
+    "\n",
+    "    //double Nx = hyperbolic_func_dx.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "    //double Ny = hyperbolic_func_dy.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "    //double Nz = hyperbolic_func_dz.execute(*this, pt.x, pt.y, pt.z, p, r);\n",
+    "\n",
+    "    double Nx = distance_func_darg0(pt.x, pt.y, pt.z);\n",
+    "    double Ny = distance_func_darg1(pt.x, pt.y, pt.z);\n",
+    "    double Nz = distance_func_darg2(pt.x, pt.y, pt.z);\n",
+    "\n",
+    "    // nabla f of signed distance functions is always unit vector, we might\n",
+    "    // need to add support for nabla in clad.\n",
+    "    // If functions are normalized we can skip calling `.norm()`.\n",
+    "    return Vec(Nx, Ny, Nz).norm();\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "#ifdef TEST_TYPE_BY_NUM\n",
+    "  // by numeric approximation\n",
+    "  Vec normal(const Vec &pt) const override {\n",
+    "    double f  = hyperbolic_func(pt.x, pt.y, pt.z, p, r);\n",
+    "    double fx = hyperbolic_func(pt.x+eps, pt.y, pt.z, p, r);\n",
+    "    double fy = hyperbolic_func(pt.x, pt.y+eps, pt.z, p, r);\n",
+    "    double fz = hyperbolic_func(pt.x, pt.y, pt.z+eps, p, r);\n",
+    "\n",
+    "    return Vec((fx-f)/eps, (fy-f)/eps, (fz-f)/eps).norm();\n",
+    "  }\n",
+    "#endif\n",
+    "\n",
+    "  double distance_func(double x, double y, double z) const override {\n",
+    "    return pow((x-p.x)*cos_a+(z-p.z)*sin_a, 2./3.) + pow(y-p.y, 2./3.) + pow((x-p.x)*-sin_a+(z-p.z)*cos_a, 2./3.) - pow(r, 2./3.);\n",
+    "  }\n",
+    "};\n",
+    "\n",
+    "// Scene definition (Sphere: radius, position, emission, color, material)\n",
+    "Solid* scene[] = {\n",
+    "  new Sphere(1e5,  Vec(1e5+1, 40.8, 81.6),   Vec(), Vec(.75, .25, .25), DIFF), // Left\n",
+    "  new Sphere(1e5,  Vec(-1e5+99, 40.8, 81.6), Vec(), Vec(.25, .25, .75), DIFF), // Right\n",
+    "  new Sphere(1e5,  Vec(50, 40.8, 1e5),       Vec(), Vec(.75, .75, .75), DIFF), // Back\n",
+    "  new Sphere(1e5,  Vec(50, 40.8, -1e5+170),  Vec(), Vec(),              DIFF), // Front\n",
+    "  new Sphere(1e5,  Vec(50, 1e5, 81.6),       Vec(), Vec(.75, .75, .75), DIFF), // Bottm\n",
+    "  new Sphere(1e5,  Vec(50, -1e5+81.6, 81.6), Vec(), Vec(.75, .75, .75), DIFF), // Top\n",
+    "  new Sphere(16.5, Vec(27, 16.5, 47),        Vec(), Vec(1,1,1)*.999,    SPEC), //Mirror\n",
+    "  new Sphere(16.5, Vec(73, 16.5, 78),        Vec(), Vec(1, 1, 1)*.999,  REFR), // Glass\n",
+    "  new Sphere(600,  Vec(50, 681.6-.27, 81.6), Vec(12,12,12), Vec(),      DIFF)  // Light\n",
+    "};\n",
+    "\n",
+    "inline double clamp(double x) {\n",
+    "  return x<0 ? 0 : x>1 ? 1 : x;\n",
+    "}\n",
+    "\n",
+    "inline int toInt(double x) {\n",
+    "  return int(pow(clamp(x),1/2.2)*255+.5);\n",
+    "}\n",
+    "\n",
+    "inline bool intersect(const Ray &ray, double &t, int &id) {\n",
+    "  double d;\n",
+    "\n",
+    "  t = inf;\n",
+    "  for(int i=sizeof(scene)/sizeof(scene[0]); i--; ) {\n",
+    "    if ((d = scene[i]->intersect(ray)) && d<t) { t=d; id=i; }\n",
+    "  }\n",
+    "\n",
+    "  return t<inf;\n",
+    "}\n",
+    "\n",
+    "Vec radiance(const Ray &ray, int depth, unsigned short *Xi) {\n",
+    "  double t; // distance to intersection\n",
+    "  int id;  // id of intersected object\n",
+    "\n",
+    "  Ray r=ray;\n",
+    "\n",
+    "  // L0 = Le0 + f0*(L1)\n",
+    "  //    = Le0 + f0*(Le1 + f1*L2)\n",
+    "  //    = Le0 + f0*(Le1 + f1*(Le2 + f2*(L3))\n",
+    "  //    = Le0 + f0*(Le1 + f1*(Le2 + f2*(Le3 + f3*(L4)))\n",
+    "  //    = ...\n",
+    "  //    = Le0 + f0*Le1 + f0*f1*Le2 + f0*f1*f2*Le3 + f0*f1*f2*f3*Le4 + ...\n",
+    "  //\n",
+    "  // So:\n",
+    "  // F = 1\n",
+    "  // while (1) {\n",
+    "  //   L += F*Lei\n",
+    "  //   F *= fi\n",
+    "  // }\n",
+    "\n",
+    "  // accumulated color\n",
+    "  Vec cl(0,0,0);\n",
+    "  // accumulated reflectance\n",
+    "  Vec cf(1,1,1);\n",
+    "\n",
+    "  while (1) {\n",
+    "    // if miss, return accumulated color (black)\n",
+    "    if (!intersect(r, t, id)) return cl;\n",
+    "\n",
+    "    // the hit object\n",
+    "    const Solid &obj = *scene[id];\n",
+    "\n",
+    "    // calculate intersection point\n",
+    "    Vec x=r.o+r.d*t;\n",
+    "\n",
+    "    // calculate surface normal vector in point x\n",
+    "    Vec n=obj.normal(x);\n",
+    "    Vec nl=n*r.d<0 ? n : n*-1;\n",
+    "\n",
+    "    // normal map test\n",
+    "    //return n;\n",
+    "\n",
+    "    // object base color\n",
+    "    Vec f=obj.c;\n",
+    "    double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl\n",
+    "\n",
+    "    // object intersection id map test\n",
+    "    //return obj.c;\n",
+    "\n",
+    "    cl = cl + cf.mult(obj.e);\n",
+    "\n",
+    "    if (++depth>5 || !p) {\n",
+    "      if (erand48(Xi)<p) f=f*(1/p); else return cl;\n",
+    "    } // R.R.\n",
+    "\n",
+    "//    if (depth==0) return obj.c;\n",
+    "\n",
+    "    cf = cf.mult(f);\n",
+    "\n",
+    "    if (obj.refl == DIFF) { // Ideal DIFFUSE reflection\n",
+    "      double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);\n",
+    "      Vec w=nl, u=((fabs(w.x)>.1 ? Vec(0,1) : Vec(1))%w).norm(), v=w%u;\n",
+    "      Vec d = (u*cos(r1)*r2s+v*sin(r1)*r2s+w*sqrt(1-r2)).norm();\n",
+    "\n",
+    "      r = Ray(x, d);\n",
+    "      continue;\n",
+    "    } else if (obj.refl == SPEC) { // Ideal SPECULAR reflection\n",
+    "      //return obj.e + f.mult(radiance(Ray(x,r.d-n*2*(n*r.d)), depth, Xi));\n",
+    "      r = Ray(x, r.d-n*2*(n*r.d));\n",
+    "      continue;\n",
+    "    }\n",
+    "\n",
+    "    // Ideal dielectric REFRACTION\n",
+    "    Ray reflRay(x, r.d-n*2*(n*r.d));\n",
+    "    bool into = n*nl>0; // Ray from outside going in?\n",
+    "    double nc=1, nt=1.5, nnt=into ? nc/nt : nt/nc, ddn=r.d*nl, cos2t;\n",
+    "\n",
+    "    if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) { // Total internal reflection\n",
+    "      //return obj.e + f.mult(radiance(reflRay, depth, Xi));\n",
+    "      r = reflRay;\n",
+    "      continue;\n",
+    "    }\n",
+    "\n",
+    "    Vec tdir = (r.d*nnt-n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();\n",
+    "    double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c=1-(into?-ddn:tdir*n);\n",
+    "    double Re=R0+(1-R0)*c*c*c*c*c, Tr=1-Re, P=.25+.5*Re, RP=Re/P, TP=Tr/(1-P);\n",
+    "\n",
+    "    //return obj.e + f.mult(depth>2 ? // Russian roulette\n",
+    "    //  (erand48(Xi)<P ? radiance(reflRay, depth, Xi)*RP : radiance(Ray(x,tdir), depth, Xi)*TP) :\n",
+    "    //  (radiance(reflRay, depth, Xi)*Re + radiance(Ray(x,tdir), depth, Xi)*Tr) );\n",
+    "\n",
+    "    if (erand48(Xi)<P) {\n",
+    "      cf = cf*RP;\n",
+    "      r = reflRay;\n",
+    "    } else {\n",
+    "      cf = cf*TP;\n",
+    "      r = Ray(x,tdir);\n",
+    "    }\n",
+    "    continue;\n",
+    "  }\n",
+    "\n",
+    "}\n",
+    "\n",
+    "void render(int nb_samples = 40) {\n",
+    "\n",
+    "//  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples//\n",
+    "//  int w=512, h=384, samps = nb_samples; // # samples\n",
+    "  int w=256, h=192, samps = nb_samples; // # samples\n",
+    "\n",
+    "  int fr = 0;\n",
+    "  Ray cam(Vec(50,52,295.6-fr*10), Vec(0,-0.042612,-1).norm()); // cam pos, dir\n",
+    "  // Different camera setups.\n",
+    "  // Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir\n",
+    "  // Ray cam(Vec(80,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir\n",
+    "  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r;\n",
+    "  Vec *frame=new Vec[w*h];\n",
+    "\n",
+    "  #pragma omp parallel for schedule(dynamic, 1) private(r)\n",
+    "  for (unsigned short y=0; y<h; y++) { // Loop over image rows\n",
+    "    for (unsigned short x=0, Xi[3]={0,0,(unsigned short)(y*y*y)}; x<w; x++) { // Loop cols\n",
+    "      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++) { // 2x2 subpixel rows\n",
+    "        for (int sx=0; sx<2; sx++, r=Vec()) {     // 2x2 subpixel cols\n",
+    "          for (int s=0; s<samps; s++) {\n",
+    "            double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1 : 1-sqrt(2-r1);\n",
+    "            double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1 : 1-sqrt(2-r2);\n",
+    "            Vec d = cx*(((sx+.5+dx)/2+x)/w-.5)+cy*(((sy+.5+dy)/2+y)/h-.5)+cam.d;\n",
+    "            r = r + radiance(Ray(cam.o+d*140, d.norm()), 0, Xi)*(1./samps);\n",
+    "          } // Camera rays are pushed ^^^^^ forward to start in interior\n",
+    "          frame[i] = frame[i] + Vec(clamp(r.x), clamp(r.y), clamp(r.z))*.25;\n",
+    "        }\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  // Write image to PPM file.\n",
+    "  char filename[100];\n",
+    "  snprintf(filename, 100, \"image-%d.ppm\", fr);\n",
+    "  fprintf(stderr, \"\\rSave %s\\n\", filename);\n",
+    "  FILE *f = fopen(filename, \"wb\");\n",
+    "  fprintf(f, \"P6\\n%d %d\\n%d\\n\", w, h, 255);\n",
+    "  for (int i=0; i<w*h; i++)\n",
+    "    fprintf(f, \"%c%c%c\", toInt(frame[i].x), toInt(frame[i].y), toInt(frame[i].z));\n",
+    "  fclose(f);\n",
+    "    \n",
+    "  snprintf(filename, 100, \"image-%d.png\", fr);\n",
+    "  save(filename, w, h, frame);\n",
+    "}\n",
+    "\n",
+    "std::cout << \"Rendering... \"; std::cout.flush();\n",
+    "render(8);\n",
+    "std::cout << \"Ready!\" << std::endl;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cde4511b-65ed-4d69-99d5-ead1ebe8bca3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//#include \"xcpp/xdisplay.hpp\"\n",
+    "display(image(\"image-0.png\"));"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++17 (+Clad)",
+   "language": "C++17",
+   "name": "clad-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "C++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/cu-smallpt.ipynb b/notebooks/cu-smallpt.ipynb
new file mode 100644
index 00000000..997afbc9
--- /dev/null
+++ b/notebooks/cu-smallpt.ipynb
@@ -0,0 +1,1121 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c05462f5-b321-48b4-a6ad-a2a5258406d2",
+   "metadata": {},
+   "source": [
+    "# CUDA SmallPT Demo\n",
+    "\n",
+    "CUDA C++ modification of Kevin Baeson's [smallpt](http://www.kevinbeason.com/smallpt/).\n",
+    "\n",
+    "Adapted from https://github.com/matt77hias/cu-smallpt.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6b13691-e23d-49de-838b-04c78b348194",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Main helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "51e35d10-3770-4e9f-92dd-b07c0e2f5263",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include \"cuda_runtime.h\"\n",
+    "#include \"device_launch_parameters.h\"\n",
+    "\n",
+    "#include <algorithm>\n",
+    "#include <cmath>\n",
+    "#include <cstdint>\n",
+    "#include <limits>\n",
+    "#include <cstdio>\n",
+    "#include <cstdlib>\n",
+    "\n",
+    "namespace smallpt {\n",
+    "\n",
+    "  constexpr double g_pi = 3.14159265358979323846;\n",
+    "\n",
+    "  __host__ __device__ inline double Clamp(double v, \n",
+    "                      double low = 0.0, \n",
+    "                      double high = 1.0) noexcept\n",
+    "  {\n",
+    "    return fmin(fmax(v, low), high);\n",
+    "  }\n",
+    "\n",
+    "  inline std::uint8_t ToByte(double color, double gamma = 2.2) noexcept\n",
+    "  {\n",
+    "    const double gcolor = std::pow(color, 1.0 / gamma);\n",
+    "    return static_cast< std::uint8_t >(Clamp(255.0 * gcolor, 0.0, 255.0));\n",
+    "  }\n",
+    "\n",
+    "  inline void HandleError(cudaError_t err, const char* file, int line) {\n",
+    "    if (cudaSuccess != err) {\n",
+    "      std::printf(\"%s in %s at line %d\\n\", \n",
+    "            cudaGetErrorString(err), file, line);\n",
+    "      std::exit(EXIT_FAILURE);\n",
+    "    }\n",
+    "  }\n",
+    "}\n",
+    "\n",
+    "#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))\n",
+    "\n",
+    "#define HANDLE_NULL(a) {if (a == NULL) { \\\n",
+    "  std::printf(\"Host memory failed in %s at line %d\\n\", __FILE__, __LINE__); \\\n",
+    "    std::exit(EXIT_FAILURE);}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b44f208-5b50-45c5-a644-acf7d60552ff",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Vector helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6d4988ce-575f-42ba-a96d-be0eabc113d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//#include \"math.hpp\"\n",
+    "\n",
+    "namespace smallpt {\n",
+    "\n",
+    "  struct Vector3 {\n",
+    "\n",
+    "  public:\n",
+    "\n",
+    "    __host__ __device__ explicit Vector3(double xyz = 0.0) noexcept\n",
+    "      : Vector3(xyz, xyz, xyz) {}\n",
+    "    __host__ __device__ Vector3(double x, double y, double z) noexcept\n",
+    "      : m_x(x), m_y(y), m_z(z) {}\n",
+    "    Vector3(const Vector3& v) noexcept = default;\n",
+    "    Vector3(Vector3&& v) noexcept = default;\n",
+    "    ~Vector3() = default;\n",
+    "\n",
+    "    Vector3& operator=(const Vector3& v) = default;\n",
+    "    Vector3& operator=(Vector3&& v) = default;\n",
+    "\n",
+    "    __device__ bool HasNaNs() const {\n",
+    "      return std::isnan(m_x) || std::isnan(m_y) || std::isnan(m_z);\n",
+    "    }\n",
+    "\n",
+    "    __device__ const Vector3 operator-() const {\n",
+    "      return { -m_x, -m_y, -m_z };\n",
+    "    }\n",
+    "\n",
+    "    __device__ const Vector3 operator+(const Vector3& v) const {\n",
+    "      return { m_x + v.m_x, m_y + v.m_y, m_z + v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator-(const Vector3& v) const {\n",
+    "      return { m_x - v.m_x, m_y - v.m_y, m_z - v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator*(const Vector3& v) const {\n",
+    "      return { m_x * v.m_x, m_y * v.m_y, m_z * v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator/(const Vector3& v) const {\n",
+    "      return { m_x / v.m_x, m_y / v.m_y, m_z / v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator+(double a) const {\n",
+    "      return { m_x + a, m_y + a, m_z + a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator-(double a) const {\n",
+    "      return { m_x - a, m_y - a, m_z - a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator*(double a) const {\n",
+    "      return { m_x * a, m_y * a, m_z * a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator/(double a) const {\n",
+    "      const double inv_a = 1.0 / a;\n",
+    "      return { m_x * inv_a, m_y * inv_a, m_z * inv_a };\n",
+    "    }\n",
+    "\n",
+    "    __device__ Vector3& operator+=(const Vector3& v) {\n",
+    "      m_x += v.m_x;\n",
+    "      m_y += v.m_y;\n",
+    "      m_z += v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator-=(const Vector3& v) {\n",
+    "      m_x -= v.m_x;\n",
+    "      m_y -= v.m_y;\n",
+    "      m_z -= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator*=(const Vector3& v) {\n",
+    "      m_x *= v.m_x;\n",
+    "      m_y *= v.m_y;\n",
+    "      m_z *= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator/=(const Vector3& v) {\n",
+    "      m_x /= v.m_x;\n",
+    "      m_y /= v.m_y;\n",
+    "      m_z /= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator+=(double a) {\n",
+    "      m_x += a;\n",
+    "      m_y += a;\n",
+    "      m_z += a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator-=(double a) {\n",
+    "      m_x -= a;\n",
+    "      m_y -= a;\n",
+    "      m_z -= a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator*=(double a) {\n",
+    "      m_x *= a;\n",
+    "      m_y *= a;\n",
+    "      m_z *= a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator/=(double a) {\n",
+    "      const double inv_a = 1.0 / a;\n",
+    "      m_x *= inv_a;\n",
+    "      m_y *= inv_a;\n",
+    "      m_z *= inv_a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Dot(const Vector3& v) const {\n",
+    "      return m_x * v.m_x + m_y * v.m_y + m_z * v.m_z;\n",
+    "    }\n",
+    "    __device__ const Vector3 Cross(const Vector3& v) const {\n",
+    "      return {\n",
+    "        m_y * v.m_z - m_z * v.m_y,\n",
+    "        m_z * v.m_x - m_x * v.m_z,\n",
+    "        m_x * v.m_y - m_y * v.m_x\n",
+    "      };\n",
+    "    }\n",
+    "\n",
+    "    __device__ bool operator==(const Vector3& rhs) const {\n",
+    "      return m_x == rhs.m_x && m_y == rhs.m_y && m_z == rhs.m_z;\n",
+    "    }\n",
+    "    __device__ bool operator!=(const Vector3& rhs) const {\n",
+    "      return !(*this == rhs);\n",
+    "    }\n",
+    "\n",
+    "    __device__ double& operator[](std::size_t i) {\n",
+    "      return (&m_x)[i];\n",
+    "    }\n",
+    "    __device__ double operator[](std::size_t i) const {\n",
+    "      return (&m_x)[i];\n",
+    "    }\n",
+    "\n",
+    "    __device__ std::size_t MinDimension() const {\n",
+    "      return (m_x < m_y && m_x < m_z) ? 0u : ((m_y < m_z) ? 1u : 2u);\n",
+    "    }\n",
+    "    __device__ std::size_t MaxDimension() const {\n",
+    "      return (m_x > m_y && m_x > m_z) ? 0u : ((m_y > m_z) ? 1u : 2u);\n",
+    "    }\n",
+    "    __device__ double Min() const {\n",
+    "      return fmin(m_x, fmin(m_y, m_z));\n",
+    "    }\n",
+    "    __device__ double Max() const {\n",
+    "      return fmax(m_x, fmax(m_y, m_z));\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Norm2_squared() const {\n",
+    "      return m_x * m_x + m_y * m_y + m_z * m_z;\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Norm2() const {\n",
+    "      return std::sqrt(Norm2_squared());\n",
+    "    }\n",
+    "\n",
+    "    __device__ void Normalize() {\n",
+    "      const double a = 1.0 / Norm2();\n",
+    "      m_x *= a;\n",
+    "      m_y *= a;\n",
+    "      m_z *= a;\n",
+    "    }\n",
+    "\n",
+    "    double m_x, m_y, m_z;\n",
+    "  };\n",
+    "\n",
+    "  __device__ inline const Vector3 operator+(double a, const Vector3& v) {\n",
+    "    return { a + v.m_x, a + v.m_y, a + v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator-(double a, const Vector3& v) {\n",
+    "    return { a - v.m_x, a - v.m_y, a - v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator*(double a, const Vector3& v) {\n",
+    "    return { a * v.m_x, a * v.m_y, a * v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator/(double a, const Vector3& v) {\n",
+    "    return { a / v.m_x, a / v.m_y, a / v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Sqrt(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::sqrt(v.m_x),\n",
+    "      std::sqrt(v.m_y),\n",
+    "      std::sqrt(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Pow(const Vector3& v, double a) {\n",
+    "    return {\n",
+    "      std::pow(v.m_x, a),\n",
+    "      std::pow(v.m_y, a),\n",
+    "      std::pow(v.m_z, a)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Abs(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::abs(v.m_x),\n",
+    "      std::abs(v.m_y),\n",
+    "      std::abs(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Min(const Vector3& v1, const Vector3& v2) {\n",
+    "    return {\n",
+    "      fmin(v1.m_x, v2.m_x),\n",
+    "      fmin(v1.m_y, v2.m_y),\n",
+    "      fmin(v1.m_z, v2.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Max(const Vector3& v1, const Vector3& v2) {\n",
+    "    return {\n",
+    "      fmax(v1.m_x, v2.m_x),\n",
+    "      fmax(v1.m_y, v2.m_y),\n",
+    "      fmax(v1.m_z, v2.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Round(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::round(v.m_x),\n",
+    "      std::round(v.m_y),\n",
+    "      std::round(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Floor(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::floor(v.m_x),\n",
+    "      std::floor(v.m_y),\n",
+    "      std::floor(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Ceil(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::ceil(v.m_x),\n",
+    "      std::ceil(v.m_y),\n",
+    "      std::ceil(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Trunc(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::trunc(v.m_x),\n",
+    "      std::trunc(v.m_y),\n",
+    "      std::trunc(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Clamp(const Vector3& v, \n",
+    "                      double low = 0.0, \n",
+    "                      double high = 1.0) {\n",
+    "    return {\n",
+    "      Clamp(v.m_x, low, high),\n",
+    "      Clamp(v.m_y, low, high),\n",
+    "      Clamp(v.m_z, low, high) }\n",
+    "    ;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Lerp(double a, \n",
+    "                     const Vector3& v1, \n",
+    "                     const Vector3& v2) {\n",
+    "    return v1 + a * (v2 - v1);\n",
+    "  }\n",
+    "\n",
+    "  template< std::size_t X, std::size_t Y, std::size_t Z >\n",
+    "  __device__ inline const Vector3 Permute(const Vector3& v) {\n",
+    "    return { v[X], v[Y], v[Z] };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Normalize(const Vector3& v) {\n",
+    "    const double a = 1.0 / v.Norm2();\n",
+    "    return a * v;\n",
+    "  }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b06a4dd-381d-491c-a963-60066108944c",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Image IO helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b05a70cb-e401-4b1b-b5d3-9ee911ec4da9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//#include \"vector.hpp\"\n",
+    "#include <cstdio>\n",
+    "\n",
+    "#ifdef __unix\n",
+    "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n",
+    "#endif\n",
+    "\n",
+    "namespace smallpt {\n",
+    "\n",
+    "  inline void WritePPM(std::uint32_t w, \n",
+    "             std::uint32_t h, \n",
+    "             const Vector3* Ls, \n",
+    "             const char* fname = \"cu-image.ppm\") noexcept {\n",
+    "    \n",
+    "    FILE* fp;\n",
+    "    \n",
+    "    fopen_s(&fp, fname, \"w\");\n",
+    "    \n",
+    "    std::fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n",
+    "    for (std::size_t i = 0; i < w * h; ++i) {\n",
+    "      std::fprintf(fp, \"%u %u %u \", \n",
+    "             ToByte(Ls[i].m_x), \n",
+    "             ToByte(Ls[i].m_y), \n",
+    "             ToByte(Ls[i].m_z));\n",
+    "    }\n",
+    "    \n",
+    "    std::fclose(fp);\n",
+    "  }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c1a91c9d-a269-431e-9a8b-3620e6ceafe1",
+   "metadata": {},
+   "source": [
+    "### SV PNG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "9cf31b05-61a6-486d-a584-792e45ee30d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "/*\n",
+    "Copyright (C) 2017 Milo Yip. All rights reserved.\n",
+    "\n",
+    "Redistribution and use in source and binary forms, with or without\n",
+    "modification, are permitted provided that the following conditions are met:\n",
+    "\n",
+    "* Redistributions of source code must retain the above copyright notice, this\n",
+    "  list of conditions and the following disclaimer.\n",
+    "\n",
+    "* Redistributions in binary form must reproduce the above copyright notice,\n",
+    "  this list of conditions and the following disclaimer in the documentation\n",
+    "  and/or other materials provided with the distribution.\n",
+    "\n",
+    "* Neither the name of pngout nor the names of its\n",
+    "  contributors may be used to endorse or promote products derived from\n",
+    "  this software without specific prior written permission.\n",
+    "\n",
+    "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n",
+    "AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n",
+    "IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n",
+    "DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n",
+    "FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n",
+    "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n",
+    "SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n",
+    "CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n",
+    "OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n",
+    "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n",
+    "*/\n",
+    "\n",
+    "/*! \\file\n",
+    "    \\brief      svpng() is a minimalistic C function for saving RGB/RGBA image into uncompressed PNG.\n",
+    "    \\author     Milo Yip\n",
+    "    \\version    0.1.1\n",
+    "    \\copyright  MIT license\n",
+    "    \\sa         http://github.com/miloyip/svpng\n",
+    "*/\n",
+    "\n",
+    "///#ifndef SVPNG_INC_\n",
+    "///#define SVPNG_INC_\n",
+    "\n",
+    "/*! \\def SVPNG_LINKAGE\n",
+    "    \\brief User customizable linkage for svpng() function.\n",
+    "    By default this macro is empty.\n",
+    "    User may define this macro as static for static linkage, \n",
+    "    and/or inline in C99/C++, etc.\n",
+    "*/\n",
+    "///#ifndef SVPNG_LINKAGE\n",
+    "#define SVPNG_LINKAGE\n",
+    "///#endif\n",
+    "\n",
+    "/*! \\def SVPNG_OUTPUT\n",
+    "    \\brief User customizable output stream.\n",
+    "    By default, it uses C file descriptor and fputc() to output bytes.\n",
+    "    In C++, for example, user may use std::ostream or std::vector instead.\n",
+    "*/\n",
+    "///#ifndef SVPNG_OUTPUT\n",
+    "#include <stdio.h>\n",
+    "#define SVPNG_OUTPUT FILE* fp\n",
+    "///#endif\n",
+    "\n",
+    "/*! \\def SVPNG_PUT\n",
+    "    \\brief Write a byte\n",
+    "*/\n",
+    "///#ifndef SVPNG_PUT\n",
+    "#define SVPNG_PUT(u) fputc(u, fp)\n",
+    "///#endif\n",
+    "\n",
+    "\n",
+    "/*!\n",
+    "    \\brief Save a RGB/RGBA image in PNG format.\n",
+    "    \\param SVPNG_OUTPUT Output stream (by default using file descriptor).\n",
+    "    \\param w Width of the image. (<16383)\n",
+    "    \\param h Height of the image.\n",
+    "    \\param img Image pixel data in 24-bit RGB or 32-bit RGBA format.\n",
+    "    \\param alpha Whether the image contains alpha channel.\n",
+    "*/\n",
+    "SVPNG_LINKAGE void svpng(SVPNG_OUTPUT, unsigned w, unsigned h, const unsigned char* img, int alpha) {\n",
+    "    static const unsigned t[] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, \n",
+    "    /* CRC32 Table */    0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };\n",
+    "    unsigned a = 1, b = 0, c, p = w * (alpha ? 4 : 3) + 1, x, y, i;   /* ADLER-a, ADLER-b, CRC, pitch */\n",
+    "#define SVPNG_U8A(ua, l) for (i = 0; i < l; i++) SVPNG_PUT((ua)[i]);\n",
+    "#define SVPNG_U32(u) do { SVPNG_PUT((u) >> 24); SVPNG_PUT(((u) >> 16) & 255); SVPNG_PUT(((u) >> 8) & 255); SVPNG_PUT((u) & 255); } while(0)\n",
+    "#define SVPNG_U8C(u) do { SVPNG_PUT(u); c ^= (u); c = (c >> 4) ^ t[c & 15]; c = (c >> 4) ^ t[c & 15]; } while(0)\n",
+    "#define SVPNG_U8AC(ua, l) for (i = 0; i < l; i++) SVPNG_U8C((ua)[i])\n",
+    "#define SVPNG_U16LC(u) do { SVPNG_U8C((u) & 255); SVPNG_U8C(((u) >> 8) & 255); } while(0)\n",
+    "#define SVPNG_U32C(u) do { SVPNG_U8C((u) >> 24); SVPNG_U8C(((u) >> 16) & 255); SVPNG_U8C(((u) >> 8) & 255); SVPNG_U8C((u) & 255); } while(0)\n",
+    "#define SVPNG_U8ADLER(u) do { SVPNG_U8C(u); a = (a + (u)) % 65521; b = (b + a) % 65521; } while(0)\n",
+    "#define SVPNG_BEGIN(s, l) do { SVPNG_U32(l); c = ~0U; SVPNG_U8AC(s, 4); } while(0)\n",
+    "#define SVPNG_END() SVPNG_U32(~c)\n",
+    "    SVPNG_U8A(\"\\x89PNG\\r\\n\\32\\n\", 8);           /* Magic */\n",
+    "    SVPNG_BEGIN(\"IHDR\", 13);                    /* IHDR chunk { */\n",
+    "    SVPNG_U32C(w); SVPNG_U32C(h);               /*   Width & Height (8 bytes) */\n",
+    "    SVPNG_U8C(8); SVPNG_U8C(alpha ? 6 : 2);     /*   Depth=8, Color=True color with/without alpha (2 bytes) */\n",
+    "    SVPNG_U8AC(\"\\0\\0\\0\", 3);                    /*   Compression=Deflate, Filter=No, Interlace=No (3 bytes) */\n",
+    "    SVPNG_END();                                /* } */\n",
+    "    SVPNG_BEGIN(\"IDAT\", 2 + h * (5 + p) + 4);   /* IDAT chunk { */\n",
+    "    SVPNG_U8AC(\"\\x78\\1\", 2);                    /*   Deflate block begin (2 bytes) */\n",
+    "    for (y = 0; y < h; y++) {                   /*   Each horizontal line makes a block for simplicity */\n",
+    "        SVPNG_U8C(y == h - 1);                  /*   1 for the last block, 0 for others (1 byte) */\n",
+    "        SVPNG_U16LC(p); SVPNG_U16LC(~p);        /*   Size of block in little endian and its 1's complement (4 bytes) */\n",
+    "        SVPNG_U8ADLER(0);                       /*   No filter prefix (1 byte) */\n",
+    "        for (x = 0; x < p - 1; x++, img++)\n",
+    "            SVPNG_U8ADLER(*img);                /*   Image pixel data */\n",
+    "    }\n",
+    "    SVPNG_U32C((b << 16) | a);                  /*   Deflate block end with adler (4 bytes) */\n",
+    "    SVPNG_END();                                /* } */\n",
+    "    SVPNG_BEGIN(\"IEND\", 0); SVPNG_END();        /* IEND chunk {} */\n",
+    "}\n",
+    "\n",
+    "///#endif /* SVPNG_INC_ */"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29109b8c-ac5e-4826-aa48-6e167a48c059",
+   "metadata": {},
+   "source": [
+    "### Image helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "de44523b-d34f-4503-9796-1e124d20eea5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int toInt(float x)\n",
+    "{\n",
+    "    return (int) (pow(smallpt::Clamp(x, 0.0f, 1.0f), 1.0f / 2.2f) * 255 + 0.5f);\n",
+    "}\n",
+    "\n",
+    "void save(const char* fileName, int width, int height, smallpt::Vector3* data)\n",
+    "{\n",
+    "    FILE *fp = fopen(fileName, \"wb\");\n",
+    "\n",
+    "    // Convert from float3 array to uchar array\n",
+    "    unsigned char* output = new unsigned char[width * height * 3];\n",
+    "\n",
+    "    for(int i = 0; i < width * height; i++)\n",
+    "    {\n",
+    "        //printf_s(\"%f %f %f \\n\", data[i].x, data[i].y, data[i].z);\n",
+    "        output[i * 3 + 0] = toInt(data[i].m_x);\n",
+    "        output[i * 3 + 1] = toInt(data[i].m_y);\n",
+    "        output[i * 3 + 2] = toInt(data[i].m_z);\n",
+    "    }\n",
+    "\n",
+    "    svpng(fp, width, height, output, 0);\n",
+    "    fclose(fp);\n",
+    "    delete[] output;\n",
+    "}\n",
+    "\n",
+    "//#include \"xeus/xjson.hpp\"\n",
+    "/*\n",
+    "inline std::string base64encode(const std::string& input)\n",
+    "{\n",
+    "  std::string output;\n",
+    "  int val = 0;\n",
+    "  int valb = -6;\n",
+    "  for (char sc : input)\n",
+    "  {\n",
+    "    unsigned char c = static_cast<unsigned char>(sc);\n",
+    "    val = (val << 8) + c;\n",
+    "    valb += 8;\n",
+    "    while (valb >= 0) {\n",
+    "        output.push_back(\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\"[(val >> valb) & 0x3F]);\n",
+    "        valb -= 6;\n",
+    "    }\n",
+    "  }\n",
+    "  if (valb > -6) {\n",
+    "      output.push_back(\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\"[((val << 8) >> (valb + 8)) & 0x3F]);\n",
+    "  }\n",
+    "  while (output.size() % 4) {\n",
+    "      output.push_back('=');\n",
+    "  }\n",
+    "  return output;\n",
+    "}\n",
+    "*/\n",
+    "/*\n",
+    "// display image in the notebook\n",
+    "void display_image(std::vector<unsigned char> &image, bool clear_ouput){\n",
+    "    // memory objects for output in the web browser\n",
+    "    std::stringstream buffer;\n",
+    "    xeus::xjson mine;\n",
+    "\n",
+    "    if(clear_ouput)\n",
+    "        xeus::get_interpreter().clear_output(true);\n",
+    "\n",
+    "    buffer.str(\"\");\n",
+    "    for(auto c : image){\n",
+    "        buffer << c;\n",
+    "    }\n",
+    "\n",
+    "    mine[\"image/png\"] = xtl::base64encode(buffer.str());\n",
+    "    xeus::get_interpreter().display_data(\n",
+    "        std::move(mine),\n",
+    "        xeus::xjson::object(),\n",
+    "        xeus::xjson::object());\n",
+    "}\n",
+    "*/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "caa7fb92-d64d-45b9-8d4f-9f9d81e75bfa",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# CUDA SmallPT main"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cd12e5f-59ed-4367-8d22-65838179dc33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//#include \"imageio.hpp\"\n",
+    "//#include \"sampling.cuh\"\n",
+    "//#include \"specular.cuh\"\n",
+    "//#include \"sphere.hpp\"\n",
+    "\n",
+    "//#include \"cuda_tools.hpp\"\n",
+    "#include \"curand_kernel.h\"\n",
+    "\n",
+    "#define REFRACTIVE_INDEX_OUT 1.0\n",
+    "#define REFRACTIVE_INDEX_IN  1.5\n",
+    "#define EPSILON_SPHERE 1e-4\n",
+    "\n",
+    "namespace smallpt {\n",
+    "\n",
+    "  struct __align__(16)Ray {\n",
+    "    __device__ explicit Ray(Vector3 o, \n",
+    "                Vector3 d, \n",
+    "                double tmin = 0.0, \n",
+    "                double tmax = std::numeric_limits< double >::infinity(), \n",
+    "                std::uint32_t depth = 0u) noexcept\n",
+    "      : m_o(std::move(o)),\n",
+    "      m_d(std::move(d)),\n",
+    "      m_tmin(tmin),\n",
+    "      m_tmax(tmax),\n",
+    "      m_depth(depth) {};\n",
+    "    Ray(const Ray& ray) noexcept = default;\n",
+    "    Ray(Ray&& ray) noexcept = default;\n",
+    "    ~Ray() = default;\n",
+    "\n",
+    "    Ray& operator=(const Ray& ray) = default;\n",
+    "    Ray& operator=(Ray&& ray) = default;\n",
+    "\n",
+    "    __device__ const Vector3 operator()(double t) const {\n",
+    "      return m_o + m_d * t;\n",
+    "    }\n",
+    "\n",
+    "    Vector3 m_o, m_d;\n",
+    "    mutable double m_tmin, m_tmax;\n",
+    "    std::uint32_t m_depth;\n",
+    "  };\n",
+    "\n",
+    "  enum struct Reflection_t {\n",
+    "    Diffuse,\n",
+    "    Specular,\n",
+    "    Refractive\n",
+    "  };\n",
+    "\n",
+    "  struct __align__(16) Sphere {\n",
+    "    __host__ __device__ explicit Sphere(double r,\n",
+    "                      Vector3 p,\n",
+    "                      Vector3 e,\n",
+    "                      Vector3 f,\n",
+    "                      Reflection_t reflection_t) noexcept\n",
+    "      : m_r(r),\n",
+    "      m_p(std::move(p)),\n",
+    "      m_e(std::move(e)),\n",
+    "      m_f(std::move(f)),\n",
+    "      m_reflection_t(reflection_t) {}\n",
+    "    Sphere(const Sphere& sphere) noexcept = default;\n",
+    "    Sphere(Sphere&& sphere) noexcept = default;\n",
+    "    ~Sphere() = default;\n",
+    "\n",
+    "    Sphere& operator=(const Sphere& sphere) = default;\n",
+    "    Sphere& operator=(Sphere&& sphere) = default;\n",
+    "    \n",
+    "    __device__ bool Intersect(const Ray& ray) const {\n",
+    "      const Vector3 op = m_p - ray.m_o;\n",
+    "      const double dop = ray.m_d.Dot(op);\n",
+    "      const double D = dop * dop - op.Dot(op) + m_r * m_r;\n",
+    "\n",
+    "      if (0.0 > D) {\n",
+    "        return false;\n",
+    "      }\n",
+    "\n",
+    "      const double sqrtD = sqrt(D);\n",
+    "\n",
+    "      const double tmin = dop - sqrtD;\n",
+    "      if (ray.m_tmin < tmin && tmin < ray.m_tmax) {\n",
+    "        ray.m_tmax = tmin;\n",
+    "        return true;\n",
+    "      }\n",
+    "\n",
+    "      const double tmax = dop + sqrtD;\n",
+    "      if (ray.m_tmin < tmax && tmax < ray.m_tmax) {\n",
+    "        ray.m_tmax = tmax;\n",
+    "        return true;\n",
+    "      }\n",
+    "\n",
+    "      return false;\n",
+    "    }\n",
+    "\n",
+    "    double m_r;\n",
+    "    Vector3 m_p; // position\n",
+    "    Vector3 m_e; // emission\n",
+    "    Vector3 m_f; // reflection\n",
+    "    Reflection_t m_reflection_t;\n",
+    "  };\n",
+    "    \n",
+    "  const Sphere g_spheres[] = {\n",
+    "    Sphere(1e5,  Vector3(1e5 + 1, 40.8, 81.6),   Vector3(),   Vector3(0.75,0.25,0.25), Reflection_t::Diffuse),    //Left\n",
+    "    Sphere(1e5,  Vector3(-1e5 + 99, 40.8, 81.6), Vector3(),   Vector3(0.25,0.25,0.75), Reflection_t::Diffuse),    //Right\n",
+    "    Sphere(1e5,  Vector3(50, 40.8, 1e5),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Back\n",
+    "    Sphere(1e5,  Vector3(50, 40.8, -1e5 + 170),  Vector3(),   Vector3(),               Reflection_t::Diffuse),    //Front\n",
+    "    Sphere(1e5,  Vector3(50, 1e5, 81.6),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Bottom\n",
+    "    Sphere(1e5,  Vector3(50, -1e5 + 81.6, 81.6), Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Top\n",
+    "    Sphere(16.5, Vector3(27, 16.5, 47),          Vector3(),   Vector3(0.999),          Reflection_t::Specular),   //Mirror\n",
+    "    Sphere(16.5, Vector3(73, 16.5, 78),          Vector3(),   Vector3(0.999),          Reflection_t::Refractive), //Glass\n",
+    "    Sphere(600,  Vector3(50, 681.6 - .27, 81.6), Vector3(12), Vector3(),               Reflection_t::Diffuse)     //Light\n",
+    "  };\n",
+    "\n",
+    "  __device__ inline Vector3 UniformSampleOnHemisphere(double u1,  double u2) {\n",
+    "    // u1 := cos_theta\n",
+    "    const double sin_theta = std::sqrt(fmax(0.0, 1.0 - u1 * u1));\n",
+    "    const double phi = 2.0 * g_pi * u2;\n",
+    "    return {\n",
+    "      std::cos(phi) * sin_theta,\n",
+    "      std::sin(phi) * sin_theta,\n",
+    "      u1\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline Vector3 CosineWeightedSampleOnHemisphere(double u1, double u2) {\n",
+    "    const double cos_theta = sqrt(1.0 - u1);\n",
+    "    const double sin_theta = sqrt(u1);\n",
+    "    const double phi = 2.0 * g_pi * u2;\n",
+    "    return {\n",
+    "      std::cos(phi) * sin_theta,\n",
+    "      std::sin(phi) * sin_theta,\n",
+    "      cos_theta\n",
+    "    };\n",
+    "  }\n",
+    "    \n",
+    "  __device__ inline double Reflectance0(double n1, double n2) {\n",
+    "    const double sqrt_R0 = (n1 - n2) / (n1 + n2);\n",
+    "    return sqrt_R0 * sqrt_R0;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline double SchlickReflectance(double n1, \n",
+    "                        double n2, \n",
+    "                        double c) {\n",
+    "    const double R0 = Reflectance0(n1, n2);\n",
+    "    return R0 + (1.0 - R0) * c * c * c * c * c;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 IdealSpecularReflect(const Vector3& d, const Vector3& n) {\n",
+    "    return d - 2.0 * n.Dot(d) * n;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 IdealSpecularTransmit(const Vector3& d, \n",
+    "                              const Vector3& n, \n",
+    "                              double n_out, \n",
+    "                              double n_in, \n",
+    "                              double& pr, \n",
+    "                              curandState* state) {\n",
+    "    \n",
+    "    const Vector3 d_Re = IdealSpecularReflect(d, n);\n",
+    "\n",
+    "    const bool out_to_in = (0.0 > n.Dot(d));\n",
+    "    const Vector3 nl = out_to_in ? n : -n;\n",
+    "    const double nn = out_to_in ? n_out / n_in : n_in / n_out;\n",
+    "    const double cos_theta = d.Dot(nl);\n",
+    "    const double cos2_phi = 1.0 - nn * nn * (1.0 - cos_theta * cos_theta);\n",
+    "\n",
+    "    // Total Internal Reflection\n",
+    "    if (0.0 > cos2_phi) {\n",
+    "      pr = 1.0;\n",
+    "      return d_Re;\n",
+    "    }\n",
+    "\n",
+    "    const Vector3 d_Tr = Normalize(nn * d - nl * (nn * cos_theta + sqrt(cos2_phi)));\n",
+    "    const double c = 1.0 - (out_to_in ? -cos_theta : d_Tr.Dot(n));\n",
+    "\n",
+    "    const double Re = SchlickReflectance(n_out, n_in, c);\n",
+    "    const double p_Re = 0.25 + 0.5 * Re;\n",
+    "    if (curand_uniform_double(state) < p_Re) {\n",
+    "      pr = (Re / p_Re);\n",
+    "      return d_Re;\n",
+    "    }\n",
+    "    else {\n",
+    "      const double Tr = 1.0 - Re;\n",
+    "      const double p_Tr = 1.0 - p_Re;\n",
+    "      pr = (Tr / p_Tr);\n",
+    "      return d_Tr;\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline bool Intersect(const Sphere* dev_spheres, \n",
+    "                   std::size_t nb_spheres, \n",
+    "                   const Ray& ray, \n",
+    "                   size_t& id)\n",
+    "  {\n",
+    "    bool hit = false;\n",
+    "    for (std::size_t i = 0u; i < nb_spheres; ++i) {\n",
+    "      if (dev_spheres[i].Intersect(ray)) {\n",
+    "        hit = true;\n",
+    "        id  = i;\n",
+    "      }\n",
+    "    }\n",
+    "\n",
+    "    return hit;\n",
+    "  }\n",
+    "\n",
+    "  __device__ static Vector3 Radiance(const Sphere* dev_spheres, \n",
+    "                     std::size_t nb_spheres,\n",
+    "                     const Ray& ray, \n",
+    "                     curandState* state)\n",
+    "  {  \n",
+    "    Ray r = ray;\n",
+    "    Vector3 L;\n",
+    "    Vector3 F(1.0);\n",
+    "\n",
+    "    while (true) {\n",
+    "      std::size_t id;\n",
+    "      if (!Intersect(dev_spheres, nb_spheres, r, id)) {\n",
+    "        return L;\n",
+    "      }\n",
+    "\n",
+    "      const Sphere& shape = dev_spheres[id];\n",
+    "      const Vector3 p = r(r.m_tmax);\n",
+    "      const Vector3 n = Normalize(p - shape.m_p);\n",
+    "\n",
+    "      L += F * shape.m_e;\n",
+    "      F *= shape.m_f;\n",
+    "\n",
+    "      // Russian roulette\n",
+    "      if (4 < r.m_depth) {\n",
+    "        const double continue_probability = shape.m_f.Max();\n",
+    "        if (curand_uniform_double(state) >= continue_probability) {\n",
+    "          return L;\n",
+    "        }\n",
+    "        F /= continue_probability;\n",
+    "      }\n",
+    "\n",
+    "      // Next path segment\n",
+    "      switch (shape.m_reflection_t) {\n",
+    "      \n",
+    "      case Reflection_t::Specular: {\n",
+    "        const Vector3 d = IdealSpecularReflect(r.m_d, n);\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "        break;\n",
+    "      }\n",
+    "      \n",
+    "      case Reflection_t::Refractive: {\n",
+    "        double pr;\n",
+    "        const Vector3 d = IdealSpecularTransmit(r.m_d, n, REFRACTIVE_INDEX_OUT, REFRACTIVE_INDEX_IN, pr, state);\n",
+    "        F *= pr;\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "        break;\n",
+    "      }\n",
+    "      \n",
+    "      default: {\n",
+    "        const Vector3 w = (0.0 > n.Dot(r.m_d)) ? n : -n;\n",
+    "        const Vector3 u = Normalize((abs(w.m_x) > 0.1 ? Vector3(0.0, 1.0, 0.0) : Vector3(1.0, 0.0, 0.0)).Cross(w));\n",
+    "        const Vector3 v = w.Cross(u);\n",
+    "\n",
+    "        const Vector3 sample_d = CosineWeightedSampleOnHemisphere(curand_uniform_double(state), curand_uniform_double(state));\n",
+    "        const Vector3 d = Normalize(sample_d.m_x * u + sample_d.m_y * v + sample_d.m_z * w);\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "      }\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  __global__ static void kernel(const Sphere* dev_spheres, \n",
+    "                  std::size_t nb_spheres,\n",
+    "                  std::uint32_t w, \n",
+    "                  std::uint32_t h, \n",
+    "                  Vector3* Ls, \n",
+    "                  std::uint32_t nb_samples)\n",
+    "  {\n",
+    "    const std::uint32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n",
+    "    const std::uint32_t y = threadIdx.y + blockIdx.y * blockDim.y;\n",
+    "    const std::uint32_t offset = x + y * blockDim.x * gridDim.x;\n",
+    "\n",
+    "    if (x >= w || y >= h) {\n",
+    "      return;\n",
+    "    }\n",
+    "\n",
+    "    // RNG\n",
+    "    curandState state;\n",
+    "    curand_init(offset, 0u, 0u, &state);\n",
+    "\n",
+    "    const Vector3 eye = { 50.0, 52.0, 295.6 };\n",
+    "    const Vector3 gaze = Normalize(Vector3(0.0, -0.042612, -1.0));\n",
+    "    const double fov = 0.5135;\n",
+    "    const Vector3 cx = { w * fov / h, 0.0, 0.0 };\n",
+    "    const Vector3 cy = Normalize(cx.Cross(gaze)) * fov;\n",
+    "\n",
+    "    for (std::size_t sy = 0u, i = (h - 1u - y) * w + x; sy < 2u; ++sy) { // 2 subpixel row\n",
+    "\n",
+    "      for (std::size_t sx = 0u; sx < 2u; ++sx) { // 2 subpixel column\n",
+    "\n",
+    "        Vector3 L;\n",
+    "\n",
+    "        for (std::size_t s = 0u; s < nb_samples; ++s) { // samples per subpixel\n",
+    "          const double u1 = 2.0 * curand_uniform_double(&state);\n",
+    "          const double u2 = 2.0 * curand_uniform_double(&state);\n",
+    "          const double dx = (u1 < 1.0) ? sqrt(u1) - 1.0 : 1.0 - sqrt(2.0 - u1);\n",
+    "          const double dy = (u2 < 1.0) ? sqrt(u2) - 1.0 : 1.0 - sqrt(2.0 - u2);\n",
+    "          const Vector3 d = cx * (((sx + 0.5 + dx) * 0.5 + x) / w - 0.5) +\n",
+    "                          cy * (((sy + 0.5 + dy) * 0.5 + y) / h - 0.5) + gaze;\n",
+    "          \n",
+    "          L += Radiance(dev_spheres, nb_spheres, \n",
+    "            Ray(eye + d * 130, Normalize(d), EPSILON_SPHERE), &state) \n",
+    "            * (1.0 / nb_samples);\n",
+    "        }\n",
+    "        \n",
+    "        Ls[i] += 0.25 * Clamp(L);\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  static void Render(std::uint32_t nb_samples) noexcept\n",
+    "  {\n",
+    "    const std::uint32_t w = 256u; //1024u;\n",
+    "    const std::uint32_t h = 192u; //768u;\n",
+    "    const std::uint32_t nb_pixels = w * h;\n",
+    "\n",
+    "    // Set up device memory\n",
+    "    Sphere* dev_spheres;\n",
+    "    HANDLE_ERROR(cudaMalloc((void**)&dev_spheres, sizeof(g_spheres)));\n",
+    "    HANDLE_ERROR(cudaMemcpy(dev_spheres, g_spheres, sizeof(g_spheres), cudaMemcpyHostToDevice));\n",
+    "    Vector3* dev_Ls;\n",
+    "    HANDLE_ERROR(cudaMalloc((void**)&dev_Ls, nb_pixels * sizeof(Vector3)));\n",
+    "    HANDLE_ERROR(cudaMemset(dev_Ls, 0, nb_pixels * sizeof(Vector3)));\n",
+    "\n",
+    "    // Kernel execution\n",
+    "    const dim3 nblocks(w / 16u, h / 16u);\n",
+    "    const dim3 nthreads(16u, 16u);\n",
+    "    kernel<<<nblocks, nthreads>>>(dev_spheres, sizeof(g_spheres)/sizeof(g_spheres[0]),  w, h, dev_Ls, nb_samples);\n",
+    "\n",
+    "    // Set up host memory\n",
+    "    Vector3* Ls = (Vector3*)malloc(nb_pixels * sizeof(Vector3));\n",
+    "    // Transfer device -> host\n",
+    "    HANDLE_ERROR(cudaMemcpy(Ls, dev_Ls, nb_pixels * sizeof(Vector3), cudaMemcpyDeviceToHost));\n",
+    "\n",
+    "    // Clean up device memory\n",
+    "    HANDLE_ERROR(cudaFree(dev_Ls));\n",
+    "    HANDLE_ERROR(cudaFree(dev_spheres));\n",
+    "\n",
+    "    //WritePPM(w, h, Ls);\n",
+    "    save(\"cu-smallpt.png\", w, h, Ls);\n",
+    "\n",
+    "    // Clean up host memory\n",
+    "    free(Ls);\n",
+    "  }\n",
+    "}\n",
+    "\n",
+    "void devicePropertyPrint()\n",
+    "{\n",
+    "  int dev = 0;\n",
+    "  cudaDeviceProp devProp;\n",
+    "  if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n",
+    "  {\n",
+    "    printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n",
+    "    printf(\"Device compute capability: %i - %i\\n\", devProp.minor, devProp.major);\n",
+    "    printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n",
+    "    printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n",
+    "    printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n",
+    "    printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n",
+    "    printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n",
+    "    printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n",
+    "  }\n",
+    "}\n",
+    "\n",
+    "devicePropertyPrint();\n",
+    "const std::uint32_t nb_samples = 100;\n",
+    "smallpt::Render(nb_samples);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2e81c14-6630-4c0e-b8dd-8ee1f2ecfdd7",
+   "metadata": {},
+   "source": [
+    "<img src=\"/files/notebooks/cu-smallpt.png\" />"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec676c73-d492-494c-b089-85f7034cfe50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "from IPython.display import Image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d82489e-27ed-4324-a407-cc312bb84281",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "Image(filename='cu-smallpt.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae6231e8-105e-47f4-8eae-c6072b44a613",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <fstream>\n",
+    "#include \"nlohmann/json.hpp\"\n",
+    "#include \"xtl/xbase64.hpp\"\n",
+    "#include \"xeus/xinterpreter.hpp\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "777222e4-6175-408c-9e1b-b1c110f89377",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void show_image() {\n",
+    "    std::ifstream fin(\"cu-smallpt.png\", std::ios::binary);   \n",
+    "    std::stringstream m_buffer;\n",
+    "    m_buffer << fin.rdbuf();\n",
+    "    auto bundle = nlohmann::json::object();\n",
+    "    bundle[\"image/png\"] = xtl::base64encode(m_buffer.str());\n",
+    "    ////\n",
+    "    //bundle[\"text/html\"] = \"<p>test</p>\";\n",
+    "    //       std::string(\"<img src='data:image/png;base64,\")\n",
+    "    //       + xtl::base64encode(m_buffer.str()) +\n",
+    "    //        \"' width=256 height=192 />\";\n",
+    "    ////\n",
+    "    //printf(\"%s\\n\", bundle.dump().c_str());\n",
+    "    xeus::get_interpreter().display_data(std::move(bundle), nlohmann::json::object(), nlohmann::json::object());\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7369364-526f-4a4c-81c5-d0249e03b6ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_image();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14c154fe-9d18-4c4b-bc82-eeef5a310375",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CppInterOp (C++17)",
+   "language": "CUDA",
+   "name": "cppinterop-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "C++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/eigen_demo/cppyy_caas_tests.ipynb b/notebooks/eigen_demo/cppyy_caas_tests.ipynb
new file mode 100644
index 00000000..5144b5e0
--- /dev/null
+++ b/notebooks/eigen_demo/cppyy_caas_tests.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1170e2e2-0b77-4d70-afab-3069af8d0528",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include<iostream>\n",
+    "#include<vector>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9af566fb-a31f-4d0b-ad7b-69383b2b0940",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "namespace VoidPtrArray {\n",
+    "            typedef struct _name {\n",
+    "                _name() { p[0] = (void*)0x1; p[1] = (void*)0x2; p[2] = (void*)0x3; }\n",
+    "                void* p[3];\n",
+    "            } name;\n",
+    "        }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ce6a3448-50ee-483c-8c67-b3e3fd9f6c8a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "n = cppyy.gbl.VoidPtrArray.name()\n",
+    "assert n.p[0] == 0x1\n",
+    "assert n.p[1] == 0x2\n",
+    "assert n.p[2] == 0x3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "9dde9a47-48a7-4cae-90db-d489d61a1c51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "namespace VectorConstCharStar {\n",
+    "            std::vector<const char*> test = {\"hello\"};\n",
+    "        }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4c45ea0c-3634-43a3-bae9-c1972e6b4762",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import cppyy\n",
+    "ns = cppyy.gbl.VectorConstCharStar\n",
+    "\n",
+    "assert len(ns.test) == 1\n",
+    "assert ns.test[0] == \"hello\"\n",
+    "\n",
+    "ns.test.push_back(\"world\")\n",
+    "assert len(ns.test) == 2\n",
+    "assert ns.test[0] == \"hello\"\n",
+    "assert ns.test[1] == \"world\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d5699a3b-94ce-4b1a-96f5-2b3264e0cabb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "namespace ArrayLike {\n",
+    "struct __attribute__((__packed__)) Vector3f {\n",
+    "    float x, y, z;\n",
+    "}; }\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8bff7898-1265-4341-941e-3d1810b06abd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "N = 5\n",
+    "\n",
+    "v = cppyy.gbl.std.vector['ArrayLike::Vector3f'](N)\n",
+    "\n",
+    "for i in range(N):\n",
+    "    d = v[i]\n",
+    "    d.x, d.y, d.z = i, i*N, i*N**2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "bbc3c5a0-7b0b-46ae-a576-7e5721b0883a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "for i in range(N):\n",
+    "    d = v[i]\n",
+    "    assert d.x == float(i)\n",
+    "    assert d.y == float(i*N)\n",
+    "    assert d.z == float(i*N**2)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15727205-0392-4fbd-a7e5-8cdb73f37ab1",
+   "metadata": {},
+   "source": [
+    "#### STL-like class with preinc by-ref returns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "65ec9f7d-40a6-44c4-a944-6182ea7f8b02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "namespace PreIncrement {\n",
+    "        struct Token {\n",
+    "            int value;\n",
+    "        };\n",
+    "\n",
+    "struct Iterator {\n",
+    "    Token current;\n",
+    "\n",
+    "    bool operator!=(const Iterator& rhs) {\n",
+    "        return rhs.current.value != current.value; }\n",
+    "    const Token& operator*() { return current; }\n",
+    "    Iterator& operator++() {\n",
+    "        current.value++;\n",
+    "        return *this;\n",
+    "    }\n",
+    "};\n",
+    "\n",
+    "struct Stream {\n",
+    "    Iterator begin() { return Iterator(); }\n",
+    "    Iterator end() { return Iterator{10}; }\n",
+    "}; }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "c4ed4321-6b9b-4531-8d79-ee00f7d46e3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Traceback (most recent call last):\n",
+      "  File \"<string>\", line 5, in <module>\n",
+      "TypeError: PreIncrement::Iterator PreIncrement::Stream::beginIterator PreIncrement::Stream::begin() =>\n",
+      "    TypeError: unbound method Stream::begin must be called with a Stream instance as first argument\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import cppyy\n",
+    "ns = cppyy.gbl.PreIncrement\n",
+    "\n",
+    "stream = ns.Stream\n",
+    "stream.begin(stream)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CUDA (C++17)",
+   "language": "CUDA",
+   "name": "cuda-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/eigen_demo/eigen_demo.ipynb b/notebooks/eigen_demo/eigen_demo.ipynb
new file mode 100644
index 00000000..e36c9665
--- /dev/null
+++ b/notebooks/eigen_demo/eigen_demo.ipynb
@@ -0,0 +1,343 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6c9eda5c-bf2f-4bcd-8dee-bc7db08dd801",
+   "metadata": {},
+   "source": [
+    "### Here we include the **conda** installed library using Cppyy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b172dcbe-7a72-4d1a-8e70-5955dec48735",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting package metadata (current_repodata.json): ...working... done\n",
+      "Solving environment: ...working... done\n",
+      "\n",
+      "\n",
+      "==> WARNING: A newer version of conda exists. <==\n",
+      "  current version: 22.9.0\n",
+      "  latest version: 23.7.4\n",
+      "\n",
+      "Please update conda by running\n",
+      "\n",
+      "    $ conda update -n base -c conda-forge conda\n",
+      "\n",
+      "\n",
+      "\n",
+      "# All requested packages already installed.\n",
+      "\n",
+      "Retrieving notices: ...working... done\n"
+     ]
+    }
+   ],
+   "source": [
+    "!conda install -y -c conda-forge eigen"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "01d6e223-c6bc-4255-bd2f-a2deb68ea06d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <clang/Interpreter/CppInterOp.h>\n",
+    "Cpp::AddIncludePath(\"/opt/conda/envs/.venv/include\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22e7b3ed-747e-4bd5-8b89-4a6e5661dfbf",
+   "metadata": {},
+   "source": [
+    "### We are immediately able to use the `Eigen::Dense` namespaces in our C++ code cell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "815bd42b-024a-49c7-9557-fa5bfd8b4496",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include \"/opt/conda/envs/.venv/include/eigen3/Eigen/Dense\"\n",
+    "\n",
+    "typedef Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> MatrixXd;\n",
+    "\n",
+    "Eigen::Vector2d a;\n",
+    "Eigen::Vector3d b;\n",
+    "Eigen::Vector4d c;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0766e5c0-2d3b-4ae4-96bd-4a78f3b02cc2",
+   "metadata": {},
+   "source": [
+    "### We can define a templated class with a set of vector operations offered by Eigen "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "bca71004-dec4-4571-8a89-c9c9a54aaeee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template <typename VectorType>\n",
+    "class EigenOperations {\n",
+    "public:\n",
+    "    static VectorType PerformOperations(const VectorType& v) {\n",
+    "        VectorType result = v;\n",
+    "        \n",
+    "        // Using Eigen::Dense object methods\n",
+    "        \n",
+    "        result.normalize(); // eigen vector normalisation\n",
+    "\n",
+    "        double dot = v.dot(v); // dot product\n",
+    "        for (int i = 0; i < result.size(); i++) {\n",
+    "            result[i] += dot;\n",
+    "        }\n",
+    " \n",
+    "        double squaredNorm = v.squaredNorm(); // vector squared norm \n",
+    "        for (int i = 0; i < result.size(); i++) {\n",
+    "            result[i] -= squaredNorm;\n",
+    "        }\n",
+    "\n",
+    "        return result;\n",
+    "    }\n",
+    "};\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de00d39d-8156-4ecb-a932-924661e7b80e",
+   "metadata": {},
+   "source": [
+    "### Accessing the declared Eigen vectors on the Python side using Cppyy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "9b7e7fee-1edd-427c-a738-0cdf941200c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import cppyy\n",
+    "\n",
+    "a = cppyy.gbl.a\n",
+    "b = cppyy.gbl.b\n",
+    "c = cppyy.gbl.c\n",
+    "\n",
+    "def display_eigen(vec, name):\n",
+    "    print(\"Vector : \", name, \", Dimensions : \", len(vec))\n",
+    "    for x in vec:\n",
+    "        print(x)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07cfc277-4202-47b6-9859-dd4a651bd4a1",
+   "metadata": {},
+   "source": [
+    "#### Setting values to the C++ Eigen Vector"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "3775e315-652d-4dea-8729-e36549524b62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "for i in range(len(a)):\n",
+    "    a[i] = i - 1\n",
+    "\n",
+    "for i in range(len(b)):\n",
+    "    b[i] = i + 1\n",
+    "\n",
+    "for i in range(len(c)):\n",
+    "    c[i] = i - 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ca77d6ff-3428-4977-9072-9f2f2a904973",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vector :  A , Dimensions :  2\n",
+      "-1.0\n",
+      "0.0\n",
+      "Vector :  B , Dimensions :  3\n",
+      "1.0\n",
+      "2.0\n",
+      "3.0\n",
+      "Vector :  C , Dimensions :  4\n",
+      "-1.0\n",
+      "0.0\n",
+      "1.0\n",
+      "2.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "display_eigen(a, \"A\")\n",
+    "display_eigen(b, \"B\")\n",
+    "display_eigen(c, \"C\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8c29a51-9897-4e16-9f6e-bacf918129cb",
+   "metadata": {},
+   "source": [
+    "### Calling the Templated method of the C++ class from the Python side"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "a01b4c6d-88ea-4cb1-a17e-716e1d813b1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "res2d = cppyy.gbl.EigenOperations[\"Eigen::Vector2d\"].PerformOperations(a)\n",
+    "res3d = cppyy.gbl.EigenOperations[\"Eigen::Vector3d\"].PerformOperations(b)\n",
+    "res4d = cppyy.gbl.EigenOperations[\"Eigen::Vector4d\"].PerformOperations(c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "680cc0d4-3d7b-441c-8f71-3acdf1b9b3e2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n",
+      "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n",
+      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
+      "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n",
+      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n",
+      "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n",
+      "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n",
+      "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
+      "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "876beb2f-113e-4889-bd10-3e460d17181e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "origin = [0, 0]\n",
+    "\n",
+    "np_res2 = np.array(list(res2d))\n",
+    "np_res3 = np.array(list(res3d)) \n",
+    "np_res4 = np.array(list(res4d)) \n",
+    "\n",
+    "plt.quiver(*origin, *np_res2, color=['r'], scale=5)\n",
+    "plt.quiver(*origin, *np_res3, color=['b'], scale=5)\n",
+    "\n",
+    "\n",
+    "plt.savefig(\"test.jpg\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc77ec01-bd62-4a87-b2e1-76141623fc5c",
+   "metadata": {},
+   "source": [
+    "<img src=\"./test.jpg\"/>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4078ec69-0b4d-4350-8e6b-2d793bc27491",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Vector :  A , Dimensions :  2\n",
+      "-1.0\n",
+      "0.0\n",
+      "Vector :  B , Dimensions :  3\n",
+      "0.26726124191242384\n",
+      "0.5345224838248495\n",
+      "0.8017837257372733\n",
+      "Vector :  C , Dimensions :  4\n",
+      "-0.40824829046386313\n",
+      "0.0\n",
+      "0.40824829046386313\n",
+      "0.8164965809277263\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "display_eigen(res2d, \"A\")\n",
+    "display_eigen(res3d, \"B\")\n",
+    "display_eigen(res4d, \"C\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++17",
+   "language": "C++17",
+   "name": "xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/eigen_demo/test.jpg b/notebooks/eigen_demo/test.jpg
new file mode 100644
index 00000000..ff7a9422
Binary files /dev/null and b/notebooks/eigen_demo/test.jpg differ
diff --git a/notebooks/image_CUDA_demo/img_in.jpg b/notebooks/image_CUDA_demo/img_in.jpg
new file mode 100644
index 00000000..08fc8254
Binary files /dev/null and b/notebooks/image_CUDA_demo/img_in.jpg differ
diff --git a/notebooks/image_CUDA_demo/img_out.jpg b/notebooks/image_CUDA_demo/img_out.jpg
new file mode 100644
index 00000000..10ed414f
Binary files /dev/null and b/notebooks/image_CUDA_demo/img_out.jpg differ
diff --git a/notebooks/image_CUDA_demo/threshold_test.ipynb b/notebooks/image_CUDA_demo/threshold_test.ipynb
new file mode 100644
index 00000000..9e59283c
--- /dev/null
+++ b/notebooks/image_CUDA_demo/threshold_test.ipynb
@@ -0,0 +1,376 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9ba0128f-ef46-4b86-a4cd-02fba18d6516",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <iostream>\n",
+    "#include <cmath>\n",
+    "#include <cuda_runtime.h>\n",
+    "#include<vector>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c63a0cf7-c79b-4281-8b76-e0ccad6c7b93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void thresholdKernel(float* input, float* output, const int width, const int height) {\n",
+    "    const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;\n",
+    "    const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;\n",
+    "      if (row < height && col < width) {\n",
+    "        \n",
+    "    if(input[col + row * width] > 200)\n",
+    "        output[col + row * width] = input[col + row * width] * 2;\n",
+    "    else\n",
+    "        output[col + row * width] = input[col + row * width] * 0.4;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c7454511-0b12-4022-ab97-c73f50c3c1f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "const int width = 512;  \n",
+    "const int height = 512;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c639e359-7481-4fe0-8dfd-860885fd9044",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "float* h_input = new float[width * height];\n",
+    "float* h_output = new float[width * height];"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bf055a82-9dd8-4961-a271-b3e5a144c909",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: Pillow in /opt/conda/envs/.venv/lib/python3.10/site-packages (10.0.1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install Pillow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d7c9c2b7-45be-4d22-90f9-4d61042e353f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: numpy in /opt/conda/envs/.venv/lib/python3.10/site-packages (1.26.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install numpy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ac00db7b-867a-4ff0-8afa-2c13006b9f32",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n",
+      "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n",
+      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
+      "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n",
+      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n",
+      "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n",
+      "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n",
+      "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
+      "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "a872a8cd-117b-4238-a4ea-e31e9dac48f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "from PIL import Image\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "image = Image.open('img_in.jpg') \n",
+    "image = image.resize((512, 512))\n",
+    "\n",
+    "image_array = np.array(image)\n",
+    "if len(image_array.shape) == 3 and image_array.shape[2] == 3:\n",
+    "    image_array = np.array(image.convert('L'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6a1cd914-db44-4e34-a9fb-5d2100b298b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[126 126 125 ...  84  84  84]\n",
+      " [126 126 125 ...  84  84  84]\n",
+      " [127 126 126 ...  84  84  84]\n",
+      " ...\n",
+      " [ 58  55  63 ...  19  15   8]\n",
+      " [ 57  56  53 ...  23  17  16]\n",
+      " [ 55  50  58 ...   5  16  16]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%python\n",
+    "\n",
+    "print(image_array)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "7545b5c9-c901-4599-abde-3ae5ac17e160",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void displayImgArray(float* input) {\n",
+    "    for (int i = 0; i < 3; i++) {\n",
+    "        std::cout << input[i] << \" \"; \n",
+    "    }\n",
+    "\n",
+    "    std::cout << \" ... \";\n",
+    "\n",
+    "    for (int i = width * height - 3; i < width * height; i++) {\n",
+    "        std::cout << input[i] << \" \"; \n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "a3c74a47-399e-4454-89f6-f4e8b0376a68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import cppyy\n",
+    "\n",
+    "img_list = image_array.flatten().tolist()\n",
+    "img_vector = cppyy.gbl.std.vector['float'](img_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ab6c16bd-c44a-4e4e-985a-12c5ca3256a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "void setImg(const std::vector<float>& input) {\n",
+    "    if (h_input != nullptr) {\n",
+    "        delete[] h_input;\n",
+    "    }\n",
+    "\n",
+    "    h_input = new float[input.size()];\n",
+    "\n",
+    "    for (size_t i = 0; i < input.size(); i++) {\n",
+    "        h_input[i] = input[i]; // No casting needed\n",
+    "    }\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "a168bbab-fc67-41f5-85c1-f9980d75a00b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<float> getOutput() {\n",
+    "    std::vector<float> res;\n",
+    "    for (size_t i = 0; i < width * height; i++) {\n",
+    "        res.push_back(h_output[i]);\n",
+    "    }\n",
+    "    return res;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "2eec8b62-5500-45e4-9c02-9bc587288206",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "cppyy.gbl.setImg(img_vector)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "fd1bef06-d14e-4c82-8fa5-da14df696fe5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "126 126 125  ... 5 16 16 "
+     ]
+    }
+   ],
+   "source": [
+    "displayImgArray(h_input);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "ea90464f-ed8f-44f1-9336-99aa3dc85149",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "float* d_input;\n",
+    "float* d_output;\n",
+    "\n",
+    "cudaMalloc((void**)&d_input, width * height * sizeof(float));\n",
+    "cudaMalloc((void**)&d_output, width * height * sizeof(float));\n",
+    "\n",
+    "cudaMemcpy(d_input, h_input, width * height * sizeof(float), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "dim3 dimBlock(16, 16);\n",
+    "dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);\n",
+    "\n",
+    "thresholdKernel<<<dimGrid, dimBlock>>>(d_input, d_output, width, height);\n",
+    "\n",
+    "cudaMemcpy(h_output, d_output, width * height * sizeof(float), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "cudaFree(d_input);\n",
+    "cudaFree(d_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "7bf09ac1-ac6b-45e9-93c2-d4ae50897c2d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "50.4 50.4 50  ... 2 6.4 6.4 "
+     ]
+    }
+   ],
+   "source": [
+    "displayImgArray(h_output);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "3a97a53f-c2b7-4e24-a299-a6c0ec81b16e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<float> blurredRes = getOutput();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "2586f437-295d-4b33-b18e-6f28628a7f81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "k = np.array(cppyy.gbl.blurredRes, dtype = np.uint8)\n",
+    "k.resize(512, 512)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "a8104074-7398-4653-a81a-827dd81a16b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "Image.fromarray(k).save(\"img_out.jpg\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0d2e15b-5a5f-4ed8-8f17-3fd1f3fc4daf",
+   "metadata": {},
+   "source": [
+    "<img src=\"img_out.jpg\" align=right width=\"400\">\n",
+    "<img src=\"img_in.jpg\" align=left width=\"400\">"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CUDA (C++17)",
+   "language": "CUDA",
+   "name": "cuda-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/images/xeus-cpp.png b/notebooks/images/xeus-cpp.png
new file mode 100644
index 00000000..c5f9c11b
Binary files /dev/null and b/notebooks/images/xeus-cpp.png differ
diff --git a/notebooks/index.ipynb b/notebooks/index.ipynb
new file mode 100644
index 00000000..5222ed54
--- /dev/null
+++ b/notebooks/index.ipynb
@@ -0,0 +1,721 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![xeus-cpp](images/xeus-cpp.png)](https://github.com/compiler-research/xeus-cpp)\n",
+    "\n",
+    "A Jupyter kernel for C++ based on the `clang-repl` C++ interpreter and the `xeus` native implementation of the Jupyter protocol, xeus.\n",
+    "\n",
+    "- GitHub repository: https://github.com/compiler-research/xeus-cpp\n",
+    "- Online documentation: https://xeus-cpp.readthedocs.io/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage\n",
+    "\n",
+    "<div style=\"background: #efffed;\n",
+    "            border: 1px solid grey;\n",
+    "            margin: 8px 0 8px 0;\n",
+    "            text-align: center;\n",
+    "            padding: 8px; \">\n",
+    "    <i class=\"fa-play fa\" \n",
+    "       style=\"font-size: 40px;\n",
+    "              line-height: 40px;\n",
+    "              margin: 8px;\n",
+    "              color: #444;\">\n",
+    "    </i>\n",
+    "    <div>\n",
+    "    To run the selected code cell, hit <pre style=\"background: #efffed\">Shift + Enter</pre>\n",
+    "    </div>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Output and error streams\n",
+    "\n",
+    "`std::cout` and `std::cerr` are redirected to the notebook frontend."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "some output\n"
+     ]
+    }
+   ],
+   "source": [
+    "#include <iostream>\n",
+    "\n",
+    "std::cout << \"some output\" << std::endl;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "some error\n"
+     ]
+    }
+   ],
+   "source": [
+    "std::cerr << \"some error\" << std::endl;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <stdexcept>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "Standard Exception",
+     "evalue": "Unknown exception",
+     "output_type": "error",
+     "traceback": [
+      "Standard Exception Unknown exception"
+     ]
+    }
+   ],
+   "source": [
+    "throw std::runtime_error(\"Unknown exception\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Omitting the `;` in the last statement of a cell results in an output being printed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int j = 5;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//j"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Interpreting the C++ programming language\n",
+    "\n",
+    "`clang-repl` has a broad support of the features of C++. You can define functions, classes, templates, etc ..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "double sqr(double a)\n",
+    "{\n",
+    "    return a * a;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "double a = 2.5;\n",
+    "double asqr = sqr(a);\n",
+    "//// Fix:\n",
+    "//asqr"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Foo\n",
+    "{\n",
+    "public:\n",
+    "\n",
+    "    virtual ~Foo() {}\n",
+    "    \n",
+    "    virtual void print(double value) const\n",
+    "    {\n",
+    "        std::cout << \"Foo value = \" << value << std::endl;\n",
+    "    }\n",
+    "};"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Foo value = 1.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "Foo bar;\n",
+    "bar.print(1.2);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Polymorphism"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Bar : public Foo\n",
+    "{\n",
+    "public:\n",
+    "\n",
+    "    virtual ~Bar() {}\n",
+    "    \n",
+    "    virtual void print(double value) const\n",
+    "    {\n",
+    "        std::cout << \"Bar value = \" << 2 * value << std::endl;\n",
+    "    }\n",
+    "};"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bar value = 2.4\n"
+     ]
+    }
+   ],
+   "source": [
+    "Foo* bar2 = new Bar;\n",
+    "bar2->print(1.2);\n",
+    "delete bar2;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Templates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <typeinfo>\n",
+    "\n",
+    "template <class T>\n",
+    "class FooT\n",
+    "{\n",
+    "public:\n",
+    "    \n",
+    "    explicit FooT(const T& t) : m_t(t) {}\n",
+    "    \n",
+    "    void print() const\n",
+    "    {\n",
+    "        std::cout << typeid(T).name() << \" m_t = \" << m_t << std::endl;\n",
+    "    }\n",
+    "    \n",
+    "private:\n",
+    "    \n",
+    "    T m_t;\n",
+    "};"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "d m_t = 1.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "FooT<double> foot(1.2);\n",
+    "foot.print();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## C++11 / C++14 support"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Foo11\n",
+    "{\n",
+    "public:\n",
+    "    \n",
+    "    Foo11() { std::cout << \"Foo11 default constructor\" << std::endl; }\n",
+    "    Foo11(const Foo11&) { std::cout << \"Foo11 copy constructor\" << std::endl; }\n",
+    "    Foo11(Foo11&&) { std::cout << \"Foo11 move constructor\" << std::endl; }\n",
+    "};"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Foo11 default constructor\n",
+      "Foo11 copy constructor\n",
+      "Foo11 move constructor\n"
+     ]
+    }
+   ],
+   "source": [
+    "Foo11 f1;\n",
+    "Foo11 f2(f1);\n",
+    "Foo11 f3(std::move(f1));"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <vector>\n",
+    "\n",
+    "std::vector<int> v = { 1, 2, 3 };\n",
+    "auto iter = ++v.begin();\n",
+    "//// Fix:\n",
+    "//v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//*iter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "... and also lambda, universal references, `decltype`, etc ..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Documentation and completion\n",
+    "\n",
+    " - Documentation for types of the standard library is retrieved on cppreference.com.\n",
+    " - The quick-help feature can also be enabled for user-defined types and third-party libraries. More documentation on this feature is available at https://xeus-cpp.readthedocs.io/en/latest/inline_help.html.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//?std::vector"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using the `display_data` mechanism"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For a user-defined type `T`, the rich rendering in the notebook and JupyterLab can be enabled by by implementing the function `nl::json mime_bundle_repr(const T& im)`, which returns the JSON mime bundle for that type.\n",
+    "\n",
+    "More documentation on the rich display system of Jupyter and Xeus-cpp is available at https://xeus-cpp.readthedocs.io/en/latest/rich_display.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Image example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <string>\n",
+    "#include <fstream>\n",
+    "\n",
+    "#include \"nlohmann/json.hpp\"\n",
+    "\n",
+    "#include \"xtl/xbase64.hpp\"\n",
+    "\n",
+    "namespace nl = nlohmann;\n",
+    "\n",
+    "namespace im\n",
+    "{\n",
+    "    struct image\n",
+    "    {   \n",
+    "        inline image(const std::string& filename)\n",
+    "        {\n",
+    "            std::ifstream fin(filename, std::ios::binary);   \n",
+    "            m_buffer << fin.rdbuf();\n",
+    "        }\n",
+    "        \n",
+    "        std::stringstream m_buffer;\n",
+    "    };\n",
+    "    \n",
+    "    nl::json mime_bundle_repr(const image& i)\n",
+    "    {\n",
+    "        auto bundle = nl::json::object();\n",
+    "        bundle[\"image/png\"] = xtl::base64encode(i.m_buffer.str());\n",
+    "        return bundle;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "im::image marie(\"images/marie.png\");\n",
+    "//// Fix:\n",
+    "//marie"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Audio example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <string>\n",
+    "#include <fstream>\n",
+    "\n",
+    "#include \"nlohmann/json.hpp\"\n",
+    "\n",
+    "#include \"xtl/xbase64.hpp\"\n",
+    "\n",
+    "namespace nl = nlohmann;\n",
+    "\n",
+    "namespace au\n",
+    "{\n",
+    "    struct audio\n",
+    "    {   \n",
+    "        inline audio(const std::string& filename)\n",
+    "        {\n",
+    "            std::ifstream fin(filename, std::ios::binary);   \n",
+    "            m_buffer << fin.rdbuf();\n",
+    "        }\n",
+    "        \n",
+    "        std::stringstream m_buffer;\n",
+    "    };\n",
+    "    \n",
+    "    nl::json mime_bundle_repr(const audio& a)\n",
+    "    {\n",
+    "        auto bundle = nl::json::object();\n",
+    "        bundle[\"text/html\"] =\n",
+    "           std::string(\"<audio controls=\\\"controls\\\"><source src=\\\"data:audio/wav;base64,\")\n",
+    "           + xtl::base64encode(a.m_buffer.str()) +\n",
+    "            \"\\\" type=\\\"audio/wav\\\" /></audio>\";\n",
+    "        return bundle;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "au::audio drums(\"audio/audio.wav\");\n",
+    "//// Fix:\n",
+    "//drums"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//#include \"xcpp/xdisplay.hpp\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//xcpp::display(drums);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Update-display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <string>\n",
+    "//// Fix:\n",
+    "//#include \"xcpp/xdisplay.hpp\"\n",
+    "\n",
+    "#include \"nlohmann/json.hpp\"\n",
+    "\n",
+    "namespace nl = nlohmann;\n",
+    "\n",
+    "namespace ht\n",
+    "{\n",
+    "    struct html\n",
+    "    {   \n",
+    "        inline html(const std::string& content)\n",
+    "        {\n",
+    "            m_content = content;\n",
+    "        }\n",
+    "        std::string m_content;\n",
+    "    };\n",
+    "\n",
+    "    nl::json mime_bundle_repr(const html& a)\n",
+    "    {\n",
+    "        auto bundle = nl::json::object();\n",
+    "        bundle[\"text/html\"] = a.m_content;\n",
+    "        return bundle;\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "// A blue rectangle\n",
+    "//// Fix:\n",
+    "/*ht::html rect(R\"(\n",
+    "<div style='\n",
+    "    width: 90px;\n",
+    "    height: 50px;\n",
+    "    line-height: 50px;\n",
+    "    background-color: blue;\n",
+    "    color: white;\n",
+    "    text-align: center;'>\n",
+    "Original\n",
+    "</div>)\");*/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "//xcpp::display(rect, \"some_display_id\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "// Update the rectangle to be red\n",
+    "//// Fix:\n",
+    "/*rect.m_content = R\"(\n",
+    "<div style='\n",
+    "    width: 90px;\n",
+    "    height: 50px;\n",
+    "    line-height: 50px;\n",
+    "    background-color: red;\n",
+    "    color: white;\n",
+    "    text-align: center;'>\n",
+    "Updated\n",
+    "</div>)\";*/\n",
+    "\n",
+    "//xcpp::display(rect, \"some_display_id\", true);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clear output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <chrono>\n",
+    "#include <iostream>\n",
+    "#include <thread>\n",
+    "\n",
+    "//// Fix:\n",
+    "//#include \"xcpp/xdisplay.hpp\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "/*std::cout << \"hello\" << std::endl;\n",
+    "std::this_thread::sleep_for(std::chrono::seconds(1));\n",
+    "xcpp::clear_output();  // will flicker when replacing \"hello\" with \"goodbye\"\n",
+    "std::this_thread::sleep_for(std::chrono::seconds(1));\n",
+    "std::cout << \"goodbye\" << std::endl;*/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//// Fix:\n",
+    "/*std::cout << \"hello\" << std::endl;\n",
+    "std::this_thread::sleep_for(std::chrono::seconds(1));\n",
+    "xcpp::clear_output(true);  // prevents flickering\n",
+    "std::this_thread::sleep_for(std::chrono::seconds(1));\n",
+    "std::cout << \"goodbye\" << std::endl;*/"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++14",
+   "language": "C++14",
+   "name": "xcpp14"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/kalman_CUDA_demo/1D_KF_plot.png b/notebooks/kalman_CUDA_demo/1D_KF_plot.png
new file mode 100644
index 00000000..843b4035
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/1D_KF_plot.png differ
diff --git a/notebooks/kalman_CUDA_demo/assets/kf_diagram.png b/notebooks/kalman_CUDA_demo/assets/kf_diagram.png
new file mode 100644
index 00000000..a2b06567
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/kf_diagram.png differ
diff --git a/notebooks/kalman_CUDA_demo/assets/motion_illustration.png b/notebooks/kalman_CUDA_demo/assets/motion_illustration.png
new file mode 100644
index 00000000..be8de724
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/motion_illustration.png differ
diff --git a/notebooks/kalman_CUDA_demo/assets/x_graph.png b/notebooks/kalman_CUDA_demo/assets/x_graph.png
new file mode 100644
index 00000000..a50f9fe7
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/x_graph.png differ
diff --git a/notebooks/kalman_CUDA_demo/data/measurements.yml b/notebooks/kalman_CUDA_demo/data/measurements.yml
new file mode 100644
index 00000000..9e4e5ff6
--- /dev/null
+++ b/notebooks/kalman_CUDA_demo/data/measurements.yml
@@ -0,0 +1,146 @@
+data :
+- 1.04202710058
+- 1.10726790452
+- 1.2913511148
+- 1.48485250951
+- 1.72825901034
+- 1.74216489744
+- 2.11672039768
+- 2.14529225112
+- 2.16029641405
+- 2.21269371128
+- 2.57709350237
+- 2.6682215744
+- 2.51641839428
+- 2.76034056782
+- 2.88131780617
+- 2.88373786518
+- 2.9448468727
+- 2.82866600131
+- 3.0006601946
+- 3.12920591669
+- 2.858361783
+- 2.83808170354
+- 2.68975330958
+- 2.66533185589
+- 2.81613499531
+- 2.81003612051
+- 2.88321849354
+- 2.69789264832
+- 2.4342229249
+- 2.23464791825
+- 2.30278776224
+- 2.02069770395
+- 1.94393985809
+- 1.82498398739
+- 1.52526230354
+- 1.86967808173
+- 1.18073207847
+- 1.10729605087
+- 0.916168349913
+- 0.678547664519
+- 0.562381751596
+- 0.355468474885
+- -0.155607486619
+- -0.287198661013
+- -0.602973173813
+- -0.292648661013
+- -0.602973173813
+- -0.924197686613
+- -1.2563221994129998
+- -1.5993467122129998
+- -1.9532712250129998
+- -2.3180957378129996
+- -2.6938202506129993
+- -3.0804447634129994
+- -3.4779692762129994
+- -3.8863937890129994
+- -4.305718301812999
+- -4.735942814612999
+- -5.177067327412999
+- -5.629091840212999
+- -6.0920163530129985
+- -6.565840865812999
+- -7.0505653786129985
+- -7.546189891412999
+- -8.052714404212999
+- -8.570138917012999
+- -9.098463429812998
+- -9.637687942612999
+- -10.187812455412999
+- -10.748836968212998
+- -11.320761481013
+- -11.903585993813
+- -12.497310506613
+- -13.101935019413
+- -13.717459532213
+- -14.343884045013
+- -14.981208557813002
+- -15.629433070613002
+- -16.288557583413002
+- -16.958582096213004
+- -17.639506609013004
+- -18.331331121813005
+- -19.034055634613004
+- -19.747680147413003
+- -20.472204660213006
+- -21.207629173013007
+- -21.95395368581301
+- -22.71117819861301
+- -23.47930271141301
+- -24.25832722421301
+- -25.04825173701301
+- -25.849076249813013
+- -26.660800762613015
+- -27.483425275413015
+- -28.316949788213016
+- -29.161374301013016
+- -30.01669881381302
+- -30.88292332661302
+- -31.760047839413023
+- -32.64807235221303
+- -33.54699686501303
+- -34.45682137781303
+- -35.37754589061303
+- -36.30917040341303
+- -37.251694916213026
+- -38.20511942901303
+- -39.169443941813036
+- -40.14466845461304
+- -41.13079296741304
+- -42.12781748021305
+- -43.13574199301305
+- -44.15456650581305
+- -45.18429101861305
+- -46.22491553141305
+- -47.276440044213054
+- -48.33886455701305
+- -49.41218906981305
+- -50.49641358261306
+- -51.591538095413064
+- -52.69756260821307
+- -53.81448712101307
+- -54.94231163381308
+- -56.08103614661308
+- -57.23066065941308
+- -58.391185172213085
+- -59.562609685013086
+- -60.74493419781309
+- -61.93815871061309
+- -63.14228322341309
+- -64.35730773621309
+- -65.5832322490131
+- -66.82005676181309
+- -68.0677812746131
+- -69.3264057874131
+- -70.5959303002131
+- -71.8763548130131
+- -73.1676793258131
+- -74.46990383861309
+- -75.7830283514131
+- -77.1070528642131
+- -78.4419773770131
+- -79.78780188981311
+- -81.14452640261311
+- -82.51215091541312
+- -83.89067542821311
\ No newline at end of file
diff --git a/notebooks/kalman_CUDA_demo/kalman_gain_plot.png b/notebooks/kalman_CUDA_demo/kalman_gain_plot.png
new file mode 100644
index 00000000..f0db97cf
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/kalman_gain_plot.png differ
diff --git a/notebooks/kalman_CUDA_demo/position_graph.png b/notebooks/kalman_CUDA_demo/position_graph.png
new file mode 100644
index 00000000..bf77cc6c
Binary files /dev/null and b/notebooks/kalman_CUDA_demo/position_graph.png differ
diff --git a/notebooks/kalman_CUDA_demo/run_kf.ipynb b/notebooks/kalman_CUDA_demo/run_kf.ipynb
new file mode 100644
index 00000000..67db75e4
--- /dev/null
+++ b/notebooks/kalman_CUDA_demo/run_kf.ipynb
@@ -0,0 +1,1503 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0aa31d74-5546-4990-9dd4-c44b552443a3",
+   "metadata": {},
+   "source": [
+    "# Running a 1 Dimensional Kalman Filter using std::vector and CUDA"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e77cdc1-2b81-4fe1-b815-86d7e05f9dab",
+   "metadata": {},
+   "source": [
+    "## System model:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0461c44-8d17-4e89-86ff-e9216ee51513",
+   "metadata": {},
+   "source": [
+    "| 1D LTI Projectile Motion | Position Graph with time|\n",
+    "| ---------------------- | ---------------------- |\n",
+    "| ![motion](assets/motion_illustration.png) | ![dog](assets/x_graph.png) |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adc83b07-aa17-4bd7-8030-971ff8e16a5f",
+   "metadata": {},
+   "source": [
+    "## Goal : Estimate the value  of Gravitational Acceleration *(g)* based on position measurements of a 1D LTI projectile\n",
+    "\n",
+    "Kalman Filter operates by maintaining an ongoing estimate of the state of a system and updating this estimate based on new measurements and a prediction model. \n",
+    "\n",
+    "<img src = 'assets/kf_diagram.png'>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9f1f08b0-9d14-46c0-af75-4ea72af2d350",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <vector>\n",
+    "#include <cuda_runtime.h>\n",
+    "#include <device_launch_parameters.h>\n",
+    "#include <iostream>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0fbb643-fde6-4623-afd5-ee4069b025c8",
+   "metadata": {},
+   "source": [
+    "### Set up the `KalmanFilter` class and constructor\n",
+    "\n",
+    "  Here we create a Kalman filter with the specified matrices:\n",
+    "  \n",
+    "  - A - System dynamics matrix, here we model 1D motion with a *(Position x, Velocity v, Acceleration a)* model\n",
+    "  - C - Output matrix\n",
+    "  - Q - Covariance matrix of the process noise random variable *p(w) ~ N(0, Q)*\n",
+    "  - R - Covariance matrix of the measurement noise random variable *p(v) ~ N(0, R)*\n",
+    "  - P - Estimate error covariance\n",
+    " "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e1400114-f948-43b6-a0ba-2cdb566014f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class KalmanFilter {\n",
+    "\n",
+    "public:\n",
+    "\n",
+    "  /**\n",
+    "  * Create a Kalman filter with the specified matrices.\n",
+    "  *   A - System dynamics matrix\n",
+    "  *   C - Output matrix\n",
+    "  *   Q - Process noise covariance\n",
+    "  *   R - Measurement noise covariance\n",
+    "  *   P - Estimate error covariance\n",
+    "  */\n",
+    "  KalmanFilter(\n",
+    "      double dt,\n",
+    "      const std::vector<std::vector<double>>& A,\n",
+    "      const std::vector<std::vector<double>>& C,\n",
+    "      const std::vector<std::vector<double>>& Q,\n",
+    "      const std::vector<std::vector<double>>& R,\n",
+    "      const std::vector<std::vector<double>>& P\n",
+    "  );\n",
+    "\n",
+    "  /**\n",
+    "  * Create a blank estimator.\n",
+    "  */\n",
+    "  KalmanFilter();\n",
+    "\n",
+    "  /**\n",
+    "  * Initialize the filter with initial states as zero.\n",
+    "  */\n",
+    "  void init();\n",
+    "\n",
+    "  /**\n",
+    "  * Initialize the filter with a guess for initial states.\n",
+    "  */\n",
+    "  void init(double t0, const std::vector<double>& x0);\n",
+    "\n",
+    "  /**\n",
+    "  * Update the estimated state based on measured values. The\n",
+    "  * time step is assumed to remain constant.\n",
+    "  */\n",
+    "  std::vector<double> update(const std::vector<double>& y);\n",
+    "\n",
+    "  /**\n",
+    "  * Update the estimated state based on measured values,\n",
+    "  * using the given time step and dynamics matrix.\n",
+    "  */\n",
+    "  void update(const std::vector<double>& y, double dt, const std::vector<std::vector<double>>& A);\n",
+    "\n",
+    "  /**\n",
+    "  * Return the current state and time.\n",
+    "  */\n",
+    "  std::vector<double> state() { return x_hat; };\n",
+    "  double time() { return t; };\n",
+    "\n",
+    "private:\n",
+    "\n",
+    "  // Matrices for computation\n",
+    "  std::vector<std::vector<double>> A, C, Q, R, P, K, P0;\n",
+    "\n",
+    "  // System dimensions\n",
+    "  int m, n;\n",
+    "\n",
+    "  // Initial and current time\n",
+    "  double t0, t;\n",
+    "\n",
+    "  // Discrete time step\n",
+    "  double dt;\n",
+    "\n",
+    "  // Is the filter initialized?\n",
+    "  bool initialized;\n",
+    "\n",
+    "  // n-size identity matrix\n",
+    "  std::vector<std::vector<double>> I;\n",
+    "\n",
+    "  // State estimates\n",
+    "  std::vector<double> x_hat, x_hat_new;\n",
+    "};\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc67046f-5dcb-48c4-844b-b7f248cc8cea",
+   "metadata": {},
+   "source": [
+    "### Defining CUDA Kernel functions for parallel matrix and vector operations\n",
+    "\n",
+    "The algorithm operates in two stages of the ongoing discrete Kalman filter cycle:\n",
+    "\n",
+    " - Discrete Kalman filter time update equations:\n",
+    " \n",
+    "     $\\begin{aligned} & \\hat{x}_k^{-}=A \\hat{x}_{k-1} \\\\ & P_k^{-}=A P_{k-1} A^T+Q\\end{aligned}$\n",
+    "    \n",
+    "    They project the state and covariance estimates forward from time step *k* – 1 to step *k*.\n",
+    "    \n",
+    " - Discrete Kalman filter measurement update equations: \n",
+    "\n",
+    "    $\\begin{gathered}K_k=P_k^{-} H^T\\left(H P_k^{-} H^T+R\\right)^{-1} \\\\ \\hat{x}_k=\\hat{x}_k^{-}+K_k\\left(z_k-H \\hat{x}_k^{-}\\right) \\\\ P_k=\\left(I-K_k H\\right) P_k^{-}\\end{gathered}$\n",
+    "    \n",
+    "    The Kalman Gain is computed in this step\n",
+    "\n",
+    "This requires us to define various CUDA kernels that handle matrix-matrix, matrix-vector, and vector-vector operations parallely:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ebcc8af4-32b2-418b-bfbe-9ffb4ff9b0ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void matAddKernel(double* a, double* b, double* c, int rows, int cols) {\n",
+    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
+    "\n",
+    "    if (col < cols && row < rows) {\n",
+    "        int idx = row * cols + col;\n",
+    "        c[idx] = a[idx] + b[idx];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3f7f95d9-2f3d-4600-8896-0ed4dcca1ed6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void matSubKernel(double* a, double* b, double* c, int rows, int cols) {\n",
+    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
+    "\n",
+    "    if (col < cols && row < rows) {\n",
+    "        int idx = row * cols + col;\n",
+    "        c[idx] = a[idx] - b[idx];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "00183ab5-0380-4773-a34c-053fef7782f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void matTransposeKernel(double* a, double* c, int rows, int cols) {\n",
+    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
+    "\n",
+    "    if (col < cols && row < rows) {\n",
+    "        int idx_in = row * cols + col;\n",
+    "        int idx_out = col * rows + row;\n",
+    "        c[idx_out] = a[idx_in];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d0eff8fe-34a5-4370-b6ff-118cd4d7cb22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void matvecmulKernel(double* d_mat, double* d_vec, double* d_result, int rows, int cols) {\n",
+    "    int tid = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "\n",
+    "    if (tid < rows) {\n",
+    "        double sum = 0.0;\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            sum += d_mat[tid * cols + j] * d_vec[j];\n",
+    "        }\n",
+    "        d_result[tid] = sum;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "21718df0-5f56-4122-bc1a-a368b306fdaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void matmulKernel(double* d_a, double* d_b, double* d_result, int rowsA, int colsA, int colsB) {\n",
+    "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
+    "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "\n",
+    "    if (row < rowsA && col < colsB) {\n",
+    "        double value = 0.0;\n",
+    "        for (int k = 0; k < colsA; k++) {\n",
+    "            value += d_a[row * colsA + k] * d_b[k * colsB + col];\n",
+    "        }\n",
+    "        d_result[row * colsB + col] = value;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "18f843c9-9aee-4055-aae7-473c8f121037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void vecsubKernel(const double* a, const double* b, double* result, int len) {\n",
+    "    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "    if (idx < len) {\n",
+    "        result[idx] = a[idx] - b[idx];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8bb436d-238b-488f-a8a8-b8e1a4a0150d",
+   "metadata": {},
+   "source": [
+    "### Defining C++ abstraction functions to run CUDA kernels and obtain the results:\n",
+    "This allows us to run calls to all matrix-matrix, matrix-vector and vector-vector CUDA operations without dealing with CUDA memory and syntax in the actual algorithm code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6ff06da4-1430-4720-affb-b9c8b1dc7a26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> mataddCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {\n",
+    "    int rows = a.size();\n",
+    "    int cols = a[0].size();\n",
+    "    \n",
+    "    double* h_a = new double[rows*cols];\n",
+    "    double* h_b = new double[rows*cols];\n",
+    "    double* h_c = new double[rows*cols];\n",
+    "    \n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            h_a[i*cols + j] = a[i][j];\n",
+    "            h_b[i*cols + j] = b[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "    \n",
+    "    double* d_a, * d_b, * d_c;\n",
+    "    cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n",
+    "    cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n",
+    "    cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n",
+    "    cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    dim3 dimBlock(16, 16);\n",
+    "    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n",
+    "    matAddKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, rows, cols);\n",
+    "\n",
+    "    cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_a);\n",
+    "    cudaFree(d_b);\n",
+    "    cudaFree(d_c);\n",
+    "\n",
+    "    std::vector<std::vector<double>> result(rows, std::vector<double>(cols));\n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            result[i][j] = h_c[i*cols + j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    delete[] h_a;\n",
+    "    delete[] h_b;\n",
+    "    delete[] h_c;\n",
+    "\n",
+    "    return result;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b70f7f32-b7c2-4a1c-96a0-aea42b15f2aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> matsubCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {\n",
+    "    int rows = a.size();\n",
+    "    int cols = a[0].size();\n",
+    "    \n",
+    "    double* h_a = new double[rows*cols];\n",
+    "    double* h_b = new double[rows*cols];\n",
+    "    double* h_c = new double[rows*cols];\n",
+    "    \n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            h_a[i*cols + j] = a[i][j];\n",
+    "            h_b[i*cols + j] = b[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    double* d_a, * d_b, * d_c;\n",
+    "    cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n",
+    "    cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n",
+    "    cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n",
+    "    cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    dim3 dimBlock(16, 16);  \n",
+    "    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n",
+    "    matSubKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, rows, cols);\n",
+    "\n",
+    "    cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_a);\n",
+    "    cudaFree(d_b);\n",
+    "    cudaFree(d_c);\n",
+    "\n",
+    "    std::vector<std::vector<double>> result(rows, std::vector<double>(cols));\n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            result[i][j] = h_c[i*cols + j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    delete[] h_a;\n",
+    "    delete[] h_b;\n",
+    "    delete[] h_c;\n",
+    "\n",
+    "    return result;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "86b4a452-f04c-45a4-b69b-d87bcb2e5954",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> mattransposeCUDA(const std::vector<std::vector<double>>& a) {\n",
+    "    int rows = a.size();\n",
+    "    int cols = a[0].size();\n",
+    "    \n",
+    "    double* h_a = new double[rows*cols];\n",
+    "    double* h_c = new double[cols*rows];\n",
+    "    \n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            h_a[i*cols + j] = a[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    double* d_a, * d_c;\n",
+    "    cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n",
+    "    cudaMalloc((void**)&d_c, cols*rows*sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    dim3 dimBlock(16, 16);\n",
+    "    dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n",
+    "    matTransposeKernel<<<dimGrid, dimBlock>>>(d_a, d_c, rows, cols);\n",
+    "\n",
+    "    cudaMemcpy(h_c, d_c, cols*rows*sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_a);\n",
+    "    cudaFree(d_c);\n",
+    "\n",
+    "    std::vector<std::vector<double>> result(cols, std::vector<double>(rows));\n",
+    "    for (int i = 0; i < cols; i++) {\n",
+    "        for (int j = 0; j < rows; j++) {\n",
+    "            result[i][j] = h_c[i*rows + j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    delete[] h_a;\n",
+    "    delete[] h_c;\n",
+    "\n",
+    "    return result;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "0d925dce-377c-4d23-96d6-9c4fead297cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> matvecmulCUDA(const std::vector<std::vector<double>>& a, const std::vector<double>& b) {\n",
+    "    int rows = a.size();\n",
+    "    int cols = a[0].size();\n",
+    "\n",
+    "    if (cols != b.size()) {\n",
+    "        throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n",
+    "    }\n",
+    "\n",
+    "    std::vector<double> h_mat(rows * cols);\n",
+    "    for (int i = 0; i < rows; i++) {\n",
+    "        for (int j = 0; j < cols; j++) {\n",
+    "            h_mat[i * cols + j] = a[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    double *d_mat, *d_vec, *d_result;\n",
+    "    cudaMalloc((void**)&d_mat, rows * cols * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_vec, cols * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_result, rows * sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_mat, h_mat.data(), rows * cols * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "    cudaMemcpy(d_vec, b.data(), cols * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    int blockSize = 256;\n",
+    "    int gridSize = (rows + blockSize - 1) / blockSize;\n",
+    "\n",
+    "    matvecmulKernel<<<gridSize, blockSize>>>(d_mat, d_vec, d_result, rows, cols);\n",
+    "\n",
+    "    std::vector<double> result(rows);\n",
+    "    cudaMemcpy(result.data(), d_result, rows * sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_mat);\n",
+    "    cudaFree(d_vec);\n",
+    "    cudaFree(d_result);\n",
+    "\n",
+    "    return result;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "69714460-9a3e-4966-9de7-5e4111fc40af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> matmulCUDA(const std::vector<std::vector<double>>& a, const std::vector<std::vector<double>>& b) {\n",
+    "    int rowsA = a.size();\n",
+    "    int colsA = a[0].size();\n",
+    "    int rowsB = b.size();\n",
+    "    int colsB = b[0].size();\n",
+    "\n",
+    "    if (colsA != rowsB) {\n",
+    "        throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n",
+    "    }\n",
+    "\n",
+    "    std::vector<double> h_a(rowsA * colsA);\n",
+    "    std::vector<double> h_b(rowsB * colsB);\n",
+    "\n",
+    "    for (int i = 0; i < rowsA; i++) {\n",
+    "        for (int j = 0; j < colsA; j++) {\n",
+    "            h_a[i * colsA + j] = a[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    for (int i = 0; i < rowsB; i++) {\n",
+    "        for (int j = 0; j < colsB; j++) {\n",
+    "            h_b[i * colsB + j] = b[i][j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    double *d_a, *d_b, *d_result;\n",
+    "    \n",
+    "    cudaMalloc((void**)&d_a, rowsA * colsA * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_b, rowsB * colsB * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_result, rowsA * colsB * sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_a, h_a.data(), rowsA * colsA * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "    cudaMemcpy(d_b, h_b.data(), rowsB * colsB * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    dim3 blockSize(16, 16);\n",
+    "    dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);\n",
+    "\n",
+    "    matmulKernel<<<gridSize, blockSize>>>(d_a, d_b, d_result, rowsA, colsA, colsB);\n",
+    "\n",
+    "    std::vector<double> h_result(rowsA * colsB);\n",
+    "    cudaMemcpy(h_result.data(), d_result, rowsA * colsB * sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_a);\n",
+    "    cudaFree(d_b);\n",
+    "    cudaFree(d_result);\n",
+    "    \n",
+    "    std::vector<std::vector<double>> result(rowsA, std::vector<double>(colsB));\n",
+    "    for (int i = 0; i < rowsA; i++) {\n",
+    "        for (int j = 0; j < colsB; j++) {\n",
+    "            result[i][j] = h_result[i * colsB + j];\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    return result;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7295e929-dcfc-43be-9b3b-82e9f9fbf9db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> vecsubCUDA(const std::vector<double>& a, const std::vector<double>& b) {\n",
+    "    int len = a.size();\n",
+    "\n",
+    "    double* d_a;\n",
+    "    double* d_b;\n",
+    "    double* d_result;\n",
+    "\n",
+    "    cudaMalloc((void**)&d_a, len * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_b, len * sizeof(double));\n",
+    "    cudaMalloc((void**)&d_result, len * sizeof(double));\n",
+    "\n",
+    "    cudaMemcpy(d_a, a.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "    cudaMemcpy(d_b, b.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "    int blockSize = 256; \n",
+    "    int gridSize = (len + blockSize - 1) / blockSize;\n",
+    "\n",
+    "    vecsubKernel<<<gridSize, blockSize>>>(d_a, d_b, d_result, len);\n",
+    "\n",
+    "    std::vector<double> result(len);\n",
+    "    cudaMemcpy(result.data(), d_result, len * sizeof(double), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "    cudaFree(d_a);\n",
+    "    cudaFree(d_b);\n",
+    "    cudaFree(d_result);\n",
+    "\n",
+    "    return result;\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87d7dcc2-bac9-4ca4-aefc-f2808d8e13ce",
+   "metadata": {},
+   "source": [
+    "### Function to obtain 2x2 and 3x3 matrix inverse using adjoint:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "cb968a24-75c4-491f-9930-0f16a48f7560",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> matinverse(const std::vector<std::vector<double>>& a) {\n",
+    "    size_t n = a.size();\n",
+    "    \n",
+    "    if (n != a[0].size()) {\n",
+    "        std::cout<<\" Shape of a : \"<<a.size()<<\",\"<<a[0].size()<<\"\\n\";\n",
+    "        throw std::runtime_error(\"Only square matrices are supported for inversion.\");\n",
+    "    }\n",
+    "\n",
+    "    // Handle 1x1 matrix\n",
+    "    if (n == 1) {\n",
+    "        if (a[0][0] == 0) {\n",
+    "            throw std::runtime_error(\"singular matrix\");\n",
+    "        }\n",
+    "        return {{1.0 / a[0][0]}};\n",
+    "    }\n",
+    "    \n",
+    "    if (n == 2) {\n",
+    "        double determinant = a[0][0] * a[1][1] - a[0][1] * a[1][0];\n",
+    "        if (determinant == 0) {\n",
+    "            throw std::runtime_error(\"singular matrix\");\n",
+    "        }\n",
+    "\n",
+    "        std::vector<std::vector<double>> result(2, std::vector<double>(2));\n",
+    "        result[0][0] = a[1][1] / determinant;\n",
+    "        result[0][1] = -a[0][1] / determinant;\n",
+    "        result[1][0] = -a[1][0] / determinant;\n",
+    "        result[1][1] = a[0][0] / determinant;\n",
+    "\n",
+    "        return result;\n",
+    "    }\n",
+    "\n",
+    "    if (n == 3) {\n",
+    "        double determinant = a[0][0]*(a[1][1]*a[2][2]-a[2][1]*a[1][2]) \n",
+    "                             - a[0][1]*(a[1][0]*a[2][2]-a[1][2]*a[2][0]) \n",
+    "                             + a[0][2]*(a[1][0]*a[2][1]-a[1][1]*a[2][0]);\n",
+    "        \n",
+    "        if (determinant == 0) {\n",
+    "            throw std::runtime_error(\"singular matrix\");\n",
+    "        }\n",
+    "\n",
+    "        std::vector<std::vector<double>> result(3, std::vector<double>(3));\n",
+    "\n",
+    "        result[0][0] = (a[1][1] * a[2][2] - a[2][1] * a[1][2]) / determinant;\n",
+    "        result[0][1] = (a[0][2] * a[2][1] - a[0][1] * a[2][2]) / determinant;\n",
+    "        result[0][2] = (a[0][1] * a[1][2] - a[0][2] * a[1][1]) / determinant;\n",
+    "        result[1][0] = (a[1][2] * a[2][0] - a[1][0] * a[2][2]) / determinant;\n",
+    "        result[1][1] = (a[0][0] * a[2][2] - a[0][2] * a[2][0]) / determinant;\n",
+    "        result[1][2] = (a[1][0] * a[0][2] - a[0][0] * a[1][2]) / determinant;\n",
+    "        result[2][0] = (a[1][0] * a[2][1] - a[2][0] * a[1][1]) / determinant;\n",
+    "        result[2][1] = (a[2][0] * a[0][1] - a[0][0] * a[2][1]) / determinant;\n",
+    "        result[2][2] = (a[0][0] * a[1][1] - a[1][0] * a[0][1]) / determinant;\n",
+    "\n",
+    "        return result;\n",
+    "    }\n",
+    "\n",
+    "    throw std::runtime_error(\"Only 2x2 and 3x3 matrices supported for inversion\");\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c82451a0-3439-4b31-930d-9637db19f729",
+   "metadata": {},
+   "source": [
+    "### Overloaded functions to print both matrices and vectors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "84f29c1e-d4c2-4680-a579-14d63a2bde08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void printMatrix(const std::vector<std::vector<double>>& matrix) {\n",
+    "    for (size_t i = 0; i < matrix.size(); i++) {\n",
+    "        for (size_t j = 0; j < matrix[i].size(); j++) {\n",
+    "            std::cout << matrix[i][j] << \" \";\n",
+    "        }\n",
+    "        std::cout << std::endl;\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "fa143e89-d1c5-4b54-b22d-9921d7e1000d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void printMatrix(const std::vector<double>& vec) {\n",
+    "    for (size_t i = 0; i < vec.size(); i++) {\n",
+    "            std::cout << vec[i] << \" \";\n",
+    "        }\n",
+    "        std::cout << std::endl;\n",
+    "    }\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79ebaf27-693f-4213-800d-8e8b5c9abaa3",
+   "metadata": {},
+   "source": [
+    "### Test for matrix-matrix multiplication using CUDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "6d718c47-3bc6-4c7a-893c-4fedc399926d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> matrixA = {\n",
+    "        {1.0, 2.0},\n",
+    "        {3.0, 4.0},\n",
+    "        {5.0, 6.0}\n",
+    "    };\n",
+    "\n",
+    "std::vector<std::vector<double>> matrixB = {\n",
+    "        {1.0, 2.0, 3.0, 4.0},\n",
+    "        {5.0, 6.0, 7.0, 8.0}\n",
+    "    };\n",
+    "\n",
+    "std::vector<std::vector<double>> result = matmulCUDA(matrixA, matrixB);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "3e1c925e-c164-46af-8e0f-65c05f0ee1c0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Result matrix:\n",
+      "11 14 17 20 \n",
+      "23 30 37 44 \n",
+      "35 46 57 68 \n"
+     ]
+    }
+   ],
+   "source": [
+    "std::cout << \"Result matrix:\" << std::endl;\n",
+    "printMatrix(result);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9312a581-d000-4da6-b257-b22a7da942f6",
+   "metadata": {},
+   "source": [
+    "## Constructor to initialise an object of the Kalman Filter class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f4c9238f-674a-4cf8-8128-840cc7f61e0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "KalmanFilter::KalmanFilter(\n",
+    "    double dt,\n",
+    "    const std::vector<std::vector<double>>& A,\n",
+    "    const std::vector<std::vector<double>>& C,\n",
+    "    const std::vector<std::vector<double>>& Q,\n",
+    "    const std::vector<std::vector<double>>& R,\n",
+    "    const std::vector<std::vector<double>>& P)\n",
+    "  : A(A), C(C), Q(Q), R(R), P0(P),\n",
+    "    m(C.size()), n(A.size()), dt(dt), initialized(false),\n",
+    "    I(n, std::vector<double>(n)), x_hat(n), x_hat_new(n)\n",
+    "{\n",
+    "    for (int i = 0; i < n; i++) {\n",
+    "        I[i][i] = 1.0;\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "KalmanFilter::KalmanFilter() {}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb0ecf07-b676-4a2f-bb91-4d0ebfe4fc17",
+   "metadata": {},
+   "source": [
+    "### Describe the `KalmanFilter::init` function to initialize the filter with initial states as zero."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "ed99dd28-5959-409a-8226-7e292bc4b0de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void KalmanFilter::init() {\n",
+    "    std::fill(x_hat.begin(), x_hat.end(), 0.0);\n",
+    "    P = P0;\n",
+    "    t0 = 0;\n",
+    "    t = t0;\n",
+    "    initialized = true;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f4291823-8a85-40bf-baf7-c0686388e435",
+   "metadata": {},
+   "source": [
+    "### Describe the `KalmanFilter::init` function to initialize the filter with a guess for initial states."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "bd953ec6-ceed-489c-8e33-8eb7ae6bb32f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void KalmanFilter::init(double t0, const std::vector<double>& x0) {\n",
+    "    x_hat = x0;\n",
+    "    P = P0;\n",
+    "    this->t0 = t0;\n",
+    "    t = t0;\n",
+    "    initialized = true;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b71922a-0448-4fa0-8196-08142c2f64b8",
+   "metadata": {},
+   "source": [
+    "### Defining a function that executes the two stages of the ongoing discrete Kalman filter cycle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "ea057f20-9097-4dd3-946e-ab7a383afe69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> KalmanFilter::update(const std::vector<double>& y) {\n",
+    "    if (!initialized)\n",
+    "        throw std::runtime_error(\"Filter is not initialized!\");\n",
+    "    \n",
+    "    // Discrete Kalman filter time update \n",
+    "    x_hat_new = matvecmulCUDA(A, x_hat);\n",
+    "    P = mataddCUDA(matmulCUDA(matmulCUDA(A, P), mattransposeCUDA(A)), Q);\n",
+    "    \n",
+    "    // Discrete Kalman filter measurement update\n",
+    "    std::vector<std::vector<double>> inv = matinverse(mataddCUDA(matmulCUDA(matmulCUDA(C, P), mattransposeCUDA(C)), R));\n",
+    "    K = matmulCUDA(matmulCUDA(P, mattransposeCUDA(C)), inv);\n",
+    "    std::vector<double> temp = matvecmulCUDA(C, x_hat_new);\n",
+    "    std::vector<double> difference = vecsubCUDA(y, temp);\n",
+    "    std::vector<double> gain = K[0];\n",
+    "    for (size_t i = 0; i < x_hat_new.size(); i++) {\n",
+    "        x_hat_new[i] += matvecmulCUDA(K, difference)[i];\n",
+    "    }\n",
+    "\n",
+    "    P = matmulCUDA(matsubCUDA(I, matmulCUDA(K, C)), P);\n",
+    "\n",
+    "    x_hat = x_hat_new;\n",
+    "    t += dt;\n",
+    "    \n",
+    "    return gain;\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b1d8963-599a-4034-80de-e7c92c59c321",
+   "metadata": {},
+   "source": [
+    "### Update the estimated state based on measured values, using the given time step and dynamics matrix:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "08d28163-a086-46a9-bf41-044f48530929",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void KalmanFilter::update(const std::vector<double>& y, double dt, const std::vector<std::vector<double>>& A) {\n",
+    "    this->A = A;\n",
+    "    this->dt = dt;\n",
+    "    update(y);\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7c31cf57-c669-43af-8aba-0b03c08b575d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> measurements;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c93b8e0e-1b66-4d26-98fb-ec7a4e75459b",
+   "metadata": {},
+   "source": [
+    "### The cross language support allows us to read the measurements from the python side and initialise the std::vector to be passed to the Kalman Filter\n",
+    "\n",
+    "#### setData here recieves the python list as a `cppyy.gbl.std.vector` and passes it to `std::vector<double> measurements` "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "caa856ab-e4e1-4d88-984d-058fb21da79f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void setData(const std::vector<double>& input) {\n",
+    "    measurements = input;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "ee23b991-3495-4ef5-94c0-b906867d4544",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n",
+      "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n",
+      "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
+      "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n",
+      "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n",
+      "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n",
+      "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n",
+      "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
+      "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install matplotlib\n",
+    "!pip install PyYaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "c35e13e9-4f7f-4cb7-abd7-03362ddd6848",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import yaml\n",
+    "import cppyy\n",
+    "\n",
+    "with open('data/measurements.yml', 'r') as file:\n",
+    "    data_dict = yaml.safe_load(file)\n",
+    "    data_list = list(float(x) for x in data_dict['data'])\n",
+    "\n",
+    "measurements_vector = cppyy.gbl.std.vector['double'](data_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1401cbcd-42e1-415a-a866-43a3325b7986",
+   "metadata": {},
+   "source": [
+    "### Lets verify the data integrity by plotting a position vs time graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "4b203531-45e5-409b-82ec-86582309cfd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "dt = 1/30\n",
+    "    \n",
+    "# Generating time points for the x-axis based on dt\n",
+    "time_points = [i*dt for i in range(len(data_list))]\n",
+    "\n",
+    "# Plotting\n",
+    "plt.plot(time_points, data_list, '-o', label='Position')\n",
+    "plt.xlabel('Time (s)')\n",
+    "plt.ylabel('Position (x)')\n",
+    "plt.title('Position vs. Time')\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.savefig(\"position_graph.png\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88774a21-8d16-4abc-ae71-3f81328acf08",
+   "metadata": {},
+   "source": [
+    "<img src=\"position_graph.png\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9188118-ab05-450d-a6e9-9e0fac78bb50",
+   "metadata": {},
+   "source": [
+    "### Now we pass it to the C++ side by calling `setData`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "bc5c3165-5cc3-4017-9e5f-13f61388f727",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "cppyy.gbl.setData(measurements_vector)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "018496cc-7f4d-499e-a982-00610d7f9651",
+   "metadata": {},
+   "source": [
+    "### Putting it all together:\n",
+    "We define the `run_kf` function that initialises the following:\n",
+    "- Number of states\n",
+    "- Dimensions of each measurement\n",
+    "- Time steps used $\\begin{aligned} & {dt}\\end{aligned}$\n",
+    "- Covariance matrices\n",
+    "- Passes the measurement data to $\\begin{aligned} & \\hat{x}_0^{-}\\end{aligned}$\n",
+    "\n",
+    "Then the measurements are fed into filter, and output estimated states\n",
+    "\n",
+    "The algorithm then runs by looping over the measurements:\n",
+    "\n",
+    "$\\begin{gathered} & for\\:y\\:in\\:measurements\\::\\end{gathered}$\n",
+    "$\\begin{gathered} & kf.update(y)\\end{gathered}$"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "8938a288-fc2c-4104-82c6-added4466c8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<std::vector<double>> run_kf(bool verbose) {\n",
+    "    \n",
+    "    int n = 3; \n",
+    "    int m = 1; \n",
+    "\n",
+    "    double dt = 1.0 / 30; // Time step\n",
+    "    \n",
+    "    std::vector<double> g_preds;\n",
+    "    \n",
+    "    std::vector<std::vector<double>> A(n, std::vector<double>(n));\n",
+    "    std::vector<std::vector<double>> C(m, std::vector<double>(n));\n",
+    "    std::vector<std::vector<double>> Q(n, std::vector<double>(n));\n",
+    "    std::vector<std::vector<double>> R(m, std::vector<double>(m));\n",
+    "    std::vector<std::vector<double>> P(n, std::vector<double>(n));\n",
+    "    \n",
+    "    A = {{1, dt, 0}, {0, 1, dt}, {0, 0, 1}};\n",
+    "    C = {{1, 0, 0}};\n",
+    "    Q = {{.05, .05, .0}, {.05, .05, .0}, {.0, .0, .0}};\n",
+    "    R = {{5}};\n",
+    "    P = {{.1, .1, .1}, {.1, 10000, 10}, {.1, 10, 100}};\n",
+    "    \n",
+    "    KalmanFilter kf(dt, A, C, Q, R, P);\n",
+    "    \n",
+    "    std::vector<double> x0 = {measurements[0], 0, -15};\n",
+    "    std::vector<std::vector<double>> gain;\n",
+    "    kf.init(0, x0);\n",
+    "\n",
+    "    \n",
+    "    std::vector<double> y(m);\n",
+    "    if(verbose) {\n",
+    "        std::cout << \"t = \" << 0 << \", \" << \"x_hat[0]: \";\n",
+    "        for (auto& val : kf.state()){\n",
+    "            std::cout << val << \" \";\n",
+    "        }\n",
+    "        std::cout << std::endl;\n",
+    "    }\n",
+    "    \n",
+    "    int i;\n",
+    "    for (i = 0; i < measurements.size(); i++) {\n",
+    "        y[0] = measurements[i];\n",
+    "        gain.push_back(kf.update(y));\n",
+    "        if(verbose) {\n",
+    "            std::cout << \"t = \" << (i + 1) * dt << \", y[\" << i << \"] = \" << y[0] << \", x_hat[\" << i << \"] = \";\n",
+    "            for (auto& val : kf.state()) {\n",
+    "                std::cout << val << \" \";\n",
+    "            }\n",
+    "            g_preds.push_back(kf.state()[2]);\n",
+    "            std::cout << std::endl;\n",
+    "        }\n",
+    "    }\n",
+    "    std::cout << std::endl;\n",
+    "    std::cout<<\"Exec Success, Final kf states:\";\n",
+    "    for (auto& val : kf.state()) std::cout << val << \" \";\n",
+    "    std::cout << std::endl;\n",
+    "\n",
+    "    std::vector<std::vector<double>> g_res;\n",
+    "    for (size_t i = 0; i < g_preds.size(); ++i) {\n",
+    "        std::vector<double> pair = {g_preds[i], gain[i][0]};\n",
+    "        g_res.push_back(pair);\n",
+    "    }\n",
+    "    \n",
+    "    return g_res;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "7a910405-c4ff-41fa-aec5-69aa24598f9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "t = 0, x_hat[0]: 1.04203 0 -15 \n",
+      "t = 0.0333333, y[0] = 1.04203, x_hat[0] = 1.04203 -0.5 -15 \n",
+      "t = 0.0666667, y[1] = 1.10727, x_hat[1] = 1.08556 -0.0966619 -14.9988 \n",
+      "t = 0.1, y[2] = 1.29135, x_hat[2] = 1.21317 0.720024 -14.9952 \n",
+      "t = 0.133333, y[3] = 1.48485, x_hat[3] = 1.36865 1.21707 -14.9881 \n",
+      "t = 0.166667, y[4] = 1.72826, x_hat[4] = 1.55548 1.60875 -14.9732 \n",
+      "t = 0.2, y[5] = 1.74216, x_hat[5] = 1.66278 1.38374 -14.9637 \n",
+      "t = 0.233333, y[6] = 2.11672, x_hat[6] = 1.85606 1.53382 -14.9229 \n",
+      "t = 0.266667, y[7] = 2.14529, x_hat[7] = 1.98512 1.34018 -14.8908 \n",
+      "t = 0.3, y[8] = 2.1603, x_hat[8] = 2.06901 0.981678 -14.8682 \n",
+      "t = 0.333333, y[9] = 2.21269, x_hat[9] = 2.13267 0.58593 -14.8441 \n",
+      "t = 0.366667, y[10] = 2.57709, x_hat[10] = 2.26319 0.425535 -14.7321 \n",
+      "t = 0.4, y[11] = 2.66822, x_hat[11] = 2.37389 0.210228 -14.6096 \n",
+      "t = 0.433333, y[12] = 2.51642, x_hat[12] = 2.41279 -0.189016 -14.56 \n",
+      "t = 0.466667, y[13] = 2.76034, x_hat[13] = 2.4865 -0.459522 -14.4115 \n",
+      "t = 0.5, y[14] = 2.88132, x_hat[14] = 2.56093 -0.701772 -14.2172 \n",
+      "t = 0.533333, y[15] = 2.88374, x_hat[15] = 2.61135 -0.980106 -14.0346 \n",
+      "t = 0.566667, y[16] = 2.94485, x_hat[16] = 2.65526 -1.24377 -13.8225 \n",
+      "t = 0.6, y[17] = 2.82867, x_hat[17] = 2.65812 -1.58494 -13.6876 \n",
+      "t = 0.633333, y[18] = 3.00066, x_hat[18] = 2.68613 -1.81973 -13.4217 \n",
+      "t = 0.666667, y[19] = 3.12921, x_hat[19] = 2.72799 -1.9816 -13.0631 \n",
+      "t = 0.7, y[20] = 2.85836, x_hat[20] = 2.70186 -2.30405 -12.9167 \n",
+      "t = 0.733333, y[21] = 2.83808, x_hat[21] = 2.66841 -2.61016 -12.7523 \n",
+      "t = 0.766667, y[22] = 2.68975, x_hat[22] = 2.60351 -2.971 -12.6666 \n",
+      "t = 0.8, y[23] = 2.66533, x_hat[23] = 2.53742 -3.29663 -12.5373 \n",
+      "t = 0.833333, y[24] = 2.81613, x_hat[24] = 2.50742 -3.47892 -12.2229 \n",
+      "t = 0.866667, y[25] = 2.81004, x_hat[25] = 2.47782 -3.63091 -11.8847 \n",
+      "t = 0.9, y[26] = 2.88322, x_hat[26] = 2.46572 -3.70491 -11.4628 \n",
+      "t = 0.933333, y[27] = 2.69789, x_hat[27] = 2.41597 -3.86954 -11.1819 \n",
+      "t = 0.966667, y[28] = 2.43422, x_hat[28] = 2.31755 -4.15264 -11.068 \n",
+      "t = 1, y[29] = 2.23465, x_hat[29] = 2.19065 -4.48803 -11.0261 \n",
+      "t = 1.03333, y[30] = 2.30279, x_hat[30] = 2.09529 -4.69914 -10.8344 \n",
+      "t = 1.06667, y[31] = 2.0207, x_hat[31] = 1.95562 -5.01191 -10.7763 \n",
+      "t = 1.1, y[32] = 1.94394, x_hat[32] = 1.82057 -5.28093 -10.6701 \n",
+      "t = 1.13333, y[33] = 1.82498, x_hat[33] = 1.68155 -5.53372 -10.5515 \n",
+      "t = 1.16667, y[34] = 1.52526, x_hat[34] = 1.50284 -5.86969 -10.5338 \n",
+      "t = 1.2, y[35] = 1.86968, x_hat[35] = 1.42124 -5.91291 -10.1937 \n",
+      "t = 1.23333, y[36] = 1.18073, x_hat[36] = 1.2154 -6.27594 -10.2188 \n",
+      "t = 1.26667, y[37] = 1.1073, x_hat[37] = 1.02642 -6.56369 -10.1629 \n",
+      "t = 1.3, y[38] = 0.916168, x_hat[38] = 0.829169 -6.84703 -10.1055 \n",
+      "t = 1.33333, y[39] = 0.678548, x_hat[39] = 0.616214 -7.14521 -10.0663 \n",
+      "t = 1.36667, y[40] = 0.562382, x_hat[40] = 0.41403 -7.39117 -9.97735 \n",
+      "t = 1.4, y[41] = 0.355468, x_hat[41] = 0.204015 -7.63475 -9.89085 \n",
+      "t = 1.43333, y[42] = -0.155607, x_hat[42] = -0.0706529 -8.01301 -9.93708 \n",
+      "t = 1.46667, y[43] = -0.287199, x_hat[43] = -0.328135 -8.32148 -9.91585 \n",
+      "t = 1.5, y[44] = -0.602973, x_hat[44] = -0.605038 -8.65089 -9.91483 \n",
+      "t = 1.53333, y[45] = -0.292649, x_hat[45] = -0.781097 -8.72417 -9.68453 \n",
+      "t = 1.56667, y[46] = -0.602973, x_hat[46] = -0.985005 -8.85119 -9.51267 \n",
+      "t = 1.6, y[47] = -0.924198, x_hat[47] = -1.21467 -9.02335 -9.38793 \n",
+      "t = 1.63333, y[48] = -1.25632, x_hat[48] = -1.46826 -9.2333 -9.30101 \n",
+      "t = 1.66667, y[49] = -1.59935, x_hat[49] = -1.74413 -9.4748 -9.24427 \n",
+      "t = 1.7, y[50] = -1.95327, x_hat[50] = -2.04085 -9.74255 -9.21144 \n",
+      "t = 1.73333, y[51] = -2.3181, x_hat[51] = -2.35716 -10.032 -9.19743 \n",
+      "t = 1.76667, y[52] = -2.69382, x_hat[52] = -2.69196 -10.3394 -9.19807 \n",
+      "t = 1.8, y[53] = -3.08044, x_hat[53] = -3.04427 -10.6615 -9.20997 \n",
+      "t = 1.83333, y[54] = -3.47797, x_hat[54] = -3.41323 -10.9956 -9.23039 \n",
+      "t = 1.86667, y[55] = -3.88639, x_hat[55] = -3.7981 -11.3393 -9.25711 \n",
+      "t = 1.9, y[56] = -4.30572, x_hat[56] = -4.19822 -11.6907 -9.28834 \n",
+      "t = 1.93333, y[57] = -4.73594, x_hat[57] = -4.61301 -12.0482 -9.32265 \n",
+      "t = 1.96667, y[58] = -5.17707, x_hat[58] = -5.04196 -12.4105 -9.3589 \n",
+      "t = 2, y[59] = -5.62909, x_hat[59] = -5.48464 -12.7763 -9.39618 \n",
+      "t = 2.03333, y[60] = -6.09202, x_hat[60] = -5.94065 -13.1448 -9.43379 \n",
+      "t = 2.06667, y[61] = -6.56584, x_hat[61] = -6.40966 -13.5151 -9.47117 \n",
+      "t = 2.1, y[62] = -7.05057, x_hat[62] = -6.89136 -13.8865 -9.5079 \n",
+      "t = 2.13333, y[63] = -7.54619, x_hat[63] = -7.38549 -14.2587 -9.54365 \n",
+      "t = 2.16667, y[64] = -8.05271, x_hat[64] = -7.89184 -14.6309 -9.57819 \n",
+      "t = 2.2, y[65] = -8.57014, x_hat[65] = -8.41019 -15.003 -9.61135 \n",
+      "t = 2.23333, y[66] = -9.09846, x_hat[66] = -8.94038 -15.3747 -9.64301 \n",
+      "t = 2.26667, y[67] = -9.63769, x_hat[67] = -9.48225 -15.7456 -9.67311 \n",
+      "t = 2.3, y[68] = -10.1878, x_hat[68] = -10.0357 -16.1156 -9.70161 \n",
+      "t = 2.33333, y[69] = -10.7488, x_hat[69] = -10.6005 -16.4845 -9.7285 \n",
+      "t = 2.36667, y[70] = -11.3208, x_hat[70] = -11.1767 -16.8523 -9.75379 \n",
+      "t = 2.4, y[71] = -11.9036, x_hat[71] = -11.7642 -17.2188 -9.77751 \n",
+      "t = 2.43333, y[72] = -12.4973, x_hat[72] = -12.3628 -17.5841 -9.7997 \n",
+      "t = 2.46667, y[73] = -13.1019, x_hat[73] = -12.9725 -17.9479 -9.82041 \n",
+      "t = 2.5, y[74] = -13.7175, x_hat[74] = -13.5932 -18.3104 -9.83969 \n",
+      "t = 2.53333, y[75] = -14.3439, x_hat[75] = -14.225 -18.6716 -9.8576 \n",
+      "t = 2.56667, y[76] = -14.9812, x_hat[76] = -14.8677 -19.0313 -9.87422 \n",
+      "t = 2.6, y[77] = -15.6294, x_hat[77] = -15.5214 -19.3897 -9.88959 \n",
+      "t = 2.63333, y[78] = -16.2886, x_hat[78] = -16.1859 -19.7467 -9.90379 \n",
+      "t = 2.66667, y[79] = -16.9586, x_hat[79] = -16.8613 -20.1025 -9.91688 \n",
+      "t = 2.7, y[80] = -17.6395, x_hat[80] = -17.5475 -20.4569 -9.92892 \n",
+      "t = 2.73333, y[81] = -18.3313, x_hat[81] = -18.2446 -20.8101 -9.93998 \n",
+      "t = 2.76667, y[82] = -19.0341, x_hat[82] = -18.9524 -21.162 -9.9501 \n",
+      "t = 2.8, y[83] = -19.7477, x_hat[83] = -19.6711 -21.5128 -9.95936 \n",
+      "t = 2.83333, y[84] = -20.4722, x_hat[84] = -20.4006 -21.8624 -9.96781 \n",
+      "t = 2.86667, y[85] = -21.2076, x_hat[85] = -21.1408 -22.2109 -9.97549 \n",
+      "t = 2.9, y[86] = -21.954, x_hat[86] = -21.8918 -22.5584 -9.98246 \n",
+      "t = 2.93333, y[87] = -22.7112, x_hat[87] = -22.6535 -22.9048 -9.98876 \n",
+      "t = 2.96667, y[88] = -23.4793, x_hat[88] = -23.4261 -23.2503 -9.99445 \n",
+      "t = 3, y[89] = -24.2583, x_hat[89] = -24.2094 -23.5948 -9.99955 \n",
+      "t = 3.03333, y[90] = -25.0483, x_hat[90] = -25.0034 -23.9384 -10.0041 \n",
+      "t = 3.06667, y[91] = -25.8491, x_hat[91] = -25.8082 -24.2812 -10.0082 \n",
+      "t = 3.1, y[92] = -26.6608, x_hat[92] = -26.6238 -24.6231 -10.0118 \n",
+      "t = 3.13333, y[93] = -27.4834, x_hat[93] = -27.4501 -24.9642 -10.015 \n",
+      "t = 3.16667, y[94] = -28.3169, x_hat[94] = -28.2872 -25.3046 -10.0177 \n",
+      "t = 3.2, y[95] = -29.1614, x_hat[95] = -29.135 -25.6443 -10.0201 \n",
+      "t = 3.23333, y[96] = -30.0167, x_hat[96] = -29.9937 -25.9833 -10.0222 \n",
+      "t = 3.26667, y[97] = -30.8829, x_hat[97] = -30.863 -26.3216 -10.0239 \n",
+      "t = 3.3, y[98] = -31.76, x_hat[98] = -31.7432 -26.6593 -10.0254 \n",
+      "t = 3.33333, y[99] = -32.6481, x_hat[99] = -32.6341 -26.9964 -10.0265 \n",
+      "t = 3.36667, y[100] = -33.547, x_hat[100] = -33.5358 -27.333 -10.0274 \n",
+      "t = 3.4, y[101] = -34.4568, x_hat[101] = -34.4483 -27.669 -10.0281 \n",
+      "t = 3.43333, y[102] = -35.3775, x_hat[102] = -35.3716 -28.0045 -10.0286 \n",
+      "t = 3.46667, y[103] = -36.3092, x_hat[103] = -36.3056 -28.3395 -10.0289 \n",
+      "t = 3.5, y[104] = -37.2517, x_hat[104] = -37.2505 -28.674 -10.029 \n",
+      "t = 3.53333, y[105] = -38.2051, x_hat[105] = -38.2061 -29.0081 -10.0289 \n",
+      "t = 3.56667, y[106] = -39.1694, x_hat[106] = -39.1726 -29.3418 -10.0287 \n",
+      "t = 3.6, y[107] = -40.1447, x_hat[107] = -40.1498 -29.6751 -10.0283 \n",
+      "t = 3.63333, y[108] = -41.1308, x_hat[108] = -41.1378 -30.008 -10.0278 \n",
+      "t = 3.66667, y[109] = -42.1278, x_hat[109] = -42.1367 -30.3405 -10.0272 \n",
+      "t = 3.7, y[110] = -43.1357, x_hat[110] = -43.1463 -30.6727 -10.0265 \n",
+      "t = 3.73333, y[111] = -44.1546, x_hat[111] = -44.1668 -31.0046 -10.0257 \n",
+      "t = 3.76667, y[112] = -45.1843, x_hat[112] = -45.1981 -31.3362 -10.0247 \n",
+      "t = 3.8, y[113] = -46.2249, x_hat[113] = -46.2402 -31.6674 -10.0238 \n",
+      "t = 3.83333, y[114] = -47.2764, x_hat[114] = -47.2932 -31.9984 -10.0227 \n",
+      "t = 3.86667, y[115] = -48.3389, x_hat[115] = -48.3569 -32.3292 -10.0216 \n",
+      "t = 3.9, y[116] = -49.4122, x_hat[116] = -49.4315 -32.6597 -10.0204 \n",
+      "t = 3.93333, y[117] = -50.4964, x_hat[117] = -50.517 -32.9899 -10.0191 \n",
+      "t = 3.96667, y[118] = -51.5915, x_hat[118] = -51.6132 -33.3199 -10.0179 \n",
+      "t = 4, y[119] = -52.6976, x_hat[119] = -52.7204 -33.6497 -10.0165 \n",
+      "t = 4.03333, y[120] = -53.8145, x_hat[120] = -53.8383 -33.9793 -10.0152 \n",
+      "t = 4.06667, y[121] = -54.9423, x_hat[121] = -54.9671 -34.3087 -10.0138 \n",
+      "t = 4.1, y[122] = -56.081, x_hat[122] = -56.1067 -34.638 -10.0123 \n",
+      "t = 4.13333, y[123] = -57.2307, x_hat[123] = -57.2572 -34.967 -10.0109 \n",
+      "t = 4.16667, y[124] = -58.3912, x_hat[124] = -58.4185 -35.2959 -10.0094 \n",
+      "t = 4.2, y[125] = -59.5626, x_hat[125] = -59.5907 -35.6246 -10.0079 \n",
+      "t = 4.23333, y[126] = -60.7449, x_hat[126] = -60.7738 -35.9532 -10.0064 \n",
+      "t = 4.26667, y[127] = -61.9382, x_hat[127] = -61.9677 -36.2817 -10.0048 \n",
+      "t = 4.3, y[128] = -63.1423, x_hat[128] = -63.1724 -36.61 -10.0033 \n",
+      "t = 4.33333, y[129] = -64.3573, x_hat[129] = -64.388 -36.9381 -10.0018 \n",
+      "t = 4.36667, y[130] = -65.5832, x_hat[130] = -65.6145 -37.2662 -10.0002 \n",
+      "t = 4.4, y[131] = -66.8201, x_hat[131] = -66.8518 -37.5942 -9.99866 \n",
+      "t = 4.43333, y[132] = -68.0678, x_hat[132] = -68.1 -37.922 -9.9971 \n",
+      "t = 4.46667, y[133] = -69.3264, x_hat[133] = -69.3591 -38.2498 -9.99554 \n",
+      "t = 4.5, y[134] = -70.5959, x_hat[134] = -70.629 -38.5774 -9.99399 \n",
+      "t = 4.53333, y[135] = -71.8764, x_hat[135] = -71.9099 -38.905 -9.99244 \n",
+      "t = 4.56667, y[136] = -73.1677, x_hat[136] = -73.2015 -39.2324 -9.99089 \n",
+      "t = 4.6, y[137] = -74.4699, x_hat[137] = -74.5041 -39.5598 -9.98934 \n",
+      "t = 4.63333, y[138] = -75.783, x_hat[138] = -75.8175 -39.8871 -9.98781 \n",
+      "t = 4.66667, y[139] = -77.1071, x_hat[139] = -77.1418 -40.2144 -9.98628 \n",
+      "t = 4.7, y[140] = -78.442, x_hat[140] = -78.477 -40.5416 -9.98476 \n",
+      "t = 4.73333, y[141] = -79.7878, x_hat[141] = -79.823 -40.8687 -9.98324 \n",
+      "t = 4.76667, y[142] = -81.1445, x_hat[142] = -81.18 -41.1957 -9.98174 \n",
+      "t = 4.8, y[143] = -82.5122, x_hat[143] = -82.5478 -41.5227 -9.98025 \n",
+      "t = 4.83333, y[144] = -83.8907, x_hat[144] = -83.9265 -41.8497 -9.97877 \n",
+      "\n",
+      "Exec Success, Final kf states:-83.9265 -41.8497 -9.97877 \n"
+     ]
+    }
+   ],
+   "source": [
+    "std::vector<std::vector<double>> g_res = run_kf(true);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "835df3bf-9017-4a24-ac29-8e9d821bb881",
+   "metadata": {},
+   "source": [
+    "### Defining a C++ function to resolve the 2D vector result *{Kalman Filter Estimates, Kalman Gain}*\n",
+    "\n",
+    "- The function returns the KF estimates if the provided axis is 0 and Kalman Gains if axis = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "2a47ecf6-f4e9-44b6-b72c-a935ed02fdc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> ret_1d_vector(std::vector<std::vector<double>> res, int axis) {\n",
+    "    std::vector<double> ret;\n",
+    "    for (int i = 0; i < res.size(); i++) {\n",
+    "        ret.push_back(res[i][axis]);\n",
+    "    }\n",
+    "    return ret;\n",
+    "}\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1566d4bf-3952-4616-8b45-b0231c5be9f5",
+   "metadata": {},
+   "source": [
+    "#### We obtain the final results as two std::vectors that can be accessed in Python:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "4e3c7506-8fb8-42f0-b5af-e8950d088a3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "std::vector<double> py_g_pred = ret_1d_vector(g_res, 0);\n",
+    "std::vector<double> py_kf_gains = ret_1d_vector(g_res, 1);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f7b1159-8a34-41c2-99d0-56a7e5714cab",
+   "metadata": {},
+   "source": [
+    "### Lets plot the trend of Kalman Gain across the time steps to verify its working\n",
+    "\n",
+    "If the algorithm converged properly, we should see an exponential decrease in the gain  with time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "129a199c-70b0-454b-a70d-68373c4786f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "kalman_gains = list(cppyy.gbl.py_kf_gains)\n",
+    "x = range(len(kalman_gains))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "f7bcd055-7470-42c9-aad8-238ff3d3566e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.plot(x, kalman_gains, color='blue', marker='v')\n",
+    "    \n",
+    "plt.xlabel('Time Steps')\n",
+    "plt.ylabel('Kalman Gain')\n",
+    "plt.title('Kalman Gain Plot')\n",
+    "plt.savefig(\"kalman_gain_plot.png\")\n",
+    "    \n",
+    "plt.yscale('symlog')\n",
+    "    \n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "842b0e92-4722-4551-81a6-e3a00229da0f",
+   "metadata": {},
+   "source": [
+    "<img src=\"kalman_gain_plot.png\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ae33816-740c-455b-82d2-4cacf29206c1",
+   "metadata": {},
+   "source": [
+    "### Now we can plot the values of Gravitational Acceleration *(g)* obtained as Kalman Estimates :"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "8801e6da-b52b-46b1-b895-e138bbaa39c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "true_val = 9.81\n",
+    "g_pred = list(cppyy.gbl.py_g_pred)\n",
+    "g_pred = list(-x for x in g_pred)\n",
+    "    \n",
+    "x = range(len(g_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "09889e69-195e-44c0-87ee-a82616492101",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%python\n",
+    "\n",
+    "plt.figure()\n",
+    "plt.axhline(y=true_val, color='green', linestyle='-')\n",
+    "plt.plot(x, g_pred, color='orange', marker='o', label='KF Estimates')\n",
+    "plt.annotate(f'{true_val}', xy=(-0.5, true_val), color='green',\n",
+    "             verticalalignment='center', horizontalalignment = 'left')\n",
+    "    \n",
+    "plt.xlabel('Index')\n",
+    "plt.ylabel('Acceleration (m/sâ‚‚)')\n",
+    "plt.title('True Value vs. g_pred')\n",
+    "plt.legend()\n",
+    "plt.savefig(\"1D_KF_plot.png\")\n",
+    "    \n",
+    "plt.yscale('symlog')\n",
+    "    \n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96887ee7-3a13-4440-8f27-8e93b5bc96db",
+   "metadata": {},
+   "source": [
+    "### Result : The Kalman Filter begins to converge on the value of 9.81"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6596ee2f-a810-4e5c-a4c7-5634d71ec608",
+   "metadata": {},
+   "source": [
+    "<img src=\"1D_KF_plot.png\">"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CUDA (C++17)",
+   "language": "CUDA",
+   "name": "cuda-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/omp/hello_world.ipynb b/notebooks/omp/hello_world.ipynb
new file mode 100644
index 00000000..d0764753
--- /dev/null
+++ b/notebooks/omp/hello_world.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "73cbab37-71dd-477d-981b-f2ec28c01bd6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <stdio.h>\n",
+    "#include <omp.h>\n",
+    "#include <clang/Interpreter/CppInterOp.h>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ef1cd58a-672c-4a6f-843a-6c88fc4911f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Cpp::LoadLibrary(\"libomp\");\n",
+    " "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c2b754ad-9553-4a42-b990-f990a9a269ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int main() {\n",
+    "    int max_threads = omp_get_max_threads();\n",
+    "\n",
+    "    printf(\"max threads: %d\\n\", max_threads);\n",
+    "    omp_set_num_threads(max_threads);\n",
+    "\n",
+    "#pragma omp parallel\n",
+    "    {\n",
+    "        int id = omp_get_thread_num();\n",
+    "        printf(\"Hello World from thread = %d with %d threads\\n\", id, omp_get_num_threads());\n",
+    "    }\n",
+    "\n",
+    "    printf(\"all done, with hopefully %d threads\\n\", max_threads);\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a37a13d4-fc82-496e-8f42-9e718a8c2aa0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "max threads: 8\n",
+      "Hello World from thread = 2 with 8 threads\n",
+      "Hello World from thread = 0 with 8 threads\n",
+      "Hello World from thread = 1 with 8 threads\n",
+      "Hello World from thread = 7 with 8 threads\n",
+      "Hello World from thread = 4 with 8 threads\n",
+      "Hello World from thread = 5 with 8 threads\n",
+      "Hello World from thread = 3 with 8 threads\n",
+      "Hello World from thread = 6 with 8 threads\n",
+      "all done, with hopefully 8 threads\n"
+     ]
+    }
+   ],
+   "source": [
+    "main();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af54945c-2412-4aee-bbfc-a2f6fc72d23f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++17 (+OpenMP)",
+   "language": "C++17",
+   "name": "omp-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/omp/linked_list.ipynb b/notebooks/omp/linked_list.ipynb
new file mode 100644
index 00000000..106cecd6
--- /dev/null
+++ b/notebooks/omp/linked_list.ipynb
@@ -0,0 +1,224 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "156447d2-9279-45a0-890b-4e519d2c796b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <stdlib.h>\n",
+    "#include <stdio.h>\n",
+    "#include <omp.h>\n",
+    "#include </home/jovyan/CppInterOp/include/clang/Interpreter/CppInterOp.h>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c96fdeb0-817d-48c0-af8e-20a52947d60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#ifndef N\n",
+    "#define N 5\n",
+    "#endif\n",
+    "#ifndef FS\n",
+    "#define FS 38\n",
+    "#endif"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8da842e1-db02-49e0-929d-4e67cbc08172",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Cpp::LoadLibrary(\"libomp\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22f97c49-78d1-496e-ac7c-978aed95331a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "struct node {\n",
+    "  int data;\n",
+    "  int fibdata;\n",
+    "  struct node *next;\n",
+    "};"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b16b1e8a-8831-4b8d-9d57-09deeaaa88ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "struct node *init_list(struct node *p);\n",
+    "void processwork(struct node *p);\n",
+    "int fib(int n);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ef8af6c-1d6f-4c68-84bc-3dd1d8092b06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int fib(int n) {\n",
+    "  int x, y;\n",
+    "  if (n < 2) {\n",
+    "    return (n);\n",
+    "  } else {\n",
+    "    x = fib(n - 1);\n",
+    "    y = fib(n - 2);\n",
+    "    return (x + y);\n",
+    "  }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fa0307d-fdc9-4503-95cb-1c6448791354",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void processwork(struct node *p) {\n",
+    "  int n, temp;\n",
+    "  n = p->data;\n",
+    "  temp = fib(n);\n",
+    "\n",
+    "  p->fibdata = temp;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03acb599-9329-49ff-8aff-c0902adb6c3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "struct node *init_list(struct node *p) {\n",
+    "  int i;\n",
+    "  struct node *head = NULL;\n",
+    "  struct node *temp = NULL;\n",
+    "\n",
+    "  head = (struct node*) malloc(sizeof(struct node));\n",
+    "  p = head;\n",
+    "  p->data = FS;\n",
+    "  p->fibdata = 0;\n",
+    "  for (i = 0; i < N; i++) {\n",
+    "    temp = (struct node*) malloc(sizeof(struct node));\n",
+    "    p->next = temp;\n",
+    "    p = temp;\n",
+    "    p->data = FS + i + 1;\n",
+    "    p->fibdata = i + 1;\n",
+    "  }\n",
+    "\n",
+    "  p->next = NULL;\n",
+    "  return head;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2dfb41b-e55f-43c0-b7f6-546a1697acb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int main() {\n",
+    "  double start, end;\n",
+    "  struct node *p = NULL;\n",
+    "  struct node *temp = NULL;\n",
+    "  struct node *head = NULL;\n",
+    "\n",
+    "  printf(\"Process linked list\\n\");\n",
+    "  printf(\"  Each linked list node will be processed by function 'processwork()'\\n\");\n",
+    "  printf(\"  Each ll node will compute %d fibonacci numbers beginning with %d\\n\", N, FS);\n",
+    "\n",
+    "  omp_set_num_threads(omp_get_max_threads());\n",
+    "\n",
+    "  p = init_list(p);\n",
+    "  head = p;\n",
+    "\n",
+    "  start = omp_get_wtime();\n",
+    "\n",
+    "#pragma omp parallel\n",
+    "  {\n",
+    "#pragma omp master\n",
+    "    printf(\"Threads:      %d\\n\", omp_get_num_threads());\n",
+    "\n",
+    "#pragma omp single\n",
+    "    {\n",
+    "      p = head;\n",
+    "      while (p) {\n",
+    "#pragma omp task firstprivate(p) // first private is required\n",
+    "        {\n",
+    "          processwork(p);\n",
+    "        }\n",
+    "        p = p->next;\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  end = omp_get_wtime();\n",
+    "  p = head;\n",
+    "  while (p != NULL) {\n",
+    "    printf(\"%d : %d\\n\", p->data, p->fibdata);\n",
+    "    temp = p->next;\n",
+    "    free(p);\n",
+    "    p = temp;\n",
+    "  }\n",
+    "\n",
+    "  free(p);\n",
+    "  printf(\"Compute Time: %f seconds\\n\", end - start);\n",
+    "\n",
+    "  return 0;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "353e5dfd-fcae-43e6-97e3-ec98070811a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "main();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a24f21c1-2ddc-4e46-b9ee-7cc98fc2821a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++14",
+   "language": "C++14",
+   "name": "xcpp14"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/omp/mandel.ipynb b/notebooks/omp/mandel.ipynb
new file mode 100644
index 00000000..7c499cfa
--- /dev/null
+++ b/notebooks/omp/mandel.ipynb
@@ -0,0 +1,132 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5059dbdd-821d-498a-8716-eb0fcf8a8f5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <stdio.h>\n",
+    "#include <stdlib.h>\n",
+    "#include <math.h>\n",
+    "#include <omp.h>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b66f96a-14ef-4f23-8024-bcfc42b31e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#define NPOINTS 1000\n",
+    "#define MAXITER 1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d89dd57c-fe19-4233-a33a-df9b24fae98a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int numoutside = 0;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c35c479-2f79-46b7-bc66-24be6b1694e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void testpoint(double creal, double cimag) {\n",
+    "    // iterate z=z*z+c, until |z| > 2 when point is known to be outside set\n",
+    "    // If loop count reaches MAXITER, point is considered to be inside the set\n",
+    "\n",
+    "    double zreal, zimag, temp;\n",
+    "    int iter;\n",
+    "    zreal = creal;\n",
+    "    zimag = cimag;\n",
+    "\n",
+    "    for (iter = 0; iter < MAXITER; iter++) {\n",
+    "        temp = (zreal * zreal) - (zimag * zimag) + creal;\n",
+    "        zimag = zreal * zimag * 2 + cimag;\n",
+    "        zreal = temp;\n",
+    "        if ((zreal * zreal + zimag * zimag) > 4.0) {\n",
+    "            numoutside++;\n",
+    "            break;\n",
+    "        }\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea116fef-7d05-4e29-97a1-55c85c7241d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int main() {\n",
+    "    int i, j;\n",
+    "    double area, error, eps = 1.0e-5;\n",
+    "    double cimag, creal;\n",
+    "    // Loop over grid of points in the complex plane which contains the Mandelbrot set,\n",
+    "    // testing each point to see whether it is inside or outside the set.\n",
+    "\n",
+    "#pragma omp parallel for private(eps)\n",
+    "    for (i = 0; i < NPOINTS; i++) {\n",
+    "        for (j = 0; j < NPOINTS; j++) {\n",
+    "            creal = -2.0 + 2.5 * (double) (i) / (double) (NPOINTS) + eps;\n",
+    "            cimag = 1.125 * (double) (j) / (double) (NPOINTS) + eps;\n",
+    "            testpoint(creal, cimag);\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    // Calculate area of set and error estimate and output the results\n",
+    "    area = 2.0 * 2.5 * 1.125 * (double) (NPOINTS * NPOINTS - numoutside) / (double) (NPOINTS * NPOINTS);\n",
+    "    error = area / (double) NPOINTS;\n",
+    "\n",
+    "    printf(\"Area of Mandlebrot set = %12.8f +/- %12.8f\\n\", area, error);\n",
+    "    printf(\"Correct answer should be around 1.510659\\n\");\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39cf129c-8106-4e67-a2f1-1a7fff17cd38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "main();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bcf082a-e0a5-4260-8729-cbfab515cc6a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++14",
+   "language": "C++14",
+   "name": "xcpp14"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/omp/pi_integral.ipynb b/notebooks/omp/pi_integral.ipynb
new file mode 100644
index 00000000..50b9b88d
--- /dev/null
+++ b/notebooks/omp/pi_integral.ipynb
@@ -0,0 +1,109 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8967ce2-994c-441e-b796-4099c6c6853c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <omp.h>\n",
+    "#include <stdio.h>\n",
+    "#include </home/jovyan/CppInterOp/include/clang/Interpreter/CppInterOp.h>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93b89565-44fe-4729-980b-d4f897161b0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Cpp::LoadLibrary(\"libomp\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9078ac79-ca50-4fef-b785-37f35fec3cab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "static long num_steps = 100000000;\n",
+    "double step;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3c10995-6f29-4d71-9e61-1993ca9d1cc9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int main() {\n",
+    "    int i, j, num_threads_allocated;\n",
+    "    double x, pi, sum = 0.0;\n",
+    "    double start_time, run_time;\n",
+    "\n",
+    "    step = 1.0 / (double)num_steps;\n",
+    "    printf(\"Num threads available: %d\\n\", omp_get_max_threads());\n",
+    "    for (i = 1; i <= 4; i++) {\n",
+    "        sum = 0.0;\n",
+    "        omp_set_num_threads(i);\n",
+    "        start_time = omp_get_wtime();\n",
+    "#pragma omp parallel\n",
+    "        {\n",
+    "            num_threads_allocated = omp_get_num_threads();\n",
+    "#pragma omp single\n",
+    "            printf(\"Num threads allocated for this run: %d\\n\", num_threads_allocated);\n",
+    "\n",
+    "#pragma omp for reduction(+ : sum)\n",
+    "            for (j = 1; j <= num_steps; j++) {\n",
+    "                x = (j - 0.5) * step;\n",
+    "                sum = sum + 4.0 / (1.0 + x * x);\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "        pi = step * sum;\n",
+    "        run_time = omp_get_wtime() - start_time;\n",
+    "        printf(\"pi is %f in %f seconds using %d threads\\n\\n\", pi, run_time, num_threads_allocated);\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f84442a-d947-4860-bd3c-aeeea963b419",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "main();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7066da1-729b-4230-980f-9be9430c19d6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++14",
+   "language": "C++14",
+   "name": "xcpp14"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/openmp-demo.ipynb b/notebooks/openmp-demo.ipynb
new file mode 100644
index 00000000..a9ac4a2a
--- /dev/null
+++ b/notebooks/openmp-demo.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0c15570-ee24-42ed-b61f-11a3fc858b2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <iostream>\n",
+    "#include \"omp.h\"\n",
+    "#include </home/jovyan/CppInterOp/include/clang/Interpreter/CppInterOp.h>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b4dac8e-3ad2-46eb-b801-ba717e664b93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Cpp::LoadLibrary(\"libomp\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5001e441-1fa5-4bdc-9fa5-2ca103ae484f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void example1() {\n",
+    "  std::cout << \"Hello World!\" << std::endl;\n",
+    "}\n",
+    "example1();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53fb7656-b72e-42bc-ade7-2ae2077142da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void example2() {\n",
+    "    #pragma omp parallel\n",
+    "    {\n",
+    "    std::cout << \"Hello World!\" << std::endl;\n",
+    "    }\n",
+    "}\n",
+    "example2();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "efcdfdb6-a60b-46af-8194-75ef9cc0e27f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void example3() {\n",
+    "    #pragma omp parallel\n",
+    "    {\n",
+    "    std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n",
+    "    }\n",
+    "}\n",
+    "example3();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d86a9efa-ba28-4cb6-bbfc-abc00ee63506",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void example4() {\n",
+    "    #pragma omp parallel\n",
+    "    {\n",
+    "    std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n",
+    "    }\n",
+    "\n",
+    "    std::cout << \"This is another message! (\" << omp_get_thread_num() << \")\" << std::endl;\n",
+    "\n",
+    "    #pragma omp parallel num_threads(2)\n",
+    "    {\n",
+    "    std::cout << \"Goodbye World! (\" << omp_get_thread_num() << \")\" << std::endl;\n",
+    "    }\n",
+    "}\n",
+    "example4();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5557e01a-7c7d-4b54-8545-962ad11027df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void example5() {\n",
+    "    double start_time = omp_get_wtime();\n",
+    "    double start_loop;\n",
+    "    \n",
+    "    const int N = 1000000000;\n",
+    "    int* a = new int[N];\n",
+    "    int* b = new int[N];\n",
+    "    \n",
+    "    start_loop = omp_get_wtime();\n",
+    "    #pragma omp parallel for\n",
+    "    for (int i=0; i<N; i++) {\n",
+    "        a[i] = 1.0;\n",
+    "    }\n",
+    "    std::cout << \"Initialize a[] time: \" << omp_get_wtime()-start_loop << std::endl;\n",
+    "\n",
+    "    start_loop = omp_get_wtime();\n",
+    "    #pragma omp parallel for\n",
+    "    for (int i=0; i<N; i++) {\n",
+    "        b[i] = 1.0 + double(i);\n",
+    "    }\n",
+    "    std::cout << \"Initialize b[] time: \" << omp_get_wtime()-start_loop << std::endl;\n",
+    "\n",
+    "    start_loop = omp_get_wtime();\n",
+    "    #pragma omp parallel for\n",
+    "    for (int i=0; i<N; i++) {\n",
+    "        a[i] = a[i] + b[i];\n",
+    "    }\n",
+    "    std::cout << \"Add arrays time: \" << omp_get_wtime()-start_loop << std::endl;\n",
+    "    \n",
+    "    start_loop = omp_get_wtime();\n",
+    "    double average = 0.0;\n",
+    "    #pragma omp parallel for reduction(+:average)\n",
+    "    for (int i=0; i<N; i++) {\n",
+    "        average += a[i];\n",
+    "    }\n",
+    "    average = average/double(N);\n",
+    "    std::cout << \"Average result time: \" << omp_get_wtime()-start_loop << std::endl;\n",
+    "    \n",
+    "    std::cout << \"Average: \" << average << std::endl;\n",
+    "\n",
+    "    std::cout << \"Total time: \" << omp_get_wtime()-start_time << std::endl;\n",
+    "}\n",
+    "example5();"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++11 (with OpenMP)",
+   "language": "C++11",
+   "name": "xcpp11"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/python_cpp_plot.png b/notebooks/python_cpp_plot.png
new file mode 100644
index 00000000..104edfce
Binary files /dev/null and b/notebooks/python_cpp_plot.png differ
diff --git a/notebooks/smallptCuda.ipynb b/notebooks/smallptCuda.ipynb
new file mode 100644
index 00000000..73313134
--- /dev/null
+++ b/notebooks/smallptCuda.ipynb
@@ -0,0 +1,506 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c05462f5-b321-48b4-a6ad-a2a5258406d2",
+   "metadata": {},
+   "source": [
+    "# SmallPT CUDA Demo\n",
+    "\n",
+    "A simple cuda version of [smallpt](http://www.kevinbeason.com/smallpt/) with some memory optimization.\n",
+    "\n",
+    "Adapted from https://github.com/BentleyBlanks/smallptCuda.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6b13691-e23d-49de-838b-04c78b348194",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Helpers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b71d65d8-5ff1-4a3d-bf10-57653ae20b07",
+   "metadata": {},
+   "source": [
+    "### Math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b05a70cb-e401-4b1b-b5d3-9ee911ec4da9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//#include \"helper_math.h\"\n",
+    "#include <math.h>\n",
+    "#include <cstdint>\n",
+    "#include \"cuda_runtime.h\"\n",
+    "\n",
+    "inline __device__ __host__ float clamp(float f, float a, float b)\n",
+    "{\n",
+    "    return fmaxf(a, fminf(f, b));\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29109b8c-ac5e-4826-aa48-6e167a48c059",
+   "metadata": {},
+   "source": [
+    "### Image helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0c76fca2-0878-4b0d-bea4-16de34b31cf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <cstdio>\n",
+    "\n",
+    "#ifdef __unix\n",
+    "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n",
+    "#endif\n",
+    "\n",
+    "inline uint8_t ToByte(float color, float gamma = 2.2f) noexcept\n",
+    "{\n",
+    "  const float gcolor = std::pow(color, 1.0 / gamma);\n",
+    "  return static_cast<int8_t>(clamp(255.0f * gcolor, 0.0f, 255.0f));\n",
+    "}\n",
+    "\n",
+    "inline void WritePPM(uint32_t w, uint32_t h, \n",
+    "             const float3* Ls, \n",
+    "             const char* fname = \"smallptCuda.ppm\") noexcept\n",
+    "{\n",
+    "    FILE* fp;\n",
+    "    \n",
+    "    fopen_s(&fp, fname, \"w\");\n",
+    "    \n",
+    "    std::fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n",
+    "    for (std::size_t i = 0; i < w * h; ++i) {\n",
+    "      std::fprintf(fp, \"%u %u %u \", \n",
+    "             ToByte(Ls[i].x), \n",
+    "             ToByte(Ls[i].y), \n",
+    "             ToByte(Ls[i].z));\n",
+    "    }\n",
+    "    \n",
+    "    std::fclose(fp);\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "caa7fb92-d64d-45b9-8d4f-9f9d81e75bfa",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# SmallPT CUDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cd12e5f-59ed-4367-8d22-65838179dc33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <curand_kernel.h>\n",
+    "#include <device_launch_parameters.h>\n",
+    "#include <iostream>\n",
+    "\n",
+    "///#include <core/sTimer.h>\n",
+    "///#include <core/sRandom.h>\n",
+    "///#include <core/helper_math.h>\n",
+    "///#include <image/svpng.inc>\n",
+    "\n",
+    "#define PI 3.14159265359f\n",
+    "\n",
+    "#define CUDA_SAFE_CALL(call) { \\\n",
+    "cudaError err = ( call);                                                \\\n",
+    "if(cudaSuccess != err) {                                             \\\n",
+    "    fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\",    \\\n",
+    "            __FILE__, __LINE__, cudaGetErrorString( err) );          \\\n",
+    "} }\n",
+    "\n",
+    "// -----------------------------------GPU Func-----------------------------------\n",
+    "// From [smallpt](http://www.kevinbeason.com/smallpt/)\n",
+    "enum materialType\n",
+    "{ \n",
+    "    DIFFUSE = 0, \n",
+    "    MIRROR, \n",
+    "    GLASS\n",
+    "};\n",
+    "\n",
+    "struct __align__(16) Ray\n",
+    "{\n",
+    "    __device__ Ray() {}\n",
+    "\n",
+    "    __device__ Ray(float3 origin, float3 direction) \n",
+    "        : origin(origin), direction(direction) {}\n",
+    "\n",
+    "    float3 origin;\n",
+    "    float3 direction;\n",
+    "};\n",
+    "\n",
+    "struct __align__(16) sphere\n",
+    "{\n",
+    "    float radius;\n",
+    "    float3 center, emission, reflectance;\n",
+    "    materialType type;\n",
+    "\n",
+    "    __device__ double intersect(const Ray &r) const\n",
+    "    {\n",
+    "\n",
+    "        float3 op = center - r.origin;\n",
+    "        float t, epsilon = 0.0001f;  // epsilon required to prevent floating point precision artefacts\n",
+    "        float b = dot(op, r.direction);    // b in quadratic equation\n",
+    "        float disc = b*b - dot(op, op) + radius*radius;  // discriminant quadratic equation\n",
+    "        if(disc < 0) return 0;       // if disc < 0, no real solution (we're not interested in complex roots) \n",
+    "        else disc = sqrtf(disc);    // if disc >= 0, check for solutions using negative and positive discriminant\n",
+    "        return (t = b - disc) > epsilon ? t : ((t = b + disc) > epsilon ? t : 0); // pick closest point in front of ray origin\n",
+    "    }\n",
+    "};\n",
+    "\n",
+    "__constant__ sphere spheres[] = {\n",
+    "    {1e5f,{1e5f + 1.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{0.75f, 0.25f, 0.25f}, DIFFUSE}, //Left \n",
+    "    {1e5f,{-1e5f + 99.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{.25f, .25f, .75f}, DIFFUSE}, //Rght \n",
+    "    {1e5f,{50.0f, 40.8f, 1e5f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Back \n",
+    "    {1e5f,{50.0f, 40.8f, -1e5f + 170.0f},{0.0f, 0.0f, 0.0f},{0.0f, 0.0f, 0.0f}, DIFFUSE}, //Frnt \n",
+    "    {1e5f,{50.0f, 1e5f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Botm \n",
+    "    {1e5f,{50.0f, -1e5f + 81.6f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Top \n",
+    "    {16.5f,{27.0f, 16.5f, 47.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, MIRROR},//Mirr\n",
+    "    {16.5f,{73.0f, 16.5f, 78.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, GLASS},//Glas\n",
+    "    {600.0f,{50.0f, 681.6f-.27f, 81.6f},{12, 12, 12},{0.0f, 0.0f, 0.0f}, DIFFUSE}  // Light\n",
+    "};\n",
+    "\n",
+    "__constant__ const int nsphere = sizeof(spheres) / sizeof(sphere);\n",
+    "\n",
+    "__device__ float rgbToLuminance(const float3& rgb)\n",
+    "{\n",
+    "    const float YWeight[3] = {0.212671f, 0.715160f, 0.072169f};\n",
+    "    return YWeight[0] * rgb.x + YWeight[1] * rgb.y + YWeight[2] * rgb.z;\n",
+    "}\n",
+    "\n",
+    "__device__ bool intersectScene(const Ray &r, float &t, int &id, sphere* pshere, int &nsp)\n",
+    "{\n",
+    "    float d, inf = t = 1e20;  // t is distance to closest intersection, initialise t to a huge number outside scene\n",
+    "    for(int i = nsp; i--;)\n",
+    "    {\n",
+    "        // find closest hit object and point\n",
+    "        if((d = pshere[i].intersect(r)) && d < t)\n",
+    "        {\n",
+    "            t = d;\n",
+    "            id = i;\n",
+    "        }\n",
+    "    }\n",
+    "        \n",
+    "    return t < inf; // returns true if an intersection with the scene occurred, false when no hit\n",
+    "}\n",
+    "\n",
+    "__device__ float clamp(float x) { return x < 0 ? 0 : x>1 ? 1 : x; }\n",
+    "\n",
+    "__device__ float gammaCorrection(float x)\n",
+    "{\n",
+    "    return pow(clamp(x), 1 / 2.2f);\n",
+    "}\n",
+    "\n",
+    "__device__ float3 radiance(Ray &r, curandState* rs, sphere* pshere, int &nsp)\n",
+    "{\n",
+    "    float3 L = make_float3(0.0f, 0.0f, 0.0f); // accumulates ray colour with each iteration through bounce loop\n",
+    "    float3 throughput = make_float3(1.0f, 1.0f, 1.0f);\n",
+    "    int depth = 0;\n",
+    "\n",
+    "    // ray bounce loop\n",
+    "    while(1)\n",
+    "    {\n",
+    "        float t;    \n",
+    "        int id = 0;         \n",
+    "\n",
+    "        // find closest intersection with object's index\n",
+    "        if(!intersectScene(r, t, id, pshere, nsp))\n",
+    "            break;\n",
+    "\n",
+    "        const sphere &obj = pshere[id];\n",
+    "        float3 hitpoint = r.origin + r.direction * t; \n",
+    "        float3 normal = normalize(hitpoint - obj.center);\n",
+    "        float3 nl = dot(normal, r.direction) < 0 ? normal : normal * -1; // front facing normal\n",
+    "\n",
+    "        // prevent self-intersection\n",
+    "        r.origin = hitpoint + nl * 0.05f;\n",
+    "\n",
+    "        //float pdf = 1.0f;\n",
+    "\n",
+    "        // add emission\n",
+    "        L += throughput * obj.emission;\n",
+    "\n",
+    "        // different material\n",
+    "        if(obj.type == DIFFUSE)\n",
+    "        {        \n",
+    "            // uniform sampling hemisphere\n",
+    "            float r1 = 2 * PI * curand_uniform(rs);\n",
+    "            float r2 = curand_uniform(rs);\n",
+    "            float r2s = sqrtf(r2);\n",
+    "\n",
+    "            // compute local coordinate on the hit point\n",
+    "            float3 w = nl;\n",
+    "            float3 u = normalize(cross((fabs(w.x) > .1 ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));\n",
+    "            float3 v = cross(w, u);\n",
+    "\n",
+    "            // local to world convert\n",
+    "            r.direction = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));\n",
+    "            //pdf = 1.0f / PI;\n",
+    "\n",
+    "            // importance sampling no need costheta\n",
+    "            //throughput *= obj.reflectance * dot(r.direction, nl);\n",
+    "            throughput *= obj.reflectance;\n",
+    "        }\n",
+    "        else if(obj.type == MIRROR)\n",
+    "        {\n",
+    "            r.direction = r.direction - normal * 2 * dot(normal, r.direction);\n",
+    "            throughput *= obj.reflectance;\n",
+    "            //pdf = 1.0f;\n",
+    "        }\n",
+    "        else\n",
+    "        {\n",
+    "            r.origin = hitpoint;\n",
+    "\n",
+    "            // Ideal dielectric REFRACTION\n",
+    "            float3 reflectDir = r.direction - normal * 2 * dot(normal, r.direction);\n",
+    "            // Ray from outside going in?\n",
+    "            bool into = dot(normal, nl) > 0;\n",
+    "            float nc = 1, nt = 1.5, nnt = into ? nc / nt : nt / nc, ddn = dot(r.direction, nl), cos2t;\n",
+    "            \n",
+    "            // total internal reflection\n",
+    "            if((cos2t = 1 - nnt*nnt*(1 - ddn*ddn)) < 0)\n",
+    "            {\n",
+    "                r.direction = reflectDir;\n",
+    "                throughput *= obj.reflectance;\n",
+    "            }\n",
+    "            else\n",
+    "            {\n",
+    "                // refract or reflect\n",
+    "                float3 tdir = normalize(r.direction*nnt - normal*((into ? 1 : -1)*(ddn*nnt + sqrt(cos2t))));\n",
+    "\n",
+    "                float a = nt - nc, b = nt + nc, R0 = a*a / (b*b), c = 1 - (into ? -ddn : dot(tdir, normal));\n",
+    "\n",
+    "                float Re = R0 + (1 - R0)*c*c*c*c*c, Tr = 1 - Re, P = .25 + .5*Re, RP = Re / P, TP = Tr / (1 - P);\n",
+    "                \n",
+    "                if(curand_uniform(rs) < P)\n",
+    "                {\n",
+    "                    // reflect\n",
+    "                    r.direction = reflectDir;\n",
+    "                    throughput *= obj.reflectance * RP;\n",
+    "                }\n",
+    "                else\n",
+    "                {\n",
+    "                    //refract\n",
+    "                    r.direction = tdir;\n",
+    "                    throughput *= obj.reflectance * TP;\n",
+    "                    //throughput *= make_float3(1, 0, 0);\n",
+    "                }\n",
+    "            }\n",
+    "        }\n",
+    "\n",
+    "        // Russian roulette Stop with at least some probability to avoid getting stuck\n",
+    "        if(depth++ >= 5)\n",
+    "        {\n",
+    "            float q = min(0.95f, rgbToLuminance(throughput));\n",
+    "            if(curand_uniform(rs) >= q)\n",
+    "                break;\n",
+    "            throughput /= q;\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    return L;\n",
+    "}\n",
+    "\n",
+    "__global__ void render1(int width, int height)\n",
+    "{\n",
+    "    printf(\"!1!\\n\");\n",
+    "}\n",
+    "\n",
+    "__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)\n",
+    "{  \n",
+    "  printf(\"!!!\\n\");\n",
+    "\n",
+    "  int i = blockDim.x * blockIdx.x + threadIdx.x;\n",
+    "\n",
+    "  if (i < numElements) {\n",
+    "    C[i] = A[i] + B[i] + 0.0f;\n",
+    "  }\n",
+    "}\n",
+    "\n",
+    "__global__ void render(int spp, int width, int height, float3* output)\n",
+    "{\n",
+    "    printf(\"!!!\\n\");\n",
+    "    \n",
+    "    //copy spheres to shared memory\n",
+    "    __shared__ int nsp;\n",
+    "    __shared__ sphere sspheres[nsphere];\n",
+    "    __shared__ Ray tRay;\n",
+    "    nsp = nsphere;\n",
+    "\n",
+    "    sspheres[threadIdx.x % nsp] = spheres[threadIdx.x % nsp];\n",
+    "\n",
+    "    __syncthreads();\n",
+    "\n",
+    "    // position of current pixel\n",
+    "    int x = blockIdx.x * blockDim.x + threadIdx.x;\n",
+    "    int y = blockIdx.y * blockDim.y + threadIdx.y;\n",
+    "\n",
+    "    // index of current pixel\n",
+    "    //int i = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;\n",
+    "    int i = (height - y - 1) * width + x;\n",
+    "\n",
+    "    curandState rs;\n",
+    "    curand_init(i, 0, 0, &rs);\n",
+    "\n",
+    "    Ray cam(make_float3(50, 52, 295.6), normalize(make_float3(0, -0.042612, -1))); // cam pos, dir \n",
+    "    float3 cx = make_float3(width * 0.5135f / height, 0.0f, 0.0f);\n",
+    "    // .5135 is field of view angle\n",
+    "    float3 cy = normalize(cross(cx, cam.direction)) * 0.5135f;\n",
+    "    float3 color = make_float3(0.0f);\n",
+    "\n",
+    "    for (int sy = 0; sy < 2; sy++)\n",
+    "    {\n",
+    "        for (int sx = 0; sx < 2; sx++)\n",
+    "        { \n",
+    "            for(int s = 0; s < spp; s++)\n",
+    "            {\n",
+    "                float r1 = curand_uniform(&rs);\n",
+    "                float dx = r1 < 1 ? sqrtf(r1) - 1 : 1-sqrtf(2 - r1);\n",
+    "                float r2 = curand_uniform(&rs);\n",
+    "                float dy = r2 < 1 ? sqrtf(r2) - 1 : 1-sqrtf(2 - r2);\n",
+    "                //--! super sampling\n",
+    "                float3 d = cam.direction + cx*((((sx + dx + .5) / 2) + x) / width - .5) + \n",
+    "                                           cy*((((sy + dy + .5) / 2) + y) / height - .5);\n",
+    "\n",
+    "                //Ray tRay = Ray(cam.origin + d * 140, normalize(d));\n",
+    "                tRay.direction = normalize(d);\n",
+    "                tRay.origin = cam.origin + d * 140;\n",
+    "                color += radiance(tRay, &rs, sspheres, nsp) *(.25f / spp);\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    // output to the cache\n",
+    "    __shared__ float3 temp;\n",
+    "    temp = make_float3(clamp(color.x, 0.0f, 1.0f), clamp(color.y, 0.0f, 1.0f), clamp(color.z, 0.0f, 1.0f));\n",
+    "    output[i] = temp;\n",
+    "}\n",
+    "\n",
+    "// -----------------------------------CPU Func-----------------------------------\n",
+    "\n",
+    "void devicePropertyPrint()\n",
+    "{\n",
+    "  int dev = 0;\n",
+    "  cudaDeviceProp devProp;\n",
+    "  if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n",
+    "  {\n",
+    "    printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n",
+    "    printf(\"Device compute capability: %i.%i\\n\", devProp.major, devProp.minor);\n",
+    "    printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n",
+    "    printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n",
+    "    printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n",
+    "    printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n",
+    "    printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n",
+    "    printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n",
+    "    printf(\"\\n\");\n",
+    "  }\n",
+    "}\n",
+    "\n",
+    "void Render(const std::uint32_t nb_samples) {\n",
+    "\n",
+    "    // Image Size\n",
+    "    //int width = 1024, height = 768;\n",
+    "    int width = 256, height = 192;\n",
+    "    int spp = nb_samples/4;\n",
+    "\n",
+    "    printf(\"\\nRendering Size: [%d, %d], spp: %d\\n\", width, height, spp);\n",
+    "    printf(\"------------------Rendering Started------------------\\n\");\n",
+    "\n",
+    "    //sTimer t;\n",
+    "    \n",
+    "    // Memory on CPU\n",
+    "    float3* outputCPU = new float3[width * height];\n",
+    "    float3* outputGPU;\n",
+    "    CUDA_SAFE_CALL(cudaMalloc(&outputGPU, width * height * sizeof(float3)));\n",
+    "\n",
+    "    // Ray Pool\n",
+    "    dim3 blockSize(32, 32, 1);\n",
+    "    dim3 gridSize(width / blockSize.x, height / blockSize.y, 1);\n",
+    "\n",
+    "    //t.start();\n",
+    "\n",
+    "    \n",
+    "    cudaError_t err = cudaSuccess;\n",
+    "    int numElements = 50000;\n",
+    "    size_t size = numElements * sizeof(float);\n",
+    "    float *d_A = NULL;\n",
+    "    float *d_B = NULL;\n",
+    "    float *d_C = NULL;\n",
+    "    err = cudaMalloc((void **)&d_A, size);\n",
+    "    cudaMalloc((void **)&d_B, size);\n",
+    "    cudaMalloc((void **)&d_C, size);\n",
+    "\n",
+    "    if (err != cudaSuccess) {\n",
+    "      printf(\"Failed to allocate device vector A (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "      exit(EXIT_FAILURE);\n",
+    "    }\n",
+    "\n",
+    "    CUDA_SAFE_CALL(cudaDeviceSynchronize());\n",
+    "    \n",
+    "    // Render on GPU\n",
+    "printf(\"@1@\\n\");\n",
+    "      vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, numElements);\n",
+    "//printf(\"@1@\\n\");\n",
+    "    //render<<<gridSize, blockSize>>>(spp, width, height, outputGPU);\n",
+    "    CUDA_SAFE_CALL(cudaPeekAtLastError());\n",
+    "    CUDA_SAFE_CALL(cudaDeviceSynchronize());\n",
+    "    \n",
+    "    //t.end();\n",
+    "\n",
+    "    // Copy Mem from GPU to CPU\n",
+    "    CUDA_SAFE_CALL(cudaMemcpy(outputCPU, outputGPU, width * height * sizeof(float3), cudaMemcpyDeviceToHost));\n",
+    "\n",
+    "    // free CUDA memory\n",
+    "    cudaFree(outputGPU);\n",
+    "\n",
+    "    printf(\"------------------Rendering Ended------------------\\n\");\n",
+    "    //printf(\"Cost time: %f\\n\", t.difference());\n",
+    "\n",
+    "   WritePPM(width, height, outputCPU);\n",
+    "}\n",
+    "\n",
+    "const std::uint32_t nb_samples = 40;\n",
+    "devicePropertyPrint();\n",
+    "Render(nb_samples);"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CppInterOp (C++17)",
+   "language": "CUDA",
+   "name": "cppinterop-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "C++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/vectorAddCuda.ipynb b/notebooks/vectorAddCuda.ipynb
new file mode 100644
index 00000000..e339d051
--- /dev/null
+++ b/notebooks/vectorAddCuda.ipynb
@@ -0,0 +1,1016 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "57b6c4aa-b3a2-4a0e-9192-cc84e2215f96",
+   "metadata": {},
+   "source": [
+    "# CUDA Vector Add Demo\n",
+    "\n",
+    "Adapted from https://github.com/NVIDIA/cuda-samples.git"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d80193b1-362f-4b64-b8ef-c51d0405e12f",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Main helpers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d1fefca-10c2-4fed-9c77-cec0882808b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#include <cuda_runtime.h>\n",
+    "#include <stdio.h>\n",
+    "\n",
+    "///\n",
+    "\n",
+    "//#include \"cuda_runtime.h\"\n",
+    "#include \"curand_kernel.h\"\n",
+    "\n",
+    "#include <algorithm>\n",
+    "#include <cmath>\n",
+    "#include <cstdint>\n",
+    "#include <limits>\n",
+    "#include <cstdio>\n",
+    "#include <cstdlib>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "597e32ba-8759-4a94-aa25-3b8d11ff99b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "  constexpr double g_pi = 3.14159265358979323846;\n",
+    "\n",
+    "  __host__ __device__ inline double Clamp(double v, \n",
+    "                      double low = 0.0, double high = 1.0) noexcept\n",
+    "  {\n",
+    "    return fmin(fmax(v, low), high);\n",
+    "  }\n",
+    "\n",
+    "  inline std::uint8_t ToByte(double color, double gamma = 2.2) noexcept\n",
+    "  {\n",
+    "    const double gcolor = std::pow(color, 1.0 / gamma);\n",
+    "    return static_cast<std::uint8_t>(Clamp(255.0 * gcolor, 0.0, 255.0));\n",
+    "  }\n",
+    "\n",
+    "  inline void HandleError(cudaError_t err, const char* file, int line)\n",
+    "  {\n",
+    "    if (cudaSuccess != err) {\n",
+    "      std::printf(\"%s in %s at line %d\\n\", \n",
+    "            cudaGetErrorString(err), file, line);\n",
+    "      std::exit(EXIT_FAILURE);\n",
+    "    }\n",
+    "  }\n",
+    "//}\n",
+    "\n",
+    "#define CUDA_SAFE_CALL(call) { \\\n",
+    "cudaError err = ( call);                                                \\\n",
+    "if(cudaSuccess != err) {                                             \\\n",
+    "    fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\",    \\\n",
+    "            __FILE__, __LINE__, cudaGetErrorString( err) );          \\\n",
+    "} }\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f4554eb-e09c-4aac-9c5d-486837177a01",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "//namespace smallpt {\n",
+    "\n",
+    "  struct Vector3 {\n",
+    "\n",
+    "  //public:\n",
+    "\n",
+    "    __host__ __device__ explicit Vector3(double xyz = 0.0) noexcept\n",
+    "      : Vector3(xyz, xyz, xyz) {}\n",
+    "    __host__ __device__ Vector3(double x, double y, double z) noexcept\n",
+    "      : m_x(x), m_y(y), m_z(z) {}\n",
+    "    Vector3(const Vector3& v) noexcept = default;\n",
+    "    Vector3(Vector3&& v) noexcept = default;\n",
+    "    ~Vector3() = default;\n",
+    "\n",
+    "    Vector3& operator=(const Vector3& v) = default;\n",
+    "    Vector3& operator=(Vector3&& v) = default;\n",
+    "\n",
+    "    __device__ bool HasNaNs() const {\n",
+    "      return std::isnan(m_x) || std::isnan(m_y) || std::isnan(m_z);\n",
+    "    }\n",
+    "\n",
+    "    __device__ const Vector3 operator-() const {\n",
+    "      return { -m_x, -m_y, -m_z };\n",
+    "    }\n",
+    "\n",
+    "    __device__ const Vector3 operator+(const Vector3& v) const {\n",
+    "      return { m_x + v.m_x, m_y + v.m_y, m_z + v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator-(const Vector3& v) const {\n",
+    "      return { m_x - v.m_x, m_y - v.m_y, m_z - v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator*(const Vector3& v) const {\n",
+    "      return { m_x * v.m_x, m_y * v.m_y, m_z * v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator/(const Vector3& v) const {\n",
+    "      return { m_x / v.m_x, m_y / v.m_y, m_z / v.m_z };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator+(double a) const {\n",
+    "      return { m_x + a, m_y + a, m_z + a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator-(double a) const {\n",
+    "      return { m_x - a, m_y - a, m_z - a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator*(double a) const {\n",
+    "      return { m_x * a, m_y * a, m_z * a };\n",
+    "    }\n",
+    "    __device__ const Vector3 operator/(double a) const {\n",
+    "      const double inv_a = 1.0 / a;\n",
+    "      return { m_x * inv_a, m_y * inv_a, m_z * inv_a };\n",
+    "    }\n",
+    "\n",
+    "    __device__ Vector3& operator+=(const Vector3& v) {\n",
+    "      m_x += v.m_x;\n",
+    "      m_y += v.m_y;\n",
+    "      m_z += v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator-=(const Vector3& v) {\n",
+    "      m_x -= v.m_x;\n",
+    "      m_y -= v.m_y;\n",
+    "      m_z -= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator*=(const Vector3& v) {\n",
+    "      m_x *= v.m_x;\n",
+    "      m_y *= v.m_y;\n",
+    "      m_z *= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator/=(const Vector3& v) {\n",
+    "      m_x /= v.m_x;\n",
+    "      m_y /= v.m_y;\n",
+    "      m_z /= v.m_z;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator+=(double a) {\n",
+    "      m_x += a;\n",
+    "      m_y += a;\n",
+    "      m_z += a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator-=(double a) {\n",
+    "      m_x -= a;\n",
+    "      m_y -= a;\n",
+    "      m_z -= a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator*=(double a) {\n",
+    "      m_x *= a;\n",
+    "      m_y *= a;\n",
+    "      m_z *= a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "    __device__ Vector3& operator/=(double a) {\n",
+    "      const double inv_a = 1.0 / a;\n",
+    "      m_x *= inv_a;\n",
+    "      m_y *= inv_a;\n",
+    "      m_z *= inv_a;\n",
+    "      return *this;\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Dot(const Vector3& v) const {\n",
+    "      return m_x * v.m_x + m_y * v.m_y + m_z * v.m_z;\n",
+    "    }\n",
+    "    __device__ const Vector3 Cross(const Vector3& v) const {\n",
+    "      return {\n",
+    "        m_y * v.m_z - m_z * v.m_y,\n",
+    "        m_z * v.m_x - m_x * v.m_z,\n",
+    "        m_x * v.m_y - m_y * v.m_x\n",
+    "      };\n",
+    "    }\n",
+    "\n",
+    "    __device__ bool operator==(const Vector3& rhs) const {\n",
+    "      return m_x == rhs.m_x && m_y == rhs.m_y && m_z == rhs.m_z;\n",
+    "    }\n",
+    "    __device__ bool operator!=(const Vector3& rhs) const {\n",
+    "      return !(*this == rhs);\n",
+    "    }\n",
+    "\n",
+    "    __device__ double& operator[](size_t i) {\n",
+    "      return (&m_x)[i];\n",
+    "    }\n",
+    "    __device__ double operator[](size_t i) const {\n",
+    "      return (&m_x)[i];\n",
+    "    }\n",
+    "\n",
+    "    __device__ size_t MinDimension() const {\n",
+    "      return (m_x < m_y && m_x < m_z) ? 0u : ((m_y < m_z) ? 1u : 2u);\n",
+    "    }\n",
+    "    __device__ size_t MaxDimension() const {\n",
+    "      return (m_x > m_y && m_x > m_z) ? 0u : ((m_y > m_z) ? 1u : 2u);\n",
+    "    }\n",
+    "    __device__ double Min() const {\n",
+    "      return fmin(m_x, fmin(m_y, m_z));\n",
+    "    }\n",
+    "    __device__ double Max() const {\n",
+    "      return fmax(m_x, fmax(m_y, m_z));\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Norm2_squared() const {\n",
+    "      return m_x * m_x + m_y * m_y + m_z * m_z;\n",
+    "    }\n",
+    "\n",
+    "    __device__ double Norm2() const {\n",
+    "      return std::sqrt(Norm2_squared());\n",
+    "    }\n",
+    "\n",
+    "    __device__ void Normalize() {\n",
+    "      const double a = 1.0 / Norm2();\n",
+    "      m_x *= a;\n",
+    "      m_y *= a;\n",
+    "      m_z *= a;\n",
+    "    }\n",
+    "\n",
+    "    double m_x, m_y, m_z;\n",
+    "  };\n",
+    "\n",
+    "  __device__ inline const Vector3 operator+(double a, const Vector3& v) {\n",
+    "    return { a + v.m_x, a + v.m_y, a + v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator-(double a, const Vector3& v) {\n",
+    "    return { a - v.m_x, a - v.m_y, a - v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator*(double a, const Vector3& v) {\n",
+    "    return { a * v.m_x, a * v.m_y, a * v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 operator/(double a, const Vector3& v) {\n",
+    "    return { a / v.m_x, a / v.m_y, a / v.m_z };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Sqrt(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::sqrt(v.m_x),\n",
+    "      std::sqrt(v.m_y),\n",
+    "      std::sqrt(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Pow(const Vector3& v, double a) {\n",
+    "    return {\n",
+    "      std::pow(v.m_x, a),\n",
+    "      std::pow(v.m_y, a),\n",
+    "      std::pow(v.m_z, a)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Abs(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::abs(v.m_x),\n",
+    "      std::abs(v.m_y),\n",
+    "      std::abs(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Min(const Vector3& v1, const Vector3& v2) {\n",
+    "    return {\n",
+    "      fmin(v1.m_x, v2.m_x),\n",
+    "      fmin(v1.m_y, v2.m_y),\n",
+    "      fmin(v1.m_z, v2.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Max(const Vector3& v1, const Vector3& v2) {\n",
+    "    return {\n",
+    "      fmax(v1.m_x, v2.m_x),\n",
+    "      fmax(v1.m_y, v2.m_y),\n",
+    "      fmax(v1.m_z, v2.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Round(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::round(v.m_x),\n",
+    "      std::round(v.m_y),\n",
+    "      std::round(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Floor(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::floor(v.m_x),\n",
+    "      std::floor(v.m_y),\n",
+    "      std::floor(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Ceil(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::ceil(v.m_x),\n",
+    "      std::ceil(v.m_y),\n",
+    "      std::ceil(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Trunc(const Vector3& v) {\n",
+    "    return {\n",
+    "      std::trunc(v.m_x),\n",
+    "      std::trunc(v.m_y),\n",
+    "      std::trunc(v.m_z)\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Clamp(const Vector3& v, \n",
+    "                      double low = 0.0, double high = 1.0) {\n",
+    "    return {\n",
+    "      Clamp(v.m_x, low, high),\n",
+    "      Clamp(v.m_y, low, high),\n",
+    "      Clamp(v.m_z, low, high) }\n",
+    "    ;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Lerp(double a, \n",
+    "                     const Vector3& v1, const Vector3& v2) {\n",
+    "    return v1 + a * (v2 - v1);\n",
+    "  }\n",
+    "\n",
+    "  template<size_t X, size_t Y, size_t Z>\n",
+    "  __device__ inline const Vector3 Permute(const Vector3& v) {\n",
+    "    return { v[X], v[Y], v[Z] };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 Normalize(const Vector3& v) {\n",
+    "    const double a = 1.0 / v.Norm2();\n",
+    "    return a * v;\n",
+    "  }\n",
+    "//}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6426445-b5b5-47a7-82e0-d956b86a0bd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#ifdef __unix\n",
+    "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n",
+    "#endif\n",
+    "\n",
+    "//namespace smallpt {\n",
+    "\n",
+    "  inline void WritePPM(uint32_t w, uint32_t h, \n",
+    "             const Vector3* Ls, \n",
+    "             const char* fname = \"cu-image.ppm\") noexcept {\n",
+    "    \n",
+    "    FILE* fp;\n",
+    "    \n",
+    "    fopen_s(&fp, fname, \"w\");\n",
+    "    \n",
+    "    fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n",
+    "    for (size_t i = 0; i < w * h; ++i) {\n",
+    "      fprintf(fp, \"%u %u %u \", \n",
+    "             ToByte(Ls[i].m_x), \n",
+    "             ToByte(Ls[i].m_y), \n",
+    "             ToByte(Ls[i].m_z));\n",
+    "    }\n",
+    "    \n",
+    "    fclose(fp);\n",
+    "  }\n",
+    "//}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cfd9cc8-9419-4834-9dba-6a8c98e64ba2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#define REFRACTIVE_INDEX_OUT 1.0\n",
+    "#define REFRACTIVE_INDEX_IN  1.5\n",
+    "#define EPSILON_SPHERE 1e-4\n",
+    "\n",
+    "//namespace smallpt {\n",
+    "\n",
+    "  struct __align__(16) Ray {\n",
+    "    __device__ explicit Ray(Vector3 o, \n",
+    "                Vector3 d, \n",
+    "                double tmin = 0.0, \n",
+    "                double tmax = std::numeric_limits< double >::infinity(), \n",
+    "                std::uint32_t depth = 0u) noexcept\n",
+    "      : m_o(std::move(o)),\n",
+    "      m_d(std::move(d)),\n",
+    "      m_tmin(tmin),\n",
+    "      m_tmax(tmax),\n",
+    "      m_depth(depth) {};\n",
+    "    Ray(const Ray& ray) noexcept = default;\n",
+    "    Ray(Ray&& ray) noexcept = default;\n",
+    "    ~Ray() = default;\n",
+    "\n",
+    "    Ray& operator=(const Ray& ray) = default;\n",
+    "    Ray& operator=(Ray&& ray) = default;\n",
+    "\n",
+    "    __device__ const Vector3 operator()(double t) const {\n",
+    "      return m_o + m_d * t;\n",
+    "    }\n",
+    "\n",
+    "    Vector3 m_o, m_d;\n",
+    "    mutable double m_tmin, m_tmax;\n",
+    "    std::uint32_t m_depth;\n",
+    "  };\n",
+    "\n",
+    "  enum struct Reflection_t {\n",
+    "    Diffuse,\n",
+    "    Specular,\n",
+    "    Refractive\n",
+    "  };\n",
+    "\n",
+    "  struct __align__(16) Sphere {\n",
+    "    __host__ __device__ explicit Sphere(double r,\n",
+    "                      Vector3 p,\n",
+    "                      Vector3 e,\n",
+    "                      Vector3 f,\n",
+    "                      Reflection_t reflection_t) noexcept\n",
+    "      : m_r(r),\n",
+    "      m_p(std::move(p)),\n",
+    "      m_e(std::move(e)),\n",
+    "      m_f(std::move(f)),\n",
+    "      m_reflection_t(reflection_t) {}\n",
+    "    Sphere(const Sphere& sphere) noexcept = default;\n",
+    "    Sphere(Sphere&& sphere) noexcept = default;\n",
+    "    ~Sphere() = default;\n",
+    "\n",
+    "    Sphere& operator=(const Sphere& sphere) = default;\n",
+    "    Sphere& operator=(Sphere&& sphere) = default;\n",
+    "    \n",
+    "    __device__ bool Intersect(const Ray& ray) const {\n",
+    "      const Vector3 op = m_p - ray.m_o;\n",
+    "      const double dop = ray.m_d.Dot(op);\n",
+    "      const double D = dop * dop - op.Dot(op) + m_r * m_r;\n",
+    "\n",
+    "      if (0.0 > D) {\n",
+    "        return false;\n",
+    "      }\n",
+    "\n",
+    "      const double sqrtD = sqrt(D);\n",
+    "\n",
+    "      const double tmin = dop - sqrtD;\n",
+    "      if (ray.m_tmin < tmin && tmin < ray.m_tmax) {\n",
+    "        ray.m_tmax = tmin;\n",
+    "        return true;\n",
+    "      }\n",
+    "\n",
+    "      const double tmax = dop + sqrtD;\n",
+    "      if (ray.m_tmin < tmax && tmax < ray.m_tmax) {\n",
+    "        ray.m_tmax = tmax;\n",
+    "        return true;\n",
+    "      }\n",
+    "\n",
+    "      return false;\n",
+    "    }\n",
+    "\n",
+    "    double m_r;\n",
+    "    Vector3 m_p; // position\n",
+    "    Vector3 m_e; // emission\n",
+    "    Vector3 m_f; // reflection\n",
+    "    Reflection_t m_reflection_t;\n",
+    "  };\n",
+    "\n",
+    "    \n",
+    "  const Sphere g_spheres[] = {\n",
+    "    Sphere(1e5,  Vector3(1e5 + 1, 40.8, 81.6),   Vector3(),   Vector3(0.75,0.25,0.25), Reflection_t::Diffuse),    //Left\n",
+    "    Sphere(1e5,  Vector3(-1e5 + 99, 40.8, 81.6), Vector3(),   Vector3(0.25,0.25,0.75), Reflection_t::Diffuse),    //Right\n",
+    "    Sphere(1e5,  Vector3(50, 40.8, 1e5),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Back\n",
+    "    Sphere(1e5,  Vector3(50, 40.8, -1e5 + 170),  Vector3(),   Vector3(),               Reflection_t::Diffuse),    //Front\n",
+    "    Sphere(1e5,  Vector3(50, 1e5, 81.6),         Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Bottom\n",
+    "    Sphere(1e5,  Vector3(50, -1e5 + 81.6, 81.6), Vector3(),   Vector3(0.75),           Reflection_t::Diffuse),    //Top\n",
+    "    Sphere(16.5, Vector3(27, 16.5, 47),          Vector3(),   Vector3(0.999),          Reflection_t::Specular),   //Mirror\n",
+    "    Sphere(16.5, Vector3(73, 16.5, 78),          Vector3(),   Vector3(0.999),          Reflection_t::Refractive), //Glass\n",
+    "    Sphere(600,  Vector3(50, 681.6 - .27, 81.6), Vector3(12), Vector3(),               Reflection_t::Diffuse)     //Light\n",
+    "  };\n",
+    "\n",
+    "\n",
+    "  __device__ inline Vector3 UniformSampleOnHemisphere(double u1,  double u2) {\n",
+    "    // u1 := cos_theta\n",
+    "    const double sin_theta = std::sqrt(fmax(0.0, 1.0 - u1 * u1));\n",
+    "    const double phi = 2.0 * g_pi * u2;\n",
+    "    return {\n",
+    "      std::cos(phi) * sin_theta,\n",
+    "      std::sin(phi) * sin_theta,\n",
+    "      u1\n",
+    "    };\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline Vector3 CosineWeightedSampleOnHemisphere(double u1, double u2) {\n",
+    "    const double cos_theta = sqrt(1.0 - u1);\n",
+    "    const double sin_theta = sqrt(u1);\n",
+    "    const double phi = 2.0 * g_pi * u2;\n",
+    "    return {\n",
+    "      std::cos(phi) * sin_theta,\n",
+    "      std::sin(phi) * sin_theta,\n",
+    "      cos_theta\n",
+    "    };\n",
+    "  }\n",
+    "    \n",
+    "  __device__ inline double Reflectance0(double n1, double n2) {\n",
+    "    const double sqrt_R0 = (n1 - n2) / (n1 + n2);\n",
+    "    return sqrt_R0 * sqrt_R0;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline double SchlickReflectance(double n1, \n",
+    "                        double n2, \n",
+    "                        double c) {\n",
+    "    const double R0 = Reflectance0(n1, n2);\n",
+    "    return R0 + (1.0 - R0) * c * c * c * c * c;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 IdealSpecularReflect(const Vector3& d, const Vector3& n) {\n",
+    "    return d - 2.0 * n.Dot(d) * n;\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline const Vector3 IdealSpecularTransmit(const Vector3& d, \n",
+    "                              const Vector3& n, \n",
+    "                              double n_out, \n",
+    "                              double n_in, \n",
+    "                              double& pr, \n",
+    "                              curandState* state) {\n",
+    "    \n",
+    "    const Vector3 d_Re = IdealSpecularReflect(d, n);\n",
+    "\n",
+    "    const bool out_to_in = (0.0 > n.Dot(d));\n",
+    "    const Vector3 nl = out_to_in ? n : -n;\n",
+    "    const double nn = out_to_in ? n_out / n_in : n_in / n_out;\n",
+    "    const double cos_theta = d.Dot(nl);\n",
+    "    const double cos2_phi = 1.0 - nn * nn * (1.0 - cos_theta * cos_theta);\n",
+    "\n",
+    "    // Total Internal Reflection\n",
+    "    if (0.0 > cos2_phi) {\n",
+    "      pr = 1.0;\n",
+    "      return d_Re;\n",
+    "    }\n",
+    "\n",
+    "    const Vector3 d_Tr = Normalize(nn * d - nl * (nn * cos_theta + sqrt(cos2_phi)));\n",
+    "    const double c = 1.0 - (out_to_in ? -cos_theta : d_Tr.Dot(n));\n",
+    "\n",
+    "    const double Re = SchlickReflectance(n_out, n_in, c);\n",
+    "    const double p_Re = 0.25 + 0.5 * Re;\n",
+    "    if (curand_uniform_double(state) < p_Re) {\n",
+    "      pr = (Re / p_Re);\n",
+    "      return d_Re;\n",
+    "    }\n",
+    "    else {\n",
+    "      const double Tr = 1.0 - Re;\n",
+    "      const double p_Tr = 1.0 - p_Re;\n",
+    "      pr = (Tr / p_Tr);\n",
+    "      return d_Tr;\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  __device__ inline bool Intersect(const Sphere* dev_spheres, \n",
+    "                   size_t nb_spheres, \n",
+    "                   const Ray& ray, \n",
+    "                   size_t& id)\n",
+    "  {\n",
+    "    bool hit = false;\n",
+    "    for (size_t i = 0u; i < nb_spheres; ++i) {\n",
+    "      if (dev_spheres[i].Intersect(ray)) {\n",
+    "        hit = true;\n",
+    "        id  = i;\n",
+    "      }\n",
+    "    }\n",
+    "\n",
+    "    return hit;\n",
+    "  }\n",
+    "\n",
+    "//__device__ static Vector3 Radiance(const Sphere* dev_spheres, \n",
+    "  __device__ Vector3 Radiance(const Sphere* dev_spheres, \n",
+    "                     size_t nb_spheres,\n",
+    "                     const Ray& ray, \n",
+    "                     curandState* state)\n",
+    "  {  \n",
+    "    Ray r = ray;\n",
+    "    Vector3 L;\n",
+    "    Vector3 F(1.0);\n",
+    "\n",
+    "    while (true) {\n",
+    "      size_t id;\n",
+    "      if (!Intersect(dev_spheres, nb_spheres, r, id)) {\n",
+    "        return L;\n",
+    "      }\n",
+    "\n",
+    "      const Sphere& shape = dev_spheres[id];\n",
+    "      const Vector3 p = r(r.m_tmax);\n",
+    "      const Vector3 n = Normalize(p - shape.m_p);\n",
+    "\n",
+    "      L += F * shape.m_e;\n",
+    "      F *= shape.m_f;\n",
+    "\n",
+    "      // Russian roulette\n",
+    "      if (4 < r.m_depth) {\n",
+    "        const double continue_probability = shape.m_f.Max();\n",
+    "        if (curand_uniform_double(state) >= continue_probability) {\n",
+    "          return L;\n",
+    "        }\n",
+    "        F /= continue_probability;\n",
+    "      }\n",
+    "\n",
+    "      // Next path segment\n",
+    "      switch (shape.m_reflection_t) {\n",
+    "      \n",
+    "      case Reflection_t::Specular: {\n",
+    "        const Vector3 d = IdealSpecularReflect(r.m_d, n);\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "        break;\n",
+    "      }\n",
+    "      \n",
+    "      case Reflection_t::Refractive: {\n",
+    "        double pr;\n",
+    "        const Vector3 d = IdealSpecularTransmit(r.m_d, n, REFRACTIVE_INDEX_OUT, REFRACTIVE_INDEX_IN, pr, state);\n",
+    "        F *= pr;\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "        break;\n",
+    "      }\n",
+    "      \n",
+    "      default: {\n",
+    "        const Vector3 w = (0.0 > n.Dot(r.m_d)) ? n : -n;\n",
+    "        const Vector3 u = Normalize((abs(w.m_x) > 0.1 ? Vector3(0.0, 1.0, 0.0) : Vector3(1.0, 0.0, 0.0)).Cross(w));\n",
+    "        const Vector3 v = w.Cross(u);\n",
+    "\n",
+    "        const Vector3 sample_d = CosineWeightedSampleOnHemisphere(curand_uniform_double(state), curand_uniform_double(state));\n",
+    "        const Vector3 d = Normalize(sample_d.m_x * u + sample_d.m_y * v + sample_d.m_z * w);\n",
+    "        r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n",
+    "      }\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "//  __global__ static void kernel(const Sphere* dev_spheres, \n",
+    "  __global__ void kernel(const Sphere* dev_spheres, \n",
+    "                  size_t nb_spheres,\n",
+    "                  std::uint32_t w, \n",
+    "                  std::uint32_t h, \n",
+    "                  Vector3* Ls, \n",
+    "                  std::uint32_t nb_samples)\n",
+    "  {\n",
+    "      \n",
+    "    printf(\"!!!\\n\");\n",
+    "\n",
+    "    const std::uint32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n",
+    "    const std::uint32_t y = threadIdx.y + blockIdx.y * blockDim.y;\n",
+    "    const std::uint32_t offset = x + y * blockDim.x * gridDim.x;\n",
+    "\n",
+    "    if (x >= w || y >= h) {\n",
+    "      return;\n",
+    "    }\n",
+    "\n",
+    "    // RNG\n",
+    "    curandState state;\n",
+    "    curand_init(offset, 0u, 0u, &state);\n",
+    "\n",
+    "    const Vector3 eye = { 50.0, 52.0, 295.6 };\n",
+    "    const Vector3 gaze = Normalize(Vector3(0.0, -0.042612, -1.0));\n",
+    "    const double fov = 0.5135;\n",
+    "    const Vector3 cx = { w * fov / h, 0.0, 0.0 };\n",
+    "    const Vector3 cy = Normalize(cx.Cross(gaze)) * fov;\n",
+    "\n",
+    "    for (size_t sy = 0u, i = (h - 1u - y) * w + x; sy < 2u; ++sy) { // 2 subpixel row\n",
+    "\n",
+    "      for (size_t sx = 0u; sx < 2u; ++sx) { // 2 subpixel column\n",
+    "\n",
+    "        Vector3 L;\n",
+    "\n",
+    "        for (size_t s = 0u; s < nb_samples; ++s) { // samples per subpixel\n",
+    "          const double u1 = 2.0 * curand_uniform_double(&state);\n",
+    "          const double u2 = 2.0 * curand_uniform_double(&state);\n",
+    "          const double dx = (u1 < 1.0) ? sqrt(u1) - 1.0 : 1.0 - sqrt(2.0 - u1);\n",
+    "          const double dy = (u2 < 1.0) ? sqrt(u2) - 1.0 : 1.0 - sqrt(2.0 - u2);\n",
+    "          const Vector3 d = cx * (((sx + 0.5 + dx) * 0.5 + x) / w - 0.5) +\n",
+    "                          cy * (((sy + 0.5 + dy) * 0.5 + y) / h - 0.5) + gaze;\n",
+    "          \n",
+    "          L += Radiance(dev_spheres, nb_spheres, \n",
+    "            Ray(eye + d * 130, Normalize(d), EPSILON_SPHERE), &state) \n",
+    "            * (1.0 / nb_samples);\n",
+    "        }\n",
+    "        \n",
+    "        Ls[i] += 0.25 * Clamp(L);\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "//  static void Render(std::uint32_t nb_samples) noexcept\n",
+    "  void Render(std::uint32_t nb_samples) noexcept\n",
+    "  {\n",
+    "    const std::uint32_t w = 256u; //1024u;\n",
+    "    const std::uint32_t h = 192u; //768u;\n",
+    "    const std::uint32_t nb_pixels = w * h;\n",
+    "\n",
+    "    // Set up device memory\n",
+    "    Sphere* dev_spheres;\n",
+    "    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_spheres, sizeof(g_spheres)));\n",
+    "    CUDA_SAFE_CALL(cudaMemcpy(dev_spheres, g_spheres, sizeof(g_spheres), cudaMemcpyHostToDevice));\n",
+    "    Vector3* dev_Ls;\n",
+    "    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_Ls, nb_pixels * sizeof(Vector3)));\n",
+    "    CUDA_SAFE_CALL(cudaMemset(dev_Ls, 0, nb_pixels * sizeof(Vector3)));\n",
+    "\n",
+    "    // Kernel execution\n",
+    "    const dim3 nblocks(w / 16u, h / 16u);\n",
+    "    const dim3 nthreads(16u, 16u);\n",
+    "\n",
+    "printf(\"@@@render kernel@@@\\n\");\n",
+    "    kernel<<<nblocks, nthreads>>>(dev_spheres, sizeof(g_spheres)/sizeof(g_spheres[0]),  w, h, dev_Ls, nb_samples);\n",
+    "    CUDA_SAFE_CALL(cudaPeekAtLastError());\n",
+    "    CUDA_SAFE_CALL(cudaDeviceSynchronize());\n",
+    "      \n",
+    "    // Set up host memory\n",
+    "    Vector3* Ls = (Vector3*)malloc(nb_pixels * sizeof(Vector3));\n",
+    "    // Transfer device -> host\n",
+    "    CUDA_SAFE_CALL(cudaMemcpy(Ls, dev_Ls, nb_pixels * sizeof(Vector3), cudaMemcpyDeviceToHost));\n",
+    "\n",
+    "    // Clean up device memory\n",
+    "    CUDA_SAFE_CALL(cudaFree(dev_Ls));\n",
+    "    CUDA_SAFE_CALL(cudaFree(dev_spheres));\n",
+    "\n",
+    "    WritePPM(w, h, Ls);\n",
+    "\n",
+    "    // Clean up host memory\n",
+    "    free(Ls);\n",
+    "  }\n",
+    "\n",
+    "//const std::uint32_t nb_samples = 100;\n",
+    "//smallpt::\n",
+    "//Render(100);\n",
+    "//printf(\"Done Render\\n\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c70fc5e6-4f6b-4d4d-9cdb-6fa91cb1b09e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "void devicePropertyPrint()\n",
+    "{\n",
+    "  int dev = 0;\n",
+    "  cudaDeviceProp devProp;\n",
+    "  if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n",
+    "  {\n",
+    "    printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n",
+    "    printf(\"Device compute capability: %i.%i\\n\", devProp.major, devProp.minor);\n",
+    "    printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n",
+    "    printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n",
+    "    printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n",
+    "    printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n",
+    "    printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n",
+    "    printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n",
+    "    printf(\"\\n\");\n",
+    "  }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "601b07fc-dc2b-4d75-90e1-2b6279d77a1d",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## CUDA kernel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d41025be-e3a0-4de4-bfa8-51c2eb7dc4f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)\n",
+    "{  \n",
+    "  printf(\"###\\n\");\n",
+    "\n",
+    "  int i = blockDim.x * blockIdx.x + threadIdx.x;\n",
+    "\n",
+    "  if (i < numElements) {\n",
+    "    C[i] = A[i] + B[i] + 0.0f;\n",
+    "  }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c178646f-0f60-4da9-988e-9019632f4334",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Host code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01808b34-0e17-428e-b5e6-2e5fd3cdb1c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "int main(void) {\n",
+    "\n",
+    "  // Error code to check return values for CUDA calls\n",
+    "  cudaError_t err = cudaSuccess;\n",
+    "\n",
+    "  // Print the vector length to be used, and compute its size\n",
+    "  int numElements = 50000;\n",
+    "  size_t size = numElements * sizeof(float);\n",
+    "  printf(\"[Vector addition of %d elements]\\n\", numElements);\n",
+    "\n",
+    "  // Allocate the host input vector A\n",
+    "  float *h_A = (float *)malloc(size);\n",
+    "\n",
+    "  // Allocate the host input vector B\n",
+    "  float *h_B = (float *)malloc(size);\n",
+    "\n",
+    "  // Allocate the host output vector C\n",
+    "  float *h_C = (float *)malloc(size);\n",
+    "\n",
+    "  // Verify that allocations succeeded\n",
+    "  if (h_A == NULL || h_B == NULL || h_C == NULL) {\n",
+    "    printf(\"Failed to allocate host vectors!\\n\");\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Initialize the host input vectors\n",
+    "  for (int i = 0; i < numElements; ++i) {\n",
+    "    h_A[i] = rand() / (float)RAND_MAX;\n",
+    "    h_B[i] = rand() / (float)RAND_MAX;\n",
+    "  }\n",
+    "\n",
+    "  // Allocate the device input vector A\n",
+    "  float *d_A = NULL;\n",
+    "  err = cudaMalloc((void **)&d_A, size);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to allocate device vector A (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Allocate the device input vector B\n",
+    "  float *d_B = NULL;\n",
+    "  err = cudaMalloc((void **)&d_B, size);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to allocate device vector B (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Allocate the device output vector C\n",
+    "  float *d_C = NULL;\n",
+    "  err = cudaMalloc((void **)&d_C, size);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to allocate device vector C (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Copy the host input vectors A and B in host memory to the device input\n",
+    "  // vectors in\n",
+    "  // device memory\n",
+    "  printf(\"Copy input data from the host memory to the CUDA device\\n\");\n",
+    "  err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\n",
+    "            \"Failed to copy vector A from host to device (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\n",
+    "            \"Failed to copy vector B from host to device (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Launch the Vector Add CUDA Kernel\n",
+    "  int threadsPerBlock = 256;\n",
+    "  int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;\n",
+    "  printf(\"CUDA kernel launch with %d blocks of %d threads\\n\", blocksPerGrid,\n",
+    "         threadsPerBlock);\n",
+    "printf(\"@@@add kernel@@@\\n\");\n",
+    "  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);\n",
+    "  err = cudaGetLastError();\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to launch vectorAdd kernel (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Copy the device result vector in device memory to the host result vector\n",
+    "  // in host memory.\n",
+    "  printf(\"Copy output data from the CUDA device to the host memory\\n\");\n",
+    "  err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\n",
+    "            \"Failed to copy vector C from device to host (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Verify that the result vector is correct\n",
+    "  for (int i = 0; i < numElements; ++i) {\n",
+    "    if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {\n",
+    "      printf(\"Result verification failed at element %d! A=%f B=%f C=%f\\n\", i, h_A[i], h_B[i], h_C[i]);\n",
+    "      return 1;\n",
+    "      exit(EXIT_FAILURE);\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  printf(\"Test PASSED\\n\");\n",
+    "\n",
+    "  // Free device global memory\n",
+    "  err = cudaFree(d_A);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to free device vector A (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  err = cudaFree(d_B);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to free device vector B (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  err = cudaFree(d_C);\n",
+    "\n",
+    "  if (err != cudaSuccess) {\n",
+    "    printf(\"Failed to free device vector C (error code %s)!\\n\",\n",
+    "            cudaGetErrorString(err));\n",
+    "    exit(EXIT_FAILURE);\n",
+    "  }\n",
+    "\n",
+    "  // Free host memory\n",
+    "  free(h_A);\n",
+    "  free(h_B);\n",
+    "  free(h_C);\n",
+    "\n",
+    "  printf(\"Done Add\\n\");\n",
+    "  return 0;\n",
+    "}\n",
+    "\n",
+    "///\n",
+    "Vector3 v;\n",
+    "///\n",
+    "\n",
+    "devicePropertyPrint();\n",
+    "main();\n",
+    "main();\n",
+    "\n",
+    "Render(100);\n",
+    "printf(\"Done Render\\n\");"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "CUDA (C++17)",
+   "language": "CUDA",
+   "name": "cuda-xcpp17"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "c++",
+   "version": "17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}