Commit 0144db4

Author: Eric Wynn
Commit message: Overfitting and Logistic Regression docs and code
Parent: 8d87d93

File tree

14 files changed: +401 additions, -81 deletions


code/overview/overfitting/example.py

Lines changed: 0 additions & 50 deletions
This file was deleted.
Lines changed: 32 additions & 0 deletions
import matplotlib.pyplot as plt


def real_funct(x):
    return [-(i**2) for i in x]


def over_funct(x):
    return [-0.5*(i**3) - (i**2) for i in x]


def under_funct(x):
    return [6*i + 9 for i in x]


# Create x values, and run them through each function.
x = range(-3, 4, 1)
real_y = real_funct(x)
over_y = over_funct(x)
under_y = under_funct(x)

# Use matplotlib to plot the functions so they can be visually compared.
plt.plot(x, real_y, 'k', label='Real function')
plt.plot(x, over_y, 'r', label='Overfit function')
plt.plot(x, under_y, 'b', label='Underfit function')
plt.legend()
plt.show()

# Output the data in a well formatted way, for the more numerically inclined.
print("An underfit model may output something like this:")
for i in range(0, 7):
    print("x: " + str(x[i]) + ", real y: " + str(real_y[i]) + ", y: " + str(under_y[i]))

print("An overfit model may look a little like this:")
for i in range(0, 7):
    print("x: " + str(x[i]) + ", real y: " + str(real_y[i]) + ", y: " + str(over_y[i]))
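The over- and underfit curves above are written by hand. As a complement, the same effect can be produced by actually fitting polynomials of different degrees; the following is a sketch, not part of this commit, assuming NumPy is available and using noisy samples of the same true function y = -x^2:

```python
import numpy as np

# Noisy samples of the true function y = -x^2 from the example above.
rng = np.random.default_rng(0)
x = np.linspace(-3, 3, 7)
y = -x**2 + rng.normal(scale=0.1, size=x.size)

# Fit polynomials of increasing degree. Degree 1 underfits (a line cannot
# follow a parabola), degree 2 matches the true model, and degree 6 has
# enough freedom to pass through every noisy point -- the seed of overfitting.
errors = {}
for degree in (1, 2, 6):
    coeffs = np.polyfit(x, y, degree)
    errors[degree] = np.mean((np.polyval(coeffs, x) - y) ** 2)
    print("degree", degree, "training MSE:", errors[degree])
```

Note that the training error alone is misleading here: degree 6 has the lowest training error precisely because it is memorizing the noise, which is why the next sections evaluate on data the model has not seen.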
Lines changed: 57 additions & 0 deletions
from sklearn.linear_model import LogisticRegression
import numpy as np
import random


# Defines the classification for the training data.
def true_classifier(i):
    if i >= 700:
        return 1
    return 0


# Generate a random dataset which includes random scores from 0 to 1000.
x = np.array([random.randint(0, 1000) for i in range(0, 1000)])

# The model will expect a 2D array, so we must reshape.
# For the model, the 2D array must have rows equal to the number of samples,
# and columns equal to the number of features.
# For this example, we have 1000 samples and 1 feature.
x = x.reshape((-1, 1))

# For each point, y is a pass/fail for the grade. The simple threshold is arbitrary,
# and can be changed as you would like. Classes are 1 for success and 0 for failure.
y = [true_classifier(x[i][0]) for i in range(0, 1000)]

# Again, we need a numpy array, so we convert.
y = np.array(y)

# Our goal will be to train a logistic regression model to do pass/fail to the same threshold.
model = LogisticRegression(solver='liblinear')

# The fit method actually fits the model to our training data.
model = model.fit(x, y)

# Create 100 random samples to try against our model as test data.
samples = [random.randint(0, 1000) for i in range(0, 100)]
# Once again, we need a 2D numpy array.
samples = np.array(samples)
samples = samples.reshape(-1, 1)

# Now we use our model against the samples. proba is the probability, and _class is the class.
_class = model.predict(samples)
proba = model.predict_proba(samples)

num_accurate = 0

# Finally, output the results, formatted for nicer viewing.
# The format is [<sample value>]: Class <class number>, probability [<probability for class 0> <probability for class 1>]
# So, the probability array is the probability of failure, followed by the probability of passing.
# In an example run, [7]: Class 0, probability [9.99966694e-01 3.33062825e-05]
# means that for value 7, the class is 0 (failure) and the probability of failure is 99.9%.
for i in range(0, 100):
    if true_classifier(samples[i][0]) == _class[i]:
        num_accurate = num_accurate + 1
    print(str(samples[i]) + ": Class " + str(_class[i]) + ", probability " + str(proba[i]))
# Skip a line to separate the overall result from the sample output.
print("")
print(str(num_accurate) + " out of 100 correct.")
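The script above already evaluates on fresh random samples rather than the training set, which is the right instinct for catching overfitting. scikit-learn can make that split explicit; the following is a sketch, not part of this commit, using `train_test_split` on the same synthetic pass/fail data so the accuracy check runs only on held-out samples:

```python
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Same synthetic data as above: scores 0-1000, with 700 as the passing threshold.
random.seed(0)
x = np.array([random.randint(0, 1000) for _ in range(1000)]).reshape(-1, 1)
y = np.array([1 if v >= 700 else 0 for v in x.ravel()])

# Hold out 20% of the data; the model never sees it during fit().
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

model = LogisticRegression(solver='liblinear').fit(x_train, y_train)

# score() returns mean accuracy. Similar training and validation accuracy
# suggests the model is not overfit to the training set.
print("train accuracy:", model.score(x_train, y_train))
print("validation accuracy:", model.score(x_val, y_val))
```

Because the held-out 200 samples played no part in fitting, their accuracy is a more honest estimate of how the model would behave on new scores.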

docs/source/content/overview/overfitting.rst

Lines changed: 81 additions & 7 deletions
----------------------------
Overview
----------------------------
When using machine learning, there are many ways to go wrong.
Some of the most common issues in machine learning are **overfitting** and **underfitting**.
To understand these concepts, let's imagine a machine learning model that is
trying to learn to classify numbers, and has access to a training set of data and a testing set of data.

----------------------------
Overfitting
----------------------------

A model suffers from **Overfitting** when it has learned too much from the
training data, and does not perform well in practice as a result.
This is usually caused by the model having too much exposure to the training data.
For the number classification example, if the model is overfit in this way, it
may be picking up on tiny details that are misleading, like stray marks as an indication of a specific number.

The estimate looks pretty good when you look at the middle of the graph, but the edges have large error.
In practice, this error isn't always at edge cases and can pop up anywhere.
The noise in training can cause error as seen in the graph below.

.. figure:: _img/Overfit_small.png
   :scale: 10 %
   :alt: Overfit

   (Created using https://www.desmos.com/calculator/dffnj2jbow)

In this example, the data is overfit by a polynomial of too high a degree.
The indicated points are true to the function y = x^2, but the estimate does not approximate the function well outside of those points.

----------------------------
Underfitting
----------------------------

A model suffers from **Underfitting** when it has not learned enough from the
training data, and does not perform well in practice as a result.
As a direct contrast to the previous idea, this issue is caused by not letting
the model learn enough from the training data.
In the number classification example, if the training set is too small or the
model has not had enough attempts to learn from it, then it will not be able to pick out key features of the numbers.

The issue with this estimate is clear to the human eye: the model should be
nonlinear, and is instead just a simple line.
In machine learning, this could be a result of underfitting: the model has not
had enough exposure to training data to adapt to it, and is currently in a simple state.

.. figure:: _img/Underfit.PNG
   :scale: 50 %
   :alt: Underfit

   (Created using Wolfram Alpha)

----------------------------
Motivation
----------------------------

Finding a good fit is one of the central problems in machine learning.
Gaining a good grasp of how to avoid fitting problems before even worrying
about specific methods can keep models on track.
The mindset of hunting for a good fit, rather than throwing more learning
time at a model, is very important to have.

----------------------------
Code
----------------------------

The example code for overfitting shows some basic examples based in polynomial
interpolation, trying to find the equation of a graph.
In the overfitting.py_ file, you can see that there is a true function being
modeled, as well as some estimates that are shown to not be accurate.

.. _overfitting.py: https://github.com/machinelearningmindset/machine-learning-course/blob/master/code/overview/overfitting/overfitting.py

The estimates are representations of overfitting and underfitting.
For overfitting, a higher degree polynomial is used (x cubed instead of squared).
While the data is relatively close for the chosen points, there are some artifacts outside of them.
The example of underfitting, however, does not even achieve accuracy at many of the points.
Underfitting is similar to having a linear model when trying to model a quadratic function.
The model does well on the point(s) it trained on, in this case the point used
for the linear estimate, but poorly otherwise.

----------------------------
Conclusion
----------------------------

Check out the cross-validation and regularization sections for information on
how to avoid overfitting in machine learning models.
Ideally, a good fit looks something like this:

.. figure:: _img/GoodFit.PNG
   :scale: 50 %
   :alt: Good Fit

   (Created using Wolfram Alpha)

When using machine learning in any capacity, issues such as overfitting
frequently come up, and having a grasp of the concept is very important.
The modules in this section are among the most important in the whole repository,
since regardless of the implementation, machine learning always includes these fundamentals.
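As a preview of the cross-validation idea the conclusion points to, the under/overfitting comparison can be scored automatically. This is a sketch under the assumption that scikit-learn is available; it is not part of the course code. `cross_val_score` evaluates each candidate model only on folds it was not fitted on:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Noisy samples of the section's running example, y = -x^2.
rng = np.random.default_rng(1)
X = rng.uniform(-3, 3, size=(60, 1))
y = -X.ravel() ** 2 + rng.normal(scale=0.5, size=60)

# Score polynomial models of several degrees with 5-fold cross-validation.
# Each validation fold is never seen during fitting, so an overfit model
# cannot hide behind a low training error.
scores = {}
for degree in (1, 2, 9):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    scores[degree] = cross_val_score(model, X, y, cv=5).mean()
    print("degree", degree, "mean CV R^2:", round(scores[degree], 3))
```

The underfit degree-1 model scores poorly on the held-out folds, while the degree-2 model, which matches the true function, scores well; this is the mechanism behind the "validation set" advice for avoiding both failure modes.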
