Skip to content

Commit 4a1bcc4

Browse files
authored
Add files via upload
1 parent 8aff395 commit 4a1bcc4

4 files changed

+1848
-0
lines changed

Case Study - Balancing Class.ipynb

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 2,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"review_data = pd.read_csv('Reviews.csv')"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": 3,
24+
"metadata": {},
25+
"outputs": [],
26+
"source": [
27+
"review_data = review_data.sample(5000, random_state=0)"
28+
]
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": 4,
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"review_data = review_data[['Text','Score']]"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 5,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"review_data = review_data[review_data.Score != 3]"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": 6,
51+
"metadata": {},
52+
"outputs": [],
53+
"source": [
54+
"review_data['Sentiment'] = review_data.Score.map(lambda x: 0 if x < 3 else 1)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 8,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"from sklearn.feature_extraction.text import CountVectorizer\n",
64+
"from sklearn.naive_bayes import MultinomialNB"
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": 9,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
73+
"cv = CountVectorizer(stop_words='english')\n",
74+
"mnb = MultinomialNB()"
75+
]
76+
},
77+
{
78+
"cell_type": "code",
79+
"execution_count": 10,
80+
"metadata": {},
81+
"outputs": [
82+
{
83+
"data": {
84+
"text/plain": [
85+
"1 3895\n",
86+
"0 692\n",
87+
"Name: Sentiment, dtype: int64"
88+
]
89+
},
90+
"execution_count": 10,
91+
"metadata": {},
92+
"output_type": "execute_result"
93+
}
94+
],
95+
"source": [
96+
"review_data.Sentiment.value_counts()"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": 11,
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"from imblearn.over_sampling import SMOTE"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": 25,
111+
"metadata": {},
112+
"outputs": [],
113+
"source": [
114+
"from sklearn.feature_selection import SelectKBest"
115+
]
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": 31,
120+
"metadata": {},
121+
"outputs": [],
122+
"source": [
123+
"from imblearn.pipeline import make_pipeline as make_pipeline_imb"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": 26,
129+
"metadata": {},
130+
"outputs": [],
131+
"source": [
132+
"word_selector = SelectKBest(k=2000)"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": 32,
138+
"metadata": {},
139+
"outputs": [],
140+
"source": [
141+
"pipeline = make_pipeline_imb(cv, word_selector, SMOTE(random_state=0), mnb)"
142+
]
143+
},
144+
{
145+
"cell_type": "code",
146+
"execution_count": 33,
147+
"metadata": {},
148+
"outputs": [],
149+
"source": [
150+
"from sklearn.model_selection import train_test_split\n",
151+
"trainX, testX, trainY, testY = train_test_split(review_data.Text, review_data.Sentiment, stratify=review_data.Sentiment, random_state=0)"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": 34,
157+
"metadata": {},
158+
"outputs": [
159+
{
160+
"name": "stderr",
161+
"output_type": "stream",
162+
"text": [
163+
"C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\imblearn\\pipeline.py:190: DeprecationWarning: The 'cachedir' attribute has been deprecated in version 0.12 and will be removed in version 0.14.\n",
164+
"Use os.path.join(memory.location, 'joblib') attribute instead.\n",
165+
" if memory.cachedir is None:\n",
166+
"C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\imblearn\\pipeline.py:190: DeprecationWarning: The 'cachedir' attribute has been deprecated in version 0.12 and will be removed in version 0.14.\n",
167+
"Use os.path.join(memory.location, 'joblib') attribute instead.\n",
168+
" if memory.cachedir is None:\n",
169+
"C:\\Users\\awant\\Anaconda3\\lib\\site-packages\\imblearn\\pipeline.py:190: DeprecationWarning: The 'cachedir' attribute has been deprecated in version 0.12 and will be removed in version 0.14.\n",
170+
"Use os.path.join(memory.location, 'joblib') attribute instead.\n",
171+
" if memory.cachedir is None:\n"
172+
]
173+
},
174+
{
175+
"data": {
176+
"text/plain": [
177+
"Pipeline(memory=None,\n",
178+
" steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
179+
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
180+
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
181+
" ngram_range=(1, 1), preprocessor=None, stop_words='english...svm_estimator=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
182+
]
183+
},
184+
"execution_count": 34,
185+
"metadata": {},
186+
"output_type": "execute_result"
187+
}
188+
],
189+
"source": [
190+
"pipeline.fit(trainX,trainY)"
191+
]
192+
},
193+
{
194+
"cell_type": "code",
195+
"execution_count": 19,
196+
"metadata": {},
197+
"outputs": [],
198+
"source": [
199+
"from sklearn.metrics import confusion_matrix"
200+
]
201+
},
202+
{
203+
"cell_type": "code",
204+
"execution_count": 35,
205+
"metadata": {},
206+
"outputs": [
207+
{
208+
"data": {
209+
"text/plain": [
210+
"array([[107, 59],\n",
211+
" [ 71, 910]], dtype=int64)"
212+
]
213+
},
214+
"execution_count": 35,
215+
"metadata": {},
216+
"output_type": "execute_result"
217+
}
218+
],
219+
"source": [
220+
"pred = pipeline.predict(testX)\n",
221+
"confusion_matrix(y_pred=pred, y_true=testY)"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": 36,
227+
"metadata": {},
228+
"outputs": [],
229+
"source": [
230+
"from sklearn.pipeline import make_pipeline"
231+
]
232+
},
233+
{
234+
"cell_type": "code",
235+
"execution_count": 37,
236+
"metadata": {},
237+
"outputs": [],
238+
"source": [
239+
"from sklearn.base import clone\n",
"review_pipeline = make_pipeline(clone(cv), clone(word_selector), clone(mnb))"
240+
]
241+
},
242+
{
243+
"cell_type": "code",
244+
"execution_count": 38,
245+
"metadata": {},
246+
"outputs": [
247+
{
248+
"data": {
249+
"text/plain": [
250+
"Pipeline(memory=None,\n",
251+
" steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
252+
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
253+
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
254+
" ngram_range=(1, 1), preprocessor=None, stop_words='english...x000002A4FB520840>)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])"
255+
]
256+
},
257+
"execution_count": 38,
258+
"metadata": {},
259+
"output_type": "execute_result"
260+
}
261+
],
262+
"source": [
263+
"review_pipeline.fit(trainX,trainY)"
264+
]
265+
},
266+
{
267+
"cell_type": "code",
268+
"execution_count": 39,
269+
"metadata": {},
270+
"outputs": [
271+
{
272+
"data": {
273+
"text/plain": [
274+
"array([[ 49, 117],\n",
275+
" [ 25, 956]], dtype=int64)"
276+
]
277+
},
278+
"execution_count": 39,
279+
"metadata": {},
280+
"output_type": "execute_result"
281+
}
282+
],
283+
"source": [
284+
"pred = review_pipeline.predict(testX)\n",
285+
"confusion_matrix(y_pred=pred, y_true=testY)"
286+
]
287+
},
288+
{
289+
"cell_type": "code",
290+
"execution_count": null,
291+
"metadata": {},
292+
"outputs": [],
293+
"source": []
294+
}
295+
],
296+
"metadata": {
297+
"kernelspec": {
298+
"display_name": "Python 3",
299+
"language": "python",
300+
"name": "python3"
301+
},
302+
"language_info": {
303+
"codemirror_mode": {
304+
"name": "ipython",
305+
"version": 3
306+
},
307+
"file_extension": ".py",
308+
"mimetype": "text/x-python",
309+
"name": "python",
310+
"nbconvert_exporter": "python",
311+
"pygments_lexer": "ipython3",
312+
"version": "3.6.4"
313+
}
314+
},
315+
"nbformat": 4,
316+
"nbformat_minor": 2
317+
}

0 commit comments

Comments
 (0)