Skip to content

Commit ba9ab52

Browse files
committed
Uploaded files
1 parent 03591bc commit ba9ab52

File tree

76 files changed

+41155
-1091
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+41155
-1091
lines changed

DataFrame_Operations.ipynb renamed to DataFrame_operations_basics.ipynb

Lines changed: 204 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Spark DataFrame basic operations\n",
8+
"### Dr. Tirthajyoti Sarkar, Fremont, CA 94536\n",
9+
"In this notebook, we go through basic operations that can be performed on a Spark DataFrame object. We will use a CSV file of stock prices to illustrate the code."
10+
]
11+
},
312
{
413
"cell_type": "code",
514
"execution_count": 1,
@@ -35,7 +44,7 @@
3544
},
3645
{
3746
"cell_type": "code",
38-
"execution_count": 4,
47+
"execution_count": 3,
3948
"metadata": {},
4049
"outputs": [],
4150
"source": [
@@ -51,7 +60,7 @@
5160
},
5261
{
5362
"cell_type": "code",
54-
"execution_count": 5,
63+
"execution_count": 4,
5564
"metadata": {},
5665
"outputs": [
5766
{
@@ -83,7 +92,7 @@
8392
},
8493
{
8594
"cell_type": "code",
86-
"execution_count": 6,
95+
"execution_count": 5,
8796
"metadata": {},
8897
"outputs": [
8998
{
@@ -132,7 +141,7 @@
132141
},
133142
{
134143
"cell_type": "code",
135-
"execution_count": 7,
144+
"execution_count": 6,
136145
"metadata": {},
137146
"outputs": [
138147
{
@@ -141,7 +150,7 @@
141150
"['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']"
142151
]
143152
},
144-
"execution_count": 7,
153+
"execution_count": 6,
145154
"metadata": {},
146155
"output_type": "execute_result"
147156
}
@@ -159,7 +168,7 @@
159168
},
160169
{
161170
"cell_type": "code",
162-
"execution_count": 8,
171+
"execution_count": 7,
163172
"metadata": {},
164173
"outputs": [
165174
{
@@ -188,7 +197,7 @@
188197
},
189198
{
190199
"cell_type": "code",
191-
"execution_count": 12,
200+
"execution_count": 8,
192201
"metadata": {},
193202
"outputs": [
194203
{
@@ -198,7 +207,7 @@
198207
" Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]"
199208
]
200209
},
201-
"execution_count": 12,
210+
"execution_count": 8,
202211
"metadata": {},
203212
"output_type": "execute_result"
204213
}
@@ -209,7 +218,7 @@
209218
},
210219
{
211220
"cell_type": "code",
212-
"execution_count": 14,
221+
"execution_count": 9,
213222
"metadata": {},
214223
"outputs": [],
215224
"source": [
@@ -218,7 +227,7 @@
218227
},
219228
{
220229
"cell_type": "code",
221-
"execution_count": 15,
230+
"execution_count": 10,
222231
"metadata": {},
223232
"outputs": [
224233
{
@@ -233,7 +242,7 @@
233242
" 'Adj Close': 27.727039}"
234243
]
235244
},
236-
"execution_count": 15,
245+
"execution_count": 10,
237246
"metadata": {},
238247
"output_type": "execute_result"
239248
}
@@ -259,7 +268,7 @@
259268
},
260269
{
261270
"cell_type": "code",
262-
"execution_count": 9,
271+
"execution_count": 11,
263272
"metadata": {},
264273
"outputs": [
265274
{
@@ -293,7 +302,7 @@
293302
},
294303
{
295304
"cell_type": "code",
296-
"execution_count": 10,
305+
"execution_count": 12,
297306
"metadata": {},
298307
"outputs": [
299308
{
@@ -322,7 +331,7 @@
322331
},
323332
{
324333
"cell_type": "code",
325-
"execution_count": 11,
334+
"execution_count": 13,
326335
"metadata": {},
327336
"outputs": [
328337
{
@@ -344,6 +353,186 @@
344353
"source": [
345354
"df.filter(\"Close < 500 AND Open > 500\").show(5)"
346355
]
356+
},
357+
{
358+
"cell_type": "markdown",
359+
"metadata": {},
360+
"source": [
361+
"### Now we use DataFrame syntax to achieve the same output "
362+
]
363+
},
364+
{
365+
"cell_type": "code",
366+
"execution_count": 17,
367+
"metadata": {},
368+
"outputs": [
369+
{
370+
"name": "stdout",
371+
"output_type": "stream",
372+
"text": [
373+
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
374+
"| Date| Open| High| Low| Close| Volume| Adj Close|\n",
375+
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
376+
"|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|\n",
377+
"|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|\n",
378+
"|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|\n",
379+
"|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|\n",
380+
"|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|\n",
381+
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
382+
"only showing top 5 rows\n",
383+
"\n"
384+
]
385+
}
386+
],
387+
"source": [
388+
"df.filter(df['Close']<500).show(5)"
389+
]
390+
},
391+
{
392+
"cell_type": "markdown",
393+
"metadata": {},
394+
"source": [
395+
"#### If we need to chain multiple conditions together, use `&` for AND and `|` for OR, and clearly separate the conditions by enclosing each one in parentheses"
396+
]
397+
},
398+
{
399+
"cell_type": "code",
400+
"execution_count": 18,
401+
"metadata": {},
402+
"outputs": [
403+
{
404+
"name": "stdout",
405+
"output_type": "stream",
406+
"text": [
407+
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
408+
"| Date| Open| High| Low| Close| Volume|Adj Close|\n",
409+
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
410+
"|2012-02-15 00:00:00|514.259995| 526.290016|496.88998399999997| 497.669975|376530000|64.477899|\n",
411+
"|2013-09-05 00:00:00|500.250008|500.67997699999995|493.63997699999993|495.26997400000005| 59091900|65.977837|\n",
412+
"|2013-09-10 00:00:00|506.199997| 507.450012| 489.500015|494.63999900000005|185798900|65.893915|\n",
413+
"|2014-01-30 00:00:00|502.539993|506.49997699999994| 496.70002| 499.779984|169625400|66.967353|\n",
414+
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
415+
"\n"
416+
]
417+
}
418+
],
419+
"source": [
420+
"df.filter((df['Close']<500) & (df['Open']>500)).show(5)"
421+
]
422+
},
423+
{
424+
"cell_type": "markdown",
425+
"metadata": {},
426+
"source": [
427+
"#### We can use `==` to test for equality with an exact value and `~` as the NOT operator"
428+
]
429+
},
430+
{
431+
"cell_type": "code",
432+
"execution_count": 20,
433+
"metadata": {},
434+
"outputs": [
435+
{
436+
"name": "stdout",
437+
"output_type": "stream",
438+
"text": [
439+
"+-------------------+------------------+----------+------+------+---------+---------+\n",
440+
"| Date| Open| High| Low| Close| Volume|Adj Close|\n",
441+
"+-------------------+------------------+----------+------+------+---------+---------+\n",
442+
"|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|\n",
443+
"+-------------------+------------------+----------+------+------+---------+---------+\n",
444+
"\n"
445+
]
446+
}
447+
],
448+
"source": [
449+
"df.filter(df['Low']==197.16).show()"
450+
]
451+
},
452+
{
453+
"cell_type": "markdown",
454+
"metadata": {},
455+
"source": [
456+
"### Use the `collect` method instead of `show` to retrieve the actual data"
457+
]
458+
},
459+
{
460+
"cell_type": "code",
461+
"execution_count": 21,
462+
"metadata": {},
463+
"outputs": [],
464+
"source": [
465+
"low_data = df.filter(df['Low']==197.16).collect()"
466+
]
467+
},
468+
{
469+
"cell_type": "code",
470+
"execution_count": 22,
471+
"metadata": {},
472+
"outputs": [
473+
{
474+
"data": {
475+
"text/plain": [
476+
"[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]"
477+
]
478+
},
479+
"execution_count": 22,
480+
"metadata": {},
481+
"output_type": "execute_result"
482+
}
483+
],
484+
"source": [
485+
"low_data"
486+
]
487+
},
488+
{
489+
"cell_type": "markdown",
490+
"metadata": {},
491+
"source": [
492+
"#### It is still a list. So, grab the element at index 0, which is a Row object, and convert it to a dictionary using the `asDict` method"
493+
]
494+
},
495+
{
496+
"cell_type": "code",
497+
"execution_count": 24,
498+
"metadata": {},
499+
"outputs": [],
500+
"source": [
501+
"dt = low_data[0]"
502+
]
503+
},
504+
{
505+
"cell_type": "code",
506+
"execution_count": 26,
507+
"metadata": {},
508+
"outputs": [
509+
{
510+
"data": {
511+
"text/plain": [
512+
"{'Date': datetime.datetime(2010, 1, 22, 0, 0),\n",
513+
" 'Open': 206.78000600000001,\n",
514+
" 'High': 207.499996,\n",
515+
" 'Low': 197.16,\n",
516+
" 'Close': 197.75,\n",
517+
" 'Volume': 220441900,\n",
518+
" 'Adj Close': 25.620401}"
519+
]
520+
},
521+
"execution_count": 26,
522+
"metadata": {},
523+
"output_type": "execute_result"
524+
}
525+
],
526+
"source": [
527+
"dt.asDict()"
528+
]
529+
},
530+
{
531+
"cell_type": "markdown",
532+
"metadata": {},
533+
"source": [
534+
"Now, you can do whatever processing you want to do with the dictionary object!"
535+
]
347536
}
348537
],
349538
"metadata": {
@@ -362,7 +551,7 @@
362551
"name": "python",
363552
"nbconvert_exporter": "python",
364553
"pygments_lexer": "ipython3",
365-
"version": "3.6.6"
554+
"version": "3.6.8"
366555
}
367556
},
368557
"nbformat": 4,

0 commit comments

Comments
 (0)