|
1 | 1 | {
|
2 | 2 | "cells": [
|
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Spark DataFrame basic operations\n", |
| 8 | + "### Dr. Tirthajyoti Sarkar, Fremont, CA 94536\n", |
| 9 | + "In this notebook, we go through basic operations that can be performed with a Spark DataFrame object. We will use a .CSV file of stock prices to illustrate the code." |
| 10 | + ] |
| 11 | + }, |
3 | 12 | {
|
4 | 13 | "cell_type": "code",
|
5 | 14 | "execution_count": 1,
|
|
35 | 44 | },
|
36 | 45 | {
|
37 | 46 | "cell_type": "code",
|
38 |
| - "execution_count": 4, |
| 47 | + "execution_count": 3, |
39 | 48 | "metadata": {},
|
40 | 49 | "outputs": [],
|
41 | 50 | "source": [
|
|
51 | 60 | },
|
52 | 61 | {
|
53 | 62 | "cell_type": "code",
|
54 |
| - "execution_count": 5, |
| 63 | + "execution_count": 4, |
55 | 64 | "metadata": {},
|
56 | 65 | "outputs": [
|
57 | 66 | {
|
|
83 | 92 | },
|
84 | 93 | {
|
85 | 94 | "cell_type": "code",
|
86 |
| - "execution_count": 6, |
| 95 | + "execution_count": 5, |
87 | 96 | "metadata": {},
|
88 | 97 | "outputs": [
|
89 | 98 | {
|
|
132 | 141 | },
|
133 | 142 | {
|
134 | 143 | "cell_type": "code",
|
135 |
| - "execution_count": 7, |
| 144 | + "execution_count": 6, |
136 | 145 | "metadata": {},
|
137 | 146 | "outputs": [
|
138 | 147 | {
|
|
141 | 150 | "['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']"
|
142 | 151 | ]
|
143 | 152 | },
|
144 |
| - "execution_count": 7, |
| 153 | + "execution_count": 6, |
145 | 154 | "metadata": {},
|
146 | 155 | "output_type": "execute_result"
|
147 | 156 | }
|
|
159 | 168 | },
|
160 | 169 | {
|
161 | 170 | "cell_type": "code",
|
162 |
| - "execution_count": 8, |
| 171 | + "execution_count": 7, |
163 | 172 | "metadata": {},
|
164 | 173 | "outputs": [
|
165 | 174 | {
|
|
188 | 197 | },
|
189 | 198 | {
|
190 | 199 | "cell_type": "code",
|
191 |
| - "execution_count": 12, |
| 200 | + "execution_count": 8, |
192 | 201 | "metadata": {},
|
193 | 202 | "outputs": [
|
194 | 203 | {
|
|
198 | 207 | " Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]"
|
199 | 208 | ]
|
200 | 209 | },
|
201 |
| - "execution_count": 12, |
| 210 | + "execution_count": 8, |
202 | 211 | "metadata": {},
|
203 | 212 | "output_type": "execute_result"
|
204 | 213 | }
|
|
209 | 218 | },
|
210 | 219 | {
|
211 | 220 | "cell_type": "code",
|
212 |
| - "execution_count": 14, |
| 221 | + "execution_count": 9, |
213 | 222 | "metadata": {},
|
214 | 223 | "outputs": [],
|
215 | 224 | "source": [
|
|
218 | 227 | },
|
219 | 228 | {
|
220 | 229 | "cell_type": "code",
|
221 |
| - "execution_count": 15, |
| 230 | + "execution_count": 10, |
222 | 231 | "metadata": {},
|
223 | 232 | "outputs": [
|
224 | 233 | {
|
|
233 | 242 | " 'Adj Close': 27.727039}"
|
234 | 243 | ]
|
235 | 244 | },
|
236 |
| - "execution_count": 15, |
| 245 | + "execution_count": 10, |
237 | 246 | "metadata": {},
|
238 | 247 | "output_type": "execute_result"
|
239 | 248 | }
|
|
259 | 268 | },
|
260 | 269 | {
|
261 | 270 | "cell_type": "code",
|
262 |
| - "execution_count": 9, |
| 271 | + "execution_count": 11, |
263 | 272 | "metadata": {},
|
264 | 273 | "outputs": [
|
265 | 274 | {
|
|
293 | 302 | },
|
294 | 303 | {
|
295 | 304 | "cell_type": "code",
|
296 |
| - "execution_count": 10, |
| 305 | + "execution_count": 12, |
297 | 306 | "metadata": {},
|
298 | 307 | "outputs": [
|
299 | 308 | {
|
|
322 | 331 | },
|
323 | 332 | {
|
324 | 333 | "cell_type": "code",
|
325 |
| - "execution_count": 11, |
| 334 | + "execution_count": 13, |
326 | 335 | "metadata": {},
|
327 | 336 | "outputs": [
|
328 | 337 | {
|
|
344 | 353 | "source": [
|
345 | 354 | "df.filter(\"Close < 500 AND Open > 500\").show(5)"
|
346 | 355 | ]
|
| 356 | + }, |
| 357 | + { |
| 358 | + "cell_type": "markdown", |
| 359 | + "metadata": {}, |
| 360 | + "source": [ |
| 361 | + "### Now we use DataFrame syntax to achieve the same output " |
| 362 | + ] |
| 363 | + }, |
| 364 | + { |
| 365 | + "cell_type": "code", |
| 366 | + "execution_count": 17, |
| 367 | + "metadata": {}, |
| 368 | + "outputs": [ |
| 369 | + { |
| 370 | + "name": "stdout", |
| 371 | + "output_type": "stream", |
| 372 | + "text": [ |
| 373 | + "+-------------------+----------+----------+------------------+------------------+---------+------------------+\n", |
| 374 | + "| Date| Open| High| Low| Close| Volume| Adj Close|\n", |
| 375 | + "+-------------------+----------+----------+------------------+------------------+---------+------------------+\n", |
| 376 | + "|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|\n", |
| 377 | + "|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|\n", |
| 378 | + "|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|\n", |
| 379 | + "|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|\n", |
| 380 | + "|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|\n", |
| 381 | + "+-------------------+----------+----------+------------------+------------------+---------+------------------+\n", |
| 382 | + "only showing top 5 rows\n", |
| 383 | + "\n" |
| 384 | + ] |
| 385 | + } |
| 386 | + ], |
| 387 | + "source": [ |
| 388 | + "df.filter(df['Close']<500).show(5)" |
| 389 | + ] |
| 390 | + }, |
| 391 | + { |
| 392 | + "cell_type": "markdown", |
| 393 | + "metadata": {}, |
| 394 | + "source": [ |
| 395 | + "#### If we need to chain multiple conditions together, use `&` for AND and `|` for OR and clearly separate the conditions by putting them inside parantheses" |
| 396 | + ] |
| 397 | + }, |
| 398 | + { |
| 399 | + "cell_type": "code", |
| 400 | + "execution_count": 18, |
| 401 | + "metadata": {}, |
| 402 | + "outputs": [ |
| 403 | + { |
| 404 | + "name": "stdout", |
| 405 | + "output_type": "stream", |
| 406 | + "text": [ |
| 407 | + "+-------------------+----------+------------------+------------------+------------------+---------+---------+\n", |
| 408 | + "| Date| Open| High| Low| Close| Volume|Adj Close|\n", |
| 409 | + "+-------------------+----------+------------------+------------------+------------------+---------+---------+\n", |
| 410 | + "|2012-02-15 00:00:00|514.259995| 526.290016|496.88998399999997| 497.669975|376530000|64.477899|\n", |
| 411 | + "|2013-09-05 00:00:00|500.250008|500.67997699999995|493.63997699999993|495.26997400000005| 59091900|65.977837|\n", |
| 412 | + "|2013-09-10 00:00:00|506.199997| 507.450012| 489.500015|494.63999900000005|185798900|65.893915|\n", |
| 413 | + "|2014-01-30 00:00:00|502.539993|506.49997699999994| 496.70002| 499.779984|169625400|66.967353|\n", |
| 414 | + "+-------------------+----------+------------------+------------------+------------------+---------+---------+\n", |
| 415 | + "\n" |
| 416 | + ] |
| 417 | + } |
| 418 | + ], |
| 419 | + "source": [ |
| 420 | + "df.filter((df['Close']<500) & (df['Open']>500)).show(5)" |
| 421 | + ] |
| 422 | + }, |
| 423 | + { |
| 424 | + "cell_type": "markdown", |
| 425 | + "metadata": {}, |
| 426 | + "source": [ |
| 427 | + "#### We can use `==` to compare with an exact value for comparison and `~` for NOT operator" |
| 428 | + ] |
| 429 | + }, |
| 430 | + { |
| 431 | + "cell_type": "code", |
| 432 | + "execution_count": 20, |
| 433 | + "metadata": {}, |
| 434 | + "outputs": [ |
| 435 | + { |
| 436 | + "name": "stdout", |
| 437 | + "output_type": "stream", |
| 438 | + "text": [ |
| 439 | + "+-------------------+------------------+----------+------+------+---------+---------+\n", |
| 440 | + "| Date| Open| High| Low| Close| Volume|Adj Close|\n", |
| 441 | + "+-------------------+------------------+----------+------+------+---------+---------+\n", |
| 442 | + "|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|\n", |
| 443 | + "+-------------------+------------------+----------+------+------+---------+---------+\n", |
| 444 | + "\n" |
| 445 | + ] |
| 446 | + } |
| 447 | + ], |
| 448 | + "source": [ |
| 449 | + "df.filter(df['Low']==197.16).show()" |
| 450 | + ] |
| 451 | + }, |
| 452 | + { |
| 453 | + "cell_type": "markdown", |
| 454 | + "metadata": {}, |
| 455 | + "source": [ |
| 456 | + "### Use the `collect` method instead of `show`, to collect the actual data" |
| 457 | + ] |
| 458 | + }, |
| 459 | + { |
| 460 | + "cell_type": "code", |
| 461 | + "execution_count": 21, |
| 462 | + "metadata": {}, |
| 463 | + "outputs": [], |
| 464 | + "source": [ |
| 465 | + "low_data = df.filter(df['Low']==197.16).collect()" |
| 466 | + ] |
| 467 | + }, |
| 468 | + { |
| 469 | + "cell_type": "code", |
| 470 | + "execution_count": 22, |
| 471 | + "metadata": {}, |
| 472 | + "outputs": [ |
| 473 | + { |
| 474 | + "data": { |
| 475 | + "text/plain": [ |
| 476 | + "[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]" |
| 477 | + ] |
| 478 | + }, |
| 479 | + "execution_count": 22, |
| 480 | + "metadata": {}, |
| 481 | + "output_type": "execute_result" |
| 482 | + } |
| 483 | + ], |
| 484 | + "source": [ |
| 485 | + "low_data" |
| 486 | + ] |
| 487 | + }, |
| 488 | + { |
| 489 | + "cell_type": "markdown", |
| 490 | + "metadata": {}, |
| 491 | + "source": [ |
| 492 | + "#### It is still a list. So, grab the 0-index element as a Row object and convert it to a dictionary using `asDict` method" |
| 493 | + ] |
| 494 | + }, |
| 495 | + { |
| 496 | + "cell_type": "code", |
| 497 | + "execution_count": 24, |
| 498 | + "metadata": {}, |
| 499 | + "outputs": [], |
| 500 | + "source": [ |
| 501 | + "dt = low_data[0]" |
| 502 | + ] |
| 503 | + }, |
| 504 | + { |
| 505 | + "cell_type": "code", |
| 506 | + "execution_count": 26, |
| 507 | + "metadata": {}, |
| 508 | + "outputs": [ |
| 509 | + { |
| 510 | + "data": { |
| 511 | + "text/plain": [ |
| 512 | + "{'Date': datetime.datetime(2010, 1, 22, 0, 0),\n", |
| 513 | + " 'Open': 206.78000600000001,\n", |
| 514 | + " 'High': 207.499996,\n", |
| 515 | + " 'Low': 197.16,\n", |
| 516 | + " 'Close': 197.75,\n", |
| 517 | + " 'Volume': 220441900,\n", |
| 518 | + " 'Adj Close': 25.620401}" |
| 519 | + ] |
| 520 | + }, |
| 521 | + "execution_count": 26, |
| 522 | + "metadata": {}, |
| 523 | + "output_type": "execute_result" |
| 524 | + } |
| 525 | + ], |
| 526 | + "source": [ |
| 527 | + "dt.asDict()" |
| 528 | + ] |
| 529 | + }, |
| 530 | + { |
| 531 | + "cell_type": "markdown", |
| 532 | + "metadata": {}, |
| 533 | + "source": [ |
| 534 | + "Now, you can do whatever processing you want to do with the dictionary object!" |
| 535 | + ] |
347 | 536 | }
|
348 | 537 | ],
|
349 | 538 | "metadata": {
|
|
362 | 551 | "name": "python",
|
363 | 552 | "nbconvert_exporter": "python",
|
364 | 553 | "pygments_lexer": "ipython3",
|
365 |
| - "version": "3.6.6" |
| 554 | + "version": "3.6.8" |
366 | 555 | }
|
367 | 556 | },
|
368 | 557 | "nbformat": 4,
|
|
0 commit comments