|
91 | 91 | },
|
92 | 92 | {
|
93 | 93 | "cell_type": "code",
|
94 |
| - "execution_count": 11, |
| 94 | + "execution_count": 4, |
95 | 95 | "metadata": {},
|
96 | 96 | "outputs": [
|
97 | 97 | {
|
|
160 | 160 | },
|
161 | 161 | {
|
162 | 162 | "cell_type": "code",
|
163 |
| - "execution_count": 8, |
| 163 | + "execution_count": 7, |
164 | 164 | "metadata": {},
|
165 | 165 | "outputs": [
|
166 | 166 | {
|
|
169 | 169 | "['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']"
|
170 | 170 | ]
|
171 | 171 | },
|
172 |
| - "execution_count": 8, |
| 172 | + "execution_count": 7, |
173 | 173 | "metadata": {},
|
174 | 174 | "output_type": "execute_result"
|
175 | 175 | }
|
|
180 | 180 | },
|
181 | 181 | {
|
182 | 182 | "cell_type": "code",
|
183 |
| - "execution_count": 9, |
| 183 | + "execution_count": 8, |
184 | 184 | "metadata": {},
|
185 | 185 | "outputs": [
|
186 | 186 | {
|
|
214 | 214 | },
|
215 | 215 | {
|
216 | 216 | "cell_type": "code",
|
217 |
| - "execution_count": 22, |
| 217 | + "execution_count": 9, |
218 | 218 | "metadata": {},
|
219 | 219 | "outputs": [
|
220 | 220 | {
|
|
243 | 243 | },
|
244 | 244 | {
|
245 | 245 | "cell_type": "code",
|
246 |
| - "execution_count": 36, |
| 246 | + "execution_count": 10, |
247 | 247 | "metadata": {},
|
248 | 248 | "outputs": [
|
249 | 249 | {
|
|
267 | 267 | "cell_type": "markdown",
|
268 | 268 | "metadata": {},
|
269 | 269 | "source": [
|
270 |
| - "### Read a file (and create dataframe) by directly running a `spark.sql` method on the file" |
| 270 | + "### Read a file (and create dataframe) by directly running a `spark.sql` method on the file\n", |
| 271 | + "Notice the syntax of `csv.<path->filename.csv>` inside the SQL query" |
271 | 272 | ]
|
272 | 273 | },
|
273 | 274 | {
|
274 | 275 | "cell_type": "code",
|
275 |
| - "execution_count": 34, |
| 276 | + "execution_count": 11, |
276 | 277 | "metadata": {},
|
277 | 278 | "outputs": [],
|
278 | 279 | "source": [
|
|
281 | 282 | },
|
282 | 283 | {
|
283 | 284 | "cell_type": "code",
|
284 |
| - "execution_count": 35, |
| 285 | + "execution_count": 12, |
285 | 286 | "metadata": {},
|
286 | 287 | "outputs": [
|
287 | 288 | {
|
|
312 | 313 | "source": [
|
313 | 314 | "df_sales.show()"
|
314 | 315 | ]
|
| 316 | + }, |
| 317 | + { |
| 318 | + "cell_type": "markdown", |
| 319 | + "metadata": {}, |
| 320 | + "source": [ |
| 321 | + "### Read tables from a local SQLite database file using `JDBC` connection" |
| 322 | + ] |
| 323 | + }, |
| 324 | + { |
| 325 | + "cell_type": "markdown", |
| 326 | + "metadata": {}, |
| 327 | + "source": [ |
| 328 | + "#### First `cd` to the directory where all the PySpark jar files are kept. Then download the SQLite jar file from the [given URL](https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc)" |
| 329 | + ] |
| 330 | + }, |
| 331 | + { |
| 332 | + "cell_type": "code", |
| 333 | + "execution_count": 13, |
| 334 | + "metadata": {}, |
| 335 | + "outputs": [ |
| 336 | + { |
| 337 | + "name": "stdout", |
| 338 | + "output_type": "stream", |
| 339 | + "text": [ |
| 340 | + "Warning: Binary output can mess up your terminal. Use \"--output -\" to tell \n", |
| 341 | + "Warning: curl to output it to your terminal anyway, or consider \"--output \n", |
| 342 | + "Warning: <FILE>\" to save to a file.\n" |
| 343 | + ] |
| 344 | + } |
| 345 | + ], |
| 346 | + "source": [ |
| 347 | + "!cd /home/tirtha/Spark/spark-2.3.1-bin-hadoop2.7/jars/\n", |
| 348 | + "!curl https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.28.0/sqlite-jdbc-3.28.0.jar" |
| 349 | + ] |
| 350 | + }, |
| 351 | + { |
| 352 | + "cell_type": "markdown", |
| 353 | + "metadata": {}, |
| 354 | + "source": [ |
| 355 | + "#### Define driver, path to local .db file, and append that path to `jdbc:sqlite` to construct the `url`" |
| 356 | + ] |
| 357 | + }, |
| 358 | + { |
| 359 | + "cell_type": "code", |
| 360 | + "execution_count": 14, |
| 361 | + "metadata": {}, |
| 362 | + "outputs": [], |
| 363 | + "source": [ |
| 364 | + "driver = \"org.sqlite.JDBC\"\n", |
| 365 | + "path = \"Data/chinook.db\"\n", |
| 366 | + "url = \"jdbc:sqlite:\" + path" |
| 367 | + ] |
| 368 | + }, |
| 369 | + { |
| 370 | + "cell_type": "markdown", |
| 371 | + "metadata": {}, |
| 372 | + "source": [ |
| 373 | + "#### Define `tablename` to be read" |
| 374 | + ] |
| 375 | + }, |
| 376 | + { |
| 377 | + "cell_type": "code", |
| 378 | + "execution_count": 15, |
| 379 | + "metadata": {}, |
| 380 | + "outputs": [], |
| 381 | + "source": [ |
| 382 | + "tablename = \"albums\"" |
| 383 | + ] |
| 384 | + }, |
| 385 | + { |
| 386 | + "cell_type": "code", |
| 387 | + "execution_count": 16, |
| 388 | + "metadata": {}, |
| 389 | + "outputs": [], |
| 390 | + "source": [ |
| 391 | + "df_albums = spark1.read.format(\"jdbc\").option(\"url\", url).option(\"dbtable\", tablename).option(\"driver\", driver).load()" |
| 392 | + ] |
| 393 | + }, |
| 394 | + { |
| 395 | + "cell_type": "code", |
| 396 | + "execution_count": 17, |
| 397 | + "metadata": {}, |
| 398 | + "outputs": [ |
| 399 | + { |
| 400 | + "name": "stdout", |
| 401 | + "output_type": "stream", |
| 402 | + "text": [ |
| 403 | + "+-------+--------------------+--------+\n", |
| 404 | + "|AlbumId| Title|ArtistId|\n", |
| 405 | + "+-------+--------------------+--------+\n", |
| 406 | + "| 1|For Those About T...| 1|\n", |
| 407 | + "| 2| Balls to the Wall| 2|\n", |
| 408 | + "| 3| Restless and Wild| 2|\n", |
| 409 | + "| 4| Let There Be Rock| 1|\n", |
| 410 | + "| 5| Big Ones| 3|\n", |
| 411 | + "| 6| Jagged Little Pill| 4|\n", |
| 412 | + "| 7| Facelift| 5|\n", |
| 413 | + "| 8| Warner 25 Anos| 6|\n", |
| 414 | + "| 9|Plays Metallica B...| 7|\n", |
| 415 | + "| 10| Audioslave| 8|\n", |
| 416 | + "| 11| Out Of Exile| 8|\n", |
| 417 | + "| 12| BackBeat Soundtrack| 9|\n", |
| 418 | + "| 13|The Best Of Billy...| 10|\n", |
| 419 | + "| 14|Alcohol Fueled Br...| 11|\n", |
| 420 | + "| 15|Alcohol Fueled Br...| 11|\n", |
| 421 | + "| 16| Black Sabbath| 12|\n", |
| 422 | + "| 17|Black Sabbath Vol...| 12|\n", |
| 423 | + "| 18| Body Count| 13|\n", |
| 424 | + "| 19| Chemical Wedding| 14|\n", |
| 425 | + "| 20|The Best Of Buddy...| 15|\n", |
| 426 | + "+-------+--------------------+--------+\n", |
| 427 | + "only showing top 20 rows\n", |
| 428 | + "\n" |
| 429 | + ] |
| 430 | + } |
| 431 | + ], |
| 432 | + "source": [ |
| 433 | + "df_albums.show()" |
| 434 | + ] |
| 435 | + }, |
| 436 | + { |
| 437 | + "cell_type": "code", |
| 438 | + "execution_count": 18, |
| 439 | + "metadata": {}, |
| 440 | + "outputs": [ |
| 441 | + { |
| 442 | + "name": "stdout", |
| 443 | + "output_type": "stream", |
| 444 | + "text": [ |
| 445 | + "root\n", |
| 446 | + " |-- AlbumId: integer (nullable = true)\n", |
| 447 | + " |-- Title: string (nullable = true)\n", |
| 448 | + " |-- ArtistId: integer (nullable = true)\n", |
| 449 | + "\n" |
| 450 | + ] |
| 451 | + } |
| 452 | + ], |
| 453 | + "source": [ |
| 454 | + "df_albums.printSchema()" |
| 455 | + ] |
| 456 | + }, |
| 457 | + { |
| 458 | + "cell_type": "markdown", |
| 459 | + "metadata": {}, |
| 460 | + "source": [ |
| 461 | + "#### Don't forget to create temp view to use later" |
| 462 | + ] |
| 463 | + }, |
| 464 | + { |
| 465 | + "cell_type": "code", |
| 466 | + "execution_count": 19, |
| 467 | + "metadata": {}, |
| 468 | + "outputs": [], |
| 469 | + "source": [ |
| 470 | + "df_albums.createTempView('albums')" |
| 471 | + ] |
| 472 | + }, |
| 473 | + { |
| 474 | + "cell_type": "markdown", |
| 475 | + "metadata": {}, |
| 476 | + "source": [ |
| 477 | + "#### Read another table - `artists`" |
| 478 | + ] |
| 479 | + }, |
| 480 | + { |
| 481 | + "cell_type": "code", |
| 482 | + "execution_count": 20, |
| 483 | + "metadata": {}, |
| 484 | + "outputs": [], |
| 485 | + "source": [ |
| 486 | + "tablename = \"artists\"" |
| 487 | + ] |
| 488 | + }, |
| 489 | + { |
| 490 | + "cell_type": "code", |
| 491 | + "execution_count": 21, |
| 492 | + "metadata": {}, |
| 493 | + "outputs": [], |
| 494 | + "source": [ |
| 495 | + "df_artists = spark1.read.format(\"jdbc\").option(\"url\", url).option(\"dbtable\", tablename).option(\"driver\", driver).load()" |
| 496 | + ] |
| 497 | + }, |
| 498 | + { |
| 499 | + "cell_type": "code", |
| 500 | + "execution_count": 22, |
| 501 | + "metadata": {}, |
| 502 | + "outputs": [ |
| 503 | + { |
| 504 | + "name": "stdout", |
| 505 | + "output_type": "stream", |
| 506 | + "text": [ |
| 507 | + "+--------+--------------------+\n", |
| 508 | + "|ArtistId| Name|\n", |
| 509 | + "+--------+--------------------+\n", |
| 510 | + "| 1| AC/DC|\n", |
| 511 | + "| 2| Accept|\n", |
| 512 | + "| 3| Aerosmith|\n", |
| 513 | + "| 4| Alanis Morissette|\n", |
| 514 | + "| 5| Alice In Chains|\n", |
| 515 | + "| 6|Antônio Carlos Jobim|\n", |
| 516 | + "| 7| Apocalyptica|\n", |
| 517 | + "| 8| Audioslave|\n", |
| 518 | + "| 9| BackBeat|\n", |
| 519 | + "| 10| Billy Cobham|\n", |
| 520 | + "| 11| Black Label Society|\n", |
| 521 | + "| 12| Black Sabbath|\n", |
| 522 | + "| 13| Body Count|\n", |
| 523 | + "| 14| Bruce Dickinson|\n", |
| 524 | + "| 15| Buddy Guy|\n", |
| 525 | + "| 16| Caetano Veloso|\n", |
| 526 | + "| 17| Chico Buarque|\n", |
| 527 | + "| 18|Chico Science & N...|\n", |
| 528 | + "| 19| Cidade Negra|\n", |
| 529 | + "| 20| Cláudio Zoli|\n", |
| 530 | + "+--------+--------------------+\n", |
| 531 | + "only showing top 20 rows\n", |
| 532 | + "\n" |
| 533 | + ] |
| 534 | + } |
| 535 | + ], |
| 536 | + "source": [ |
| 537 | + "df_artists.show()" |
| 538 | + ] |
| 539 | + }, |
| 540 | + { |
| 541 | + "cell_type": "code", |
| 542 | + "execution_count": 23, |
| 543 | + "metadata": {}, |
| 544 | + "outputs": [], |
| 545 | + "source": [ |
| 546 | + "df_artists.createTempView('artists')" |
| 547 | + ] |
| 548 | + }, |
| 549 | + { |
| 550 | + "cell_type": "markdown", |
| 551 | + "metadata": {}, |
| 552 | + "source": [ |
| 553 | + "#### Test if SQL query is working fine" |
| 554 | + ] |
| 555 | + }, |
| 556 | + { |
| 557 | + "cell_type": "code", |
| 558 | + "execution_count": 24, |
| 559 | + "metadata": {}, |
| 560 | + "outputs": [ |
| 561 | + { |
| 562 | + "name": "stdout", |
| 563 | + "output_type": "stream", |
| 564 | + "text": [ |
| 565 | + "+--------+---------+\n", |
| 566 | + "|ArtistId| Name|\n", |
| 567 | + "+--------+---------+\n", |
| 568 | + "| 1| AC/DC|\n", |
| 569 | + "| 2| Accept|\n", |
| 570 | + "| 3|Aerosmith|\n", |
| 571 | + "| 9| BackBeat|\n", |
| 572 | + "| 15|Buddy Guy|\n", |
| 573 | + "| 26| Azymuth|\n", |
| 574 | + "| 36| O Rappa|\n", |
| 575 | + "| 37| Ed Motta|\n", |
| 576 | + "| 46|Jorge Ben|\n", |
| 577 | + "| 50|Metallica|\n", |
| 578 | + "+--------+---------+\n", |
| 579 | + "\n" |
| 580 | + ] |
| 581 | + } |
| 582 | + ], |
| 583 | + "source": [ |
| 584 | + "spark1.sql(\"SELECT * FROM artists WHERE length(Name)<10 LIMIT 10\").show()" |
| 585 | + ] |
| 586 | + }, |
| 587 | + { |
| 588 | + "cell_type": "markdown", |
| 589 | + "metadata": {}, |
| 590 | + "source": [ |
| 591 | + "### Join the `albums` and `artists` tables in a single dataframe using SQL query (and order by `ArtistId`)" |
| 592 | + ] |
| 593 | + }, |
| 594 | + { |
| 595 | + "cell_type": "code", |
| 596 | + "execution_count": 25, |
| 597 | + "metadata": {}, |
| 598 | + "outputs": [], |
| 599 | + "source": [ |
| 600 | + "df_combined = spark1.sql(\"SELECT * FROM artists LEFT JOIN albums ON artists.ArtistId=albums.ArtistId order by artists.ArtistId\")" |
| 601 | + ] |
| 602 | + }, |
| 603 | + { |
| 604 | + "cell_type": "code", |
| 605 | + "execution_count": null, |
| 606 | + "metadata": {}, |
| 607 | + "outputs": [], |
| 608 | + "source": [ |
| 609 | + "df_combined.show()" |
| 610 | + ] |
315 | 611 | }
|
316 | 612 | ],
|
317 | 613 | "metadata": {
|
|
0 commit comments