|
54 | 54 | ""
|
55 | 55 | ]
|
56 | 56 | },
|
| 57 | + { |
| 58 | + "cell_type": "markdown", |
| 59 | + "metadata": {}, |
| 60 | + "source": [ |
| 61 | + "### Useful references for this Notebook\n", |
| 62 | + "* [PySpark in Jupyter Notebook — Working with Dataframe & JDBC Data Sources](https://medium.com/@thucnc/pyspark-in-jupyter-notebook-working-with-dataframe-jdbc-data-sources-6f3d39300bf6)\n", |
| 63 | + "* [PySpark - Working with JDBC Sqlite database](http://mitzen.blogspot.com/2017/06/pyspark-working-with-jdbc-sqlite.html)" |
| 64 | + ] |
| 65 | + }, |
57 | 66 | {
|
58 | 67 | "cell_type": "markdown",
|
59 | 68 | "metadata": {},
|
|
642 | 651 | "source": [
|
643 | 652 | "df_combined.show()"
|
644 | 653 | ]
|
| 654 | + }, |
| 655 | + { |
| 656 | + "cell_type": "markdown", |
| 657 | + "metadata": {}, |
| 658 | + "source": [ |
| 659 | + "### What's the difference between temporary and global SQL views? " |
| 660 | + ] |
| 661 | + }, |
| 662 | + { |
| 663 | + "cell_type": "markdown", |
| 664 | + "metadata": {}, |
| 665 | + "source": [ |
| 666 | + "#### A temporary view is session-scoped: it is not shared across multiple sessions" |
| 667 | + ] |
| 668 | + }, |
| 669 | + { |
| 670 | + "cell_type": "code", |
| 671 | + "execution_count": 27, |
| 672 | + "metadata": {}, |
| 673 | + "outputs": [ |
| 674 | + { |
| 675 | + "name": "stdout", |
| 676 | + "output_type": "stream", |
| 677 | + "text": [ |
| 678 | + "+--------+--------------------+\n", |
| 679 | + "|ArtistId| Name|\n", |
| 680 | + "+--------+--------------------+\n", |
| 681 | + "| 1| AC/DC|\n", |
| 682 | + "| 2| Accept|\n", |
| 683 | + "| 3| Aerosmith|\n", |
| 684 | + "| 4| Alanis Morissette|\n", |
| 685 | + "| 5| Alice In Chains|\n", |
| 686 | + "| 6|Antônio Carlos Jobim|\n", |
| 687 | + "| 7| Apocalyptica|\n", |
| 688 | + "| 8| Audioslave|\n", |
| 689 | + "| 9| BackBeat|\n", |
| 690 | + "| 10| Billy Cobham|\n", |
| 691 | + "+--------+--------------------+\n", |
| 692 | + "\n" |
| 693 | + ] |
| 694 | + } |
| 695 | + ], |
| 696 | + "source": [ |
| 697 | + "df_artists.createOrReplaceTempView(\"temp_artists\")\n", |
| 698 | + "\n", |
| 699 | + "df_temp = spark1.sql(\"SELECT * FROM temp_artists LIMIT 10\")\n", |
| 700 | + "df_temp.show()" |
| 701 | + ] |
| 702 | + }, |
| 703 | + { |
| 704 | + "cell_type": "markdown", |
| 705 | + "metadata": {}, |
| 706 | + "source": [ |
| 707 | + "#### A new session is created, but the temp view `temp_artists` cannot be accessed from it" |
| 708 | + ] |
| 709 | + }, |
| 710 | + { |
| 711 | + "cell_type": "code", |
| 712 | + "execution_count": 28, |
| 713 | + "metadata": {}, |
| 714 | + "outputs": [], |
| 715 | + "source": [ |
| 716 | + "spark2 = SparkSession.builder.appName('SQL2').getOrCreate()" |
| 717 | + ] |
| 718 | + }, |
| 719 | + { |
| 720 | + "cell_type": "markdown", |
| 721 | + "metadata": {}, |
| 722 | + "source": [ |
| 723 | + "#### We use `try...except` to catch the error and display a generic message" |
| 724 | + ] |
| 725 | + }, |
| 726 | + { |
| 727 | + "cell_type": "code", |
| 728 | + "execution_count": 29, |
| 729 | + "metadata": {}, |
| 730 | + "outputs": [], |
| 731 | + "source": [ |
| 732 | + "try:\n", |
| 733 | + " df_temp = spark2.sql(\"SELECT * FROM temp_artists LIMIT 10\")\n", |
| 734 | + "except:\n", |
| 735 | + " print(\"Error happened in this execution\")" |
| 736 | + ] |
| 737 | + }, |
| 738 | + { |
| 739 | + "cell_type": "markdown", |
| 740 | + "metadata": {}, |
| 741 | + "source": [ |
| 742 | + "#### Now, a global view is created in this session\n", |
| 743 | + "A global temporary view is tied to the system-preserved database `global_temp`, so the view must be referenced by its qualified name, e.g. `global_temp.global_artists`." |
| 744 | + ] |
| 745 | + }, |
| 746 | + { |
| 747 | + "cell_type": "code", |
| 748 | + "execution_count": 30, |
| 749 | + "metadata": {}, |
| 750 | + "outputs": [], |
| 751 | + "source": [ |
| 752 | + "tablename = \"artists\"\n", |
| 753 | + "df_artists = spark2.read.format(\"jdbc\").option(\"url\", url).option(\"dbtable\", tablename).option(\"driver\", driver).load()" |
| 754 | + ] |
| 755 | + }, |
| 756 | + { |
| 757 | + "cell_type": "code", |
| 758 | + "execution_count": 31, |
| 759 | + "metadata": {}, |
| 760 | + "outputs": [ |
| 761 | + { |
| 762 | + "name": "stdout", |
| 763 | + "output_type": "stream", |
| 764 | + "text": [ |
| 765 | + "+--------+--------------------+\n", |
| 766 | + "|ArtistId| Name|\n", |
| 767 | + "+--------+--------------------+\n", |
| 768 | + "| 1| AC/DC|\n", |
| 769 | + "| 2| Accept|\n", |
| 770 | + "| 3| Aerosmith|\n", |
| 771 | + "| 4| Alanis Morissette|\n", |
| 772 | + "| 5| Alice In Chains|\n", |
| 773 | + "| 6|Antônio Carlos Jobim|\n", |
| 774 | + "| 7| Apocalyptica|\n", |
| 775 | + "| 8| Audioslave|\n", |
| 776 | + "| 9| BackBeat|\n", |
| 777 | + "| 10| Billy Cobham|\n", |
| 778 | + "+--------+--------------------+\n", |
| 779 | + "\n" |
| 780 | + ] |
| 781 | + } |
| 782 | + ], |
| 783 | + "source": [ |
| 784 | + "df_artists.createOrReplaceGlobalTempView(\"global_artists\")\n", |
| 785 | + "\n", |
| 786 | + "df_global = spark2.sql(\"SELECT * FROM global_temp.global_artists LIMIT 10\")\n", |
| 787 | + "df_global.show()" |
| 788 | + ] |
| 789 | + }, |
| 790 | + { |
| 791 | + "cell_type": "markdown", |
| 792 | + "metadata": {}, |
| 793 | + "source": [ |
| 794 | + "#### Start a new session. The view `global_temp.global_artists` can be accessed across sessions" |
| 795 | + ] |
| 796 | + }, |
| 797 | + { |
| 798 | + "cell_type": "code", |
| 799 | + "execution_count": 32, |
| 800 | + "metadata": {}, |
| 801 | + "outputs": [], |
| 802 | + "source": [ |
| 803 | + "spark3 = SparkSession.builder.appName('SQL3').getOrCreate()" |
| 804 | + ] |
| 805 | + }, |
| 806 | + { |
| 807 | + "cell_type": "code", |
| 808 | + "execution_count": 33, |
| 809 | + "metadata": {}, |
| 810 | + "outputs": [ |
| 811 | + { |
| 812 | + "name": "stdout", |
| 813 | + "output_type": "stream", |
| 814 | + "text": [ |
| 815 | + "+--------+--------------------+\n", |
| 816 | + "|ArtistId| Name|\n", |
| 817 | + "+--------+--------------------+\n", |
| 818 | + "| 1| AC/DC|\n", |
| 819 | + "| 2| Accept|\n", |
| 820 | + "| 3| Aerosmith|\n", |
| 821 | + "| 4| Alanis Morissette|\n", |
| 822 | + "| 5| Alice In Chains|\n", |
| 823 | + "| 6|Antônio Carlos Jobim|\n", |
| 824 | + "| 7| Apocalyptica|\n", |
| 825 | + "| 8| Audioslave|\n", |
| 826 | + "| 9| BackBeat|\n", |
| 827 | + "| 10| Billy Cobham|\n", |
| 828 | + "+--------+--------------------+\n", |
| 829 | + "\n" |
| 830 | + ] |
| 831 | + } |
| 832 | + ], |
| 833 | + "source": [ |
| 834 | + "df_global = spark3.sql(\"SELECT * FROM global_temp.global_artists LIMIT 10\")\n", |
| 835 | + "df_global.show()" |
| 836 | + ] |
645 | 837 | }
|
646 | 838 | ],
|
647 | 839 | "metadata": {
|
|
0 commit comments