Skip to content

Commit 4bd1576

Browse files
Mark BauerMark Bauer
authored andcommitted
updating figures in part 3, mostly styling and better python syntax
1 parent 590f28d commit 4bd1576

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+1181
-1967
lines changed

3-plotting-data-visualizations/3_data_wrangling.ipynb

Lines changed: 76 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
"metadata": {},
66
"source": [
77
"# Data Analysis Using Python: A Beginner’s Guide Featuring NYC Open Data \n",
8-
"## Part 3: Plotting and Data Visualization in Python\n",
8+
"### Data Wrangling for Part 3: Plotting and Data Visualization in Python\n",
9+
"Returning only buildings built between 1990 and 2020\n",
910
"\n",
1011
"Mark Bauer"
1112
]
@@ -14,14 +15,14 @@
1415
"cell_type": "markdown",
1516
"metadata": {},
1617
"source": [
17-
"# Datasets"
18+
"## Datasets"
1819
]
1920
},
2021
{
2122
"cell_type": "markdown",
2223
"metadata": {},
2324
"source": [
24-
"## Building Footprints \n",
25+
"### Building Footprints \n",
2526
"https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh\n",
2627
"\n",
2728
"![building-footprints](images/building-footprints.png)\n"
@@ -31,7 +32,7 @@
3132
"cell_type": "markdown",
3233
"metadata": {},
3334
"source": [
34-
"## PLUTO \n",
35+
"### PLUTO \n",
3536
"https://data.cityofnewyork.us/City-Government/Primary-Land-Use-Tax-Lot-Output-Map-MapPLUTO-/f888-ni5f\n",
3637
"\n",
3738
"![pluto](images/pluto.png)"
@@ -46,19 +47,13 @@
4647
},
4748
{
4849
"cell_type": "code",
49-
"execution_count": 1,
50+
"execution_count": 14,
5051
"metadata": {},
5152
"outputs": [],
5253
"source": [
5354
"# importing libraries\n",
54-
"import pandas as pd # the pd is by convention\n",
55-
"import numpy as np # as is the np\n",
56-
"import matplotlib\n",
57-
"import matplotlib.pyplot as plt\n",
58-
"import matplotlib.ticker as ticker\n",
59-
"from matplotlib.ticker import FuncFormatter\n",
60-
"import seaborn as sns\n",
61-
"from scipy import stats\n",
55+
"import pandas as pd \n",
56+
"import numpy as np \n",
6257
"import requests\n",
6358
"import os\n",
6459
"from io import BytesIO\n",
@@ -71,7 +66,7 @@
7166
},
7267
{
7368
"cell_type": "code",
74-
"execution_count": 2,
69+
"execution_count": 15,
7570
"metadata": {},
7671
"outputs": [
7772
{
@@ -110,14 +105,14 @@
110105
},
111106
{
112107
"cell_type": "code",
113-
"execution_count": 15,
108+
"execution_count": 9,
114109
"metadata": {},
115110
"outputs": [
116111
{
117112
"name": "stdout",
118113
"output_type": "stream",
119114
"text": [
120-
"rows: 1,084,399, columns: 15\n"
115+
"rows: 1,084,499, columns: 15\n"
121116
]
122117
},
123118
{
@@ -283,7 +278,7 @@
283278
"4 Photogramm "
284279
]
285280
},
286-
"execution_count": 15,
281+
"execution_count": 9,
287282
"metadata": {},
288283
"output_type": "execute_result"
289284
}
@@ -310,27 +305,27 @@
310305
"output_type": "stream",
311306
"text": [
312307
"<class 'pandas.core.frame.DataFrame'>\n",
313-
"RangeIndex: 1084399 entries, 0 to 1084398\n",
308+
"Int64Index: 93527 entries, 5 to 1084160\n",
314309
"Data columns (total 15 columns):\n",
315-
" # Column Non-Null Count Dtype \n",
316-
"--- ------ -------------- ----- \n",
317-
" 0 the_geom 1084399 non-null object \n",
318-
" 1 NAME 1936 non-null object \n",
319-
" 2 BIN 1084399 non-null int64 \n",
320-
" 3 CNSTRCT_YR 1073525 non-null float64\n",
321-
" 4 LSTMODDATE 1084399 non-null object \n",
322-
" 5 LSTSTATYPE 1084173 non-null object \n",
323-
" 6 DOITT_ID 1084399 non-null int64 \n",
324-
" 7 HEIGHTROOF 1081721 non-null float64\n",
325-
" 8 FEAT_CODE 1084391 non-null float64\n",
326-
" 9 GROUNDELEV 1083857 non-null float64\n",
327-
" 10 SHAPE_AREA 1084399 non-null int64 \n",
328-
" 11 SHAPE_LEN 1084399 non-null int64 \n",
329-
" 12 BASE_BBL 1084395 non-null float64\n",
330-
" 13 MPLUTO_BBL 1082057 non-null float64\n",
331-
" 14 GEOMSOURCE 1084172 non-null object \n",
310+
" # Column Non-Null Count Dtype \n",
311+
"--- ------ -------------- ----- \n",
312+
" 0 the_geom 93527 non-null object \n",
313+
" 1 NAME 250 non-null object \n",
314+
" 2 BIN 93527 non-null int64 \n",
315+
" 3 CNSTRCT_YR 93527 non-null float64\n",
316+
" 4 LSTMODDATE 93527 non-null object \n",
317+
" 5 LSTSTATYPE 93449 non-null object \n",
318+
" 6 DOITT_ID 93527 non-null int64 \n",
319+
" 7 HEIGHTROOF 91832 non-null float64\n",
320+
" 8 FEAT_CODE 93525 non-null float64\n",
321+
" 9 GROUNDELEV 93251 non-null float64\n",
322+
" 10 SHAPE_AREA 93527 non-null int64 \n",
323+
" 11 SHAPE_LEN 93527 non-null int64 \n",
324+
" 12 BASE_BBL 93527 non-null float64\n",
325+
" 13 MPLUTO_BBL 93527 non-null float64\n",
326+
" 14 GEOMSOURCE 93467 non-null object \n",
332327
"dtypes: float64(6), int64(4), object(5)\n",
333-
"memory usage: 124.1+ MB\n"
328+
"memory usage: 11.4+ MB\n"
334329
]
335330
}
336331
],
@@ -348,12 +343,12 @@
348343
"name": "stdout",
349344
"output_type": "stream",
350345
"text": [
351-
"rows: 94,831, columns: 15\n"
346+
"rows: 93,527, columns: 15\n"
352347
]
353348
}
354349
],
355350
"source": [
356-
"# returning only building built between 1945 and 2020\n",
351+
"# returning only buildings built between 1990 and 2020\n",
357352
"building_footprints = building_footprints.loc[building_footprints['CNSTRCT_YR'].between(1990, 2020)]\n",
358353
"\n",
359354
"# new shape of data\n",
@@ -370,10 +365,11 @@
370365
"name": "stdout",
371366
"output_type": "stream",
372367
"text": [
373-
"count null: 1,302\n",
368+
"count null: 0\n",
374369
"dropping nulls...\n",
370+
"\n",
375371
"count null: 0\n",
376-
"rows: 93,529, columns: 15\n"
372+
"rows: 93,527, columns: 15\n"
377373
]
378374
}
379375
],
@@ -382,7 +378,7 @@
382378
"count_null = building_footprints['MPLUTO_BBL'].isnull().sum()\n",
383379
"print('count null: {:,}'.format(count_null))\n",
384380
"\n",
385-
"print('dropping nulls...')\n",
381+
"print('dropping nulls...\\n')\n",
386382
"building_footprints = building_footprints.dropna(subset=['MPLUTO_BBL'])\n",
387383
"count_null = building_footprints['MPLUTO_BBL'].isnull().sum()\n",
388384
"print('count null: {:,}'.format(count_null))\n",
@@ -1047,7 +1043,7 @@
10471043
"name": "stdout",
10481044
"output_type": "stream",
10491045
"text": [
1050-
"rows: 93,117, columns: 36\n"
1046+
"rows: 93,113, columns: 36\n"
10511047
]
10521048
},
10531049
{
@@ -1281,46 +1277,46 @@
12811277
"output_type": "stream",
12821278
"text": [
12831279
"<class 'pandas.core.frame.DataFrame'>\n",
1284-
"RangeIndex: 93117 entries, 0 to 93116\n",
1280+
"RangeIndex: 93113 entries, 0 to 93112\n",
12851281
"Data columns (total 36 columns):\n",
12861282
" # Column Non-Null Count Dtype \n",
12871283
"--- ------ -------------- ----- \n",
1288-
" 0 the_geom 93117 non-null object \n",
1284+
" 0 the_geom 93113 non-null object \n",
12891285
" 1 NAME 239 non-null object \n",
1290-
" 2 BIN 93117 non-null int64 \n",
1291-
" 3 CNSTRCT_YR 93117 non-null float64\n",
1292-
" 4 LSTMODDATE 93117 non-null object \n",
1293-
" 5 LSTSTATYPE 93043 non-null object \n",
1294-
" 6 DOITT_ID 93117 non-null int64 \n",
1295-
" 7 HEIGHTROOF 91484 non-null float64\n",
1296-
" 8 FEAT_CODE 93115 non-null float64\n",
1297-
" 9 GROUNDELEV 92857 non-null float64\n",
1298-
" 10 SHAPE_AREA 93117 non-null int64 \n",
1299-
" 11 SHAPE_LEN 93117 non-null int64 \n",
1300-
" 12 BASE_BBL 93117 non-null float64\n",
1301-
" 13 MPLUTO_BBL 93117 non-null float64\n",
1302-
" 14 GEOMSOURCE 93060 non-null object \n",
1303-
" 15 borough 93117 non-null object \n",
1304-
" 16 block 93117 non-null int64 \n",
1305-
" 17 lot 93117 non-null int64 \n",
1306-
" 18 cd 93094 non-null float64\n",
1307-
" 19 ct2010 93094 non-null float64\n",
1308-
" 20 cb2010 93094 non-null float64\n",
1309-
" 21 council 93094 non-null float64\n",
1310-
" 22 zipcode 93056 non-null float64\n",
1311-
" 23 bldgclass 93092 non-null object \n",
1312-
" 24 landuse 92845 non-null float64\n",
1286+
" 2 BIN 93113 non-null int64 \n",
1287+
" 3 CNSTRCT_YR 93113 non-null float64\n",
1288+
" 4 LSTMODDATE 93113 non-null object \n",
1289+
" 5 LSTSTATYPE 93039 non-null object \n",
1290+
" 6 DOITT_ID 93113 non-null int64 \n",
1291+
" 7 HEIGHTROOF 91481 non-null float64\n",
1292+
" 8 FEAT_CODE 93111 non-null float64\n",
1293+
" 9 GROUNDELEV 92853 non-null float64\n",
1294+
" 10 SHAPE_AREA 93113 non-null int64 \n",
1295+
" 11 SHAPE_LEN 93113 non-null int64 \n",
1296+
" 12 BASE_BBL 93113 non-null float64\n",
1297+
" 13 MPLUTO_BBL 93113 non-null float64\n",
1298+
" 14 GEOMSOURCE 93056 non-null object \n",
1299+
" 15 borough 93113 non-null object \n",
1300+
" 16 block 93113 non-null int64 \n",
1301+
" 17 lot 93113 non-null int64 \n",
1302+
" 18 cd 93090 non-null float64\n",
1303+
" 19 ct2010 93090 non-null float64\n",
1304+
" 20 cb2010 93090 non-null float64\n",
1305+
" 21 council 93090 non-null float64\n",
1306+
" 22 zipcode 93052 non-null float64\n",
1307+
" 23 bldgclass 93088 non-null object \n",
1308+
" 24 landuse 92841 non-null float64\n",
13131309
" 25 ownertype 3242 non-null object \n",
1314-
" 26 borocode 93117 non-null int64 \n",
1315-
" 27 bbl 93117 non-null int64 \n",
1316-
" 28 tract2010 93094 non-null float64\n",
1317-
" 29 xcoord 93094 non-null float64\n",
1318-
" 30 ycoord 93094 non-null float64\n",
1319-
" 31 latitude 93094 non-null float64\n",
1320-
" 32 longitude 93094 non-null float64\n",
1321-
" 33 plutomapid 93117 non-null int64 \n",
1322-
" 34 firm07_flag 7912 non-null float64\n",
1323-
" 35 pfirm15_flag 10838 non-null float64\n",
1310+
" 26 borocode 93113 non-null int64 \n",
1311+
" 27 bbl 93113 non-null int64 \n",
1312+
" 28 tract2010 93090 non-null float64\n",
1313+
" 29 xcoord 93090 non-null float64\n",
1314+
" 30 ycoord 93090 non-null float64\n",
1315+
" 31 latitude 93090 non-null float64\n",
1316+
" 32 longitude 93090 non-null float64\n",
1317+
" 33 plutomapid 93113 non-null int64 \n",
1318+
" 34 firm07_flag 7913 non-null float64\n",
1319+
" 35 pfirm15_flag 10839 non-null float64\n",
13241320
"dtypes: float64(19), int64(9), object(8)\n",
13251321
"memory usage: 25.6+ MB\n"
13261322
]
@@ -1340,7 +1336,7 @@
13401336
"name": "stdout",
13411337
"output_type": "stream",
13421338
"text": [
1343-
"number of rows in new datframe: 93117\n"
1339+
"number of rows in new datframe: 93113\n"
13441340
]
13451341
},
13461342
{
@@ -1569,7 +1565,7 @@
15691565
},
15701566
{
15711567
"cell_type": "code",
1572-
"execution_count": 28,
1568+
"execution_count": 27,
15731569
"metadata": {},
15741570
"outputs": [],
15751571
"source": [
@@ -1578,7 +1574,7 @@
15781574
},
15791575
{
15801576
"cell_type": "code",
1581-
"execution_count": 29,
1577+
"execution_count": 28,
15821578
"metadata": {},
15831579
"outputs": [
15841580
{

0 commit comments

Comments
 (0)