|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 | 7 | "# Data Analysis Using Python: A Beginner’s Guide Featuring NYC Open Data \n",
|
8 |
| - "## Part 3: Plotting and Data Visualization in Python\n", |
| 8 | + "### Data Wrangling for Part 3: Plotting and Data Visualization in Python\n", |
| 9 | + "Returning only buildings built between 1990 and 2020\n", |
9 | 10 | "\n",
|
10 | 11 | "Mark Bauer"
|
11 | 12 | ]
|
|
14 | 15 | "cell_type": "markdown",
|
15 | 16 | "metadata": {},
|
16 | 17 | "source": [
|
17 |
| - "# Datasets" |
| 18 | + "## Datasets" |
18 | 19 | ]
|
19 | 20 | },
|
20 | 21 | {
|
21 | 22 | "cell_type": "markdown",
|
22 | 23 | "metadata": {},
|
23 | 24 | "source": [
|
24 |
| - "## Building Footprints \n", |
| 25 | + "### Building Footprints \n", |
25 | 26 | "https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh\n",
|
26 | 27 | "\n",
|
27 | 28 | "\n"
|
|
31 | 32 | "cell_type": "markdown",
|
32 | 33 | "metadata": {},
|
33 | 34 | "source": [
|
34 |
| - "## PLUTO \n", |
| 35 | + "### PLUTO \n", |
35 | 36 | "https://data.cityofnewyork.us/City-Government/Primary-Land-Use-Tax-Lot-Output-Map-MapPLUTO-/f888-ni5f\n",
|
36 | 37 | "\n",
|
37 | 38 | ""
|
|
46 | 47 | },
|
47 | 48 | {
|
48 | 49 | "cell_type": "code",
|
49 |
| - "execution_count": 1, |
| 50 | + "execution_count": 14, |
50 | 51 | "metadata": {},
|
51 | 52 | "outputs": [],
|
52 | 53 | "source": [
|
53 | 54 | "# importing libraries\n",
|
54 |
| - "import pandas as pd # the pd is by convention\n", |
55 |
| - "import numpy as np # as is the np\n", |
56 |
| - "import matplotlib\n", |
57 |
| - "import matplotlib.pyplot as plt\n", |
58 |
| - "import matplotlib.ticker as ticker\n", |
59 |
| - "from matplotlib.ticker import FuncFormatter\n", |
60 |
| - "import seaborn as sns\n", |
61 |
| - "from scipy import stats\n", |
| 55 | + "import pandas as pd \n", |
| 56 | + "import numpy as np \n", |
62 | 57 | "import requests\n",
|
63 | 58 | "import os\n",
|
64 | 59 | "from io import BytesIO\n",
|
|
71 | 66 | },
|
72 | 67 | {
|
73 | 68 | "cell_type": "code",
|
74 |
| - "execution_count": 2, |
| 69 | + "execution_count": 15, |
75 | 70 | "metadata": {},
|
76 | 71 | "outputs": [
|
77 | 72 | {
|
|
110 | 105 | },
|
111 | 106 | {
|
112 | 107 | "cell_type": "code",
|
113 |
| - "execution_count": 15, |
| 108 | + "execution_count": 9, |
114 | 109 | "metadata": {},
|
115 | 110 | "outputs": [
|
116 | 111 | {
|
117 | 112 | "name": "stdout",
|
118 | 113 | "output_type": "stream",
|
119 | 114 | "text": [
|
120 |
| - "rows: 1,084,399, columns: 15\n" |
| 115 | + "rows: 1,084,499, columns: 15\n" |
121 | 116 | ]
|
122 | 117 | },
|
123 | 118 | {
|
|
283 | 278 | "4 Photogramm "
|
284 | 279 | ]
|
285 | 280 | },
|
286 |
| - "execution_count": 15, |
| 281 | + "execution_count": 9, |
287 | 282 | "metadata": {},
|
288 | 283 | "output_type": "execute_result"
|
289 | 284 | }
|
|
310 | 305 | "output_type": "stream",
|
311 | 306 | "text": [
|
312 | 307 | "<class 'pandas.core.frame.DataFrame'>\n",
|
313 |
| - "RangeIndex: 1084399 entries, 0 to 1084398\n", |
| 308 | + "Int64Index: 93527 entries, 5 to 1084160\n", |
314 | 309 | "Data columns (total 15 columns):\n",
|
315 |
| - " # Column Non-Null Count Dtype \n", |
316 |
| - "--- ------ -------------- ----- \n", |
317 |
| - " 0 the_geom 1084399 non-null object \n", |
318 |
| - " 1 NAME 1936 non-null object \n", |
319 |
| - " 2 BIN 1084399 non-null int64 \n", |
320 |
| - " 3 CNSTRCT_YR 1073525 non-null float64\n", |
321 |
| - " 4 LSTMODDATE 1084399 non-null object \n", |
322 |
| - " 5 LSTSTATYPE 1084173 non-null object \n", |
323 |
| - " 6 DOITT_ID 1084399 non-null int64 \n", |
324 |
| - " 7 HEIGHTROOF 1081721 non-null float64\n", |
325 |
| - " 8 FEAT_CODE 1084391 non-null float64\n", |
326 |
| - " 9 GROUNDELEV 1083857 non-null float64\n", |
327 |
| - " 10 SHAPE_AREA 1084399 non-null int64 \n", |
328 |
| - " 11 SHAPE_LEN 1084399 non-null int64 \n", |
329 |
| - " 12 BASE_BBL 1084395 non-null float64\n", |
330 |
| - " 13 MPLUTO_BBL 1082057 non-null float64\n", |
331 |
| - " 14 GEOMSOURCE 1084172 non-null object \n", |
| 310 | + " # Column Non-Null Count Dtype \n", |
| 311 | + "--- ------ -------------- ----- \n", |
| 312 | + " 0 the_geom 93527 non-null object \n", |
| 313 | + " 1 NAME 250 non-null object \n", |
| 314 | + " 2 BIN 93527 non-null int64 \n", |
| 315 | + " 3 CNSTRCT_YR 93527 non-null float64\n", |
| 316 | + " 4 LSTMODDATE 93527 non-null object \n", |
| 317 | + " 5 LSTSTATYPE 93449 non-null object \n", |
| 318 | + " 6 DOITT_ID 93527 non-null int64 \n", |
| 319 | + " 7 HEIGHTROOF 91832 non-null float64\n", |
| 320 | + " 8 FEAT_CODE 93525 non-null float64\n", |
| 321 | + " 9 GROUNDELEV 93251 non-null float64\n", |
| 322 | + " 10 SHAPE_AREA 93527 non-null int64 \n", |
| 323 | + " 11 SHAPE_LEN 93527 non-null int64 \n", |
| 324 | + " 12 BASE_BBL 93527 non-null float64\n", |
| 325 | + " 13 MPLUTO_BBL 93527 non-null float64\n", |
| 326 | + " 14 GEOMSOURCE 93467 non-null object \n", |
332 | 327 | "dtypes: float64(6), int64(4), object(5)\n",
|
333 |
| - "memory usage: 124.1+ MB\n" |
| 328 | + "memory usage: 11.4+ MB\n" |
334 | 329 | ]
|
335 | 330 | }
|
336 | 331 | ],
|
|
348 | 343 | "name": "stdout",
|
349 | 344 | "output_type": "stream",
|
350 | 345 | "text": [
|
351 |
| - "rows: 94,831, columns: 15\n" |
| 346 | + "rows: 93,527, columns: 15\n" |
352 | 347 | ]
|
353 | 348 | }
|
354 | 349 | ],
|
355 | 350 | "source": [
|
356 |
| - "# returning only building built between 1945 and 2020\n", |
| 351 | + "# returning only buildings built between 1990 and 2020\n", |
357 | 352 | "building_footprints = building_footprints.loc[building_footprints['CNSTRCT_YR'].between(1990, 2020)]\n",
|
358 | 353 | "\n",
|
359 | 354 | "# new shape of data\n",
|
|
370 | 365 | "name": "stdout",
|
371 | 366 | "output_type": "stream",
|
372 | 367 | "text": [
|
373 |
| - "count null: 1,302\n", |
| 368 | + "count null: 0\n", |
374 | 369 | "dropping nulls...\n",
|
| 370 | + "\n", |
375 | 371 | "count null: 0\n",
|
376 |
| - "rows: 93,529, columns: 15\n" |
| 372 | + "rows: 93,527, columns: 15\n" |
377 | 373 | ]
|
378 | 374 | }
|
379 | 375 | ],
|
|
382 | 378 | "count_null = building_footprints['MPLUTO_BBL'].isnull().sum()\n",
|
383 | 379 | "print('count null: {:,}'.format(count_null))\n",
|
384 | 380 | "\n",
|
385 |
| - "print('dropping nulls...')\n", |
| 381 | + "print('dropping nulls...\\n')\n", |
386 | 382 | "building_footprints = building_footprints.dropna(subset=['MPLUTO_BBL'])\n",
|
387 | 383 | "count_null = building_footprints['MPLUTO_BBL'].isnull().sum()\n",
|
388 | 384 | "print('count null: {:,}'.format(count_null))\n",
|
|
1047 | 1043 | "name": "stdout",
|
1048 | 1044 | "output_type": "stream",
|
1049 | 1045 | "text": [
|
1050 |
| - "rows: 93,117, columns: 36\n" |
| 1046 | + "rows: 93,113, columns: 36\n" |
1051 | 1047 | ]
|
1052 | 1048 | },
|
1053 | 1049 | {
|
|
1281 | 1277 | "output_type": "stream",
|
1282 | 1278 | "text": [
|
1283 | 1279 | "<class 'pandas.core.frame.DataFrame'>\n",
|
1284 |
| - "RangeIndex: 93117 entries, 0 to 93116\n", |
| 1280 | + "RangeIndex: 93113 entries, 0 to 93112\n", |
1285 | 1281 | "Data columns (total 36 columns):\n",
|
1286 | 1282 | " # Column Non-Null Count Dtype \n",
|
1287 | 1283 | "--- ------ -------------- ----- \n",
|
1288 |
| - " 0 the_geom 93117 non-null object \n", |
| 1284 | + " 0 the_geom 93113 non-null object \n", |
1289 | 1285 | " 1 NAME 239 non-null object \n",
|
1290 |
| - " 2 BIN 93117 non-null int64 \n", |
1291 |
| - " 3 CNSTRCT_YR 93117 non-null float64\n", |
1292 |
| - " 4 LSTMODDATE 93117 non-null object \n", |
1293 |
| - " 5 LSTSTATYPE 93043 non-null object \n", |
1294 |
| - " 6 DOITT_ID 93117 non-null int64 \n", |
1295 |
| - " 7 HEIGHTROOF 91484 non-null float64\n", |
1296 |
| - " 8 FEAT_CODE 93115 non-null float64\n", |
1297 |
| - " 9 GROUNDELEV 92857 non-null float64\n", |
1298 |
| - " 10 SHAPE_AREA 93117 non-null int64 \n", |
1299 |
| - " 11 SHAPE_LEN 93117 non-null int64 \n", |
1300 |
| - " 12 BASE_BBL 93117 non-null float64\n", |
1301 |
| - " 13 MPLUTO_BBL 93117 non-null float64\n", |
1302 |
| - " 14 GEOMSOURCE 93060 non-null object \n", |
1303 |
| - " 15 borough 93117 non-null object \n", |
1304 |
| - " 16 block 93117 non-null int64 \n", |
1305 |
| - " 17 lot 93117 non-null int64 \n", |
1306 |
| - " 18 cd 93094 non-null float64\n", |
1307 |
| - " 19 ct2010 93094 non-null float64\n", |
1308 |
| - " 20 cb2010 93094 non-null float64\n", |
1309 |
| - " 21 council 93094 non-null float64\n", |
1310 |
| - " 22 zipcode 93056 non-null float64\n", |
1311 |
| - " 23 bldgclass 93092 non-null object \n", |
1312 |
| - " 24 landuse 92845 non-null float64\n", |
| 1286 | + " 2 BIN 93113 non-null int64 \n", |
| 1287 | + " 3 CNSTRCT_YR 93113 non-null float64\n", |
| 1288 | + " 4 LSTMODDATE 93113 non-null object \n", |
| 1289 | + " 5 LSTSTATYPE 93039 non-null object \n", |
| 1290 | + " 6 DOITT_ID 93113 non-null int64 \n", |
| 1291 | + " 7 HEIGHTROOF 91481 non-null float64\n", |
| 1292 | + " 8 FEAT_CODE 93111 non-null float64\n", |
| 1293 | + " 9 GROUNDELEV 92853 non-null float64\n", |
| 1294 | + " 10 SHAPE_AREA 93113 non-null int64 \n", |
| 1295 | + " 11 SHAPE_LEN 93113 non-null int64 \n", |
| 1296 | + " 12 BASE_BBL 93113 non-null float64\n", |
| 1297 | + " 13 MPLUTO_BBL 93113 non-null float64\n", |
| 1298 | + " 14 GEOMSOURCE 93056 non-null object \n", |
| 1299 | + " 15 borough 93113 non-null object \n", |
| 1300 | + " 16 block 93113 non-null int64 \n", |
| 1301 | + " 17 lot 93113 non-null int64 \n", |
| 1302 | + " 18 cd 93090 non-null float64\n", |
| 1303 | + " 19 ct2010 93090 non-null float64\n", |
| 1304 | + " 20 cb2010 93090 non-null float64\n", |
| 1305 | + " 21 council 93090 non-null float64\n", |
| 1306 | + " 22 zipcode 93052 non-null float64\n", |
| 1307 | + " 23 bldgclass 93088 non-null object \n", |
| 1308 | + " 24 landuse 92841 non-null float64\n", |
1313 | 1309 | " 25 ownertype 3242 non-null object \n",
|
1314 |
| - " 26 borocode 93117 non-null int64 \n", |
1315 |
| - " 27 bbl 93117 non-null int64 \n", |
1316 |
| - " 28 tract2010 93094 non-null float64\n", |
1317 |
| - " 29 xcoord 93094 non-null float64\n", |
1318 |
| - " 30 ycoord 93094 non-null float64\n", |
1319 |
| - " 31 latitude 93094 non-null float64\n", |
1320 |
| - " 32 longitude 93094 non-null float64\n", |
1321 |
| - " 33 plutomapid 93117 non-null int64 \n", |
1322 |
| - " 34 firm07_flag 7912 non-null float64\n", |
1323 |
| - " 35 pfirm15_flag 10838 non-null float64\n", |
| 1310 | + " 26 borocode 93113 non-null int64 \n", |
| 1311 | + " 27 bbl 93113 non-null int64 \n", |
| 1312 | + " 28 tract2010 93090 non-null float64\n", |
| 1313 | + " 29 xcoord 93090 non-null float64\n", |
| 1314 | + " 30 ycoord 93090 non-null float64\n", |
| 1315 | + " 31 latitude 93090 non-null float64\n", |
| 1316 | + " 32 longitude 93090 non-null float64\n", |
| 1317 | + " 33 plutomapid 93113 non-null int64 \n", |
| 1318 | + " 34 firm07_flag 7913 non-null float64\n", |
| 1319 | + " 35 pfirm15_flag 10839 non-null float64\n", |
1324 | 1320 | "dtypes: float64(19), int64(9), object(8)\n",
|
1325 | 1321 | "memory usage: 25.6+ MB\n"
|
1326 | 1322 | ]
|
|
1340 | 1336 | "name": "stdout",
|
1341 | 1337 | "output_type": "stream",
|
1342 | 1338 | "text": [
|
1343 |
| - "number of rows in new datframe: 93117\n" |
| 1339 | + "number of rows in new datframe: 93113\n" |
1344 | 1340 | ]
|
1345 | 1341 | },
|
1346 | 1342 | {
|
|
1569 | 1565 | },
|
1570 | 1566 | {
|
1571 | 1567 | "cell_type": "code",
|
1572 |
| - "execution_count": 28, |
| 1568 | + "execution_count": 27, |
1573 | 1569 | "metadata": {},
|
1574 | 1570 | "outputs": [],
|
1575 | 1571 | "source": [
|
|
1578 | 1574 | },
|
1579 | 1575 | {
|
1580 | 1576 | "cell_type": "code",
|
1581 |
| - "execution_count": 29, |
| 1577 | + "execution_count": 28, |
1582 | 1578 | "metadata": {},
|
1583 | 1579 | "outputs": [
|
1584 | 1580 | {
|
|
0 commit comments