
Commit 417d0f7

padi scraper
1 parent 19ca115 commit 417d0f7

File tree

1 file changed: +76 −0


padi_scraper.py

Lines changed: 76 additions & 0 deletions
#!/usr/bin/python3

'''
A web scraper that collects all dive shops listed by PADI
'''

from time import sleep

import requests
import json

# Request pacing and pagination parameters
delay = 10
current_page = 0
total_pages = 118
size = 50
key = '516d8696d377'

diveshops = []
sponsored_diveshops = []

# Endpoint of the first results page; must be set before main() runs
url = ''


def save_files():
    '''Saves JSON data retrieved from the API to disk'''

    with open('diveshops.json', 'w') as diveshop_file:
        json.dump(diveshops, diveshop_file)

    with open('sponsored_diveshops.json', 'w') as sponsored_diveshop_file:
        json.dump(sponsored_diveshops, sponsored_diveshop_file)


def show_progress():
    '''Prints the scraper's current progress to the console'''

    processed_pages = current_page + 1
    percentage = processed_pages * 100 / total_pages
    print(f'{processed_pages} pages downloaded ({round(percentage, 2)}%)')


def main():
    '''Runs on a loop until all available endpoints have been consumed'''

    # Rebound inside the loop, so declare them global rather than local
    global url, current_page

    while True:

        # Make the request and parse the JSON data
        response = requests.get(url).text
        json_response = json.loads(response)

        # Extract from the response the data we are interested in saving
        diveshop_results = json_response['results']
        sponsored_results = json_response['sponsoredResults']

        # Append to the global lists each time, since the response is paginated
        diveshops.extend(diveshop_results)
        sponsored_diveshops.extend(sponsored_results)

        # Print progress to the console
        show_progress()

        try:
            # Attempt to fetch the next page from the API; the very last
            # page has no 'next' link, so the lookup raises KeyError
            url = json_response['_links']['next']['href']
            current_page += 1

            # Artificially delay the next request to avoid spamming the server
            sleep(delay)

        except KeyError:
            break

    save_files()
    print(f'{len(diveshops)} dive shops found')
    print(f'{len(sponsored_diveshops)} sponsored dive shops found')
    print('DONE!')


main()
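
For reference, the loop assumes a HAL-style paginated JSON response. A minimal sketch of the shape the parsing code relies on (the field names come from the code above; the URL and sample values are hypothetical):

# Hypothetical sketch of one page of the response, as the code above expects it.
# Field names are taken from the parsing code; the values are made up.
example_page = {
    'results': [
        {'name': 'Example Dive Shop'},        # collected into diveshops
    ],
    'sponsoredResults': [
        {'name': 'Example Sponsored Shop'},   # collected into sponsored_diveshops
    ],
    '_links': {
        # The final page omits 'next', which ends the loop via KeyError
        'next': {'href': 'https://example.com/dive-shops?page=2'},
    },
}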
