
Commit 225fc45

Add script to check for broken links
1 parent d6ab8c5

File tree: 1 file changed (+95, -0)

checklinks.py

import os
import html.parser
import urllib.error
import urllib.parse
import urllib.request
import json

# Checks for broken links, preemptively saving all linked pages
# to the Internet Archive's Wayback Machine.
# Requires Python 3.
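#
# A minimal usage sketch (assumes an index.html sitting next to this script):
#
#     python3 checklinks.py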

self_dir = os.path.dirname(__file__)
html_path = os.path.join(self_dir, "index.html")

def extract_links_from_html_file(path):
    class LinkExtractParser(html.parser.HTMLParser):
        def __init__(self):
            super().__init__()
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                for (k, v) in attrs:
                    if k == "href":
                        self.links.append(v)

    parser = LinkExtractParser()

    # Parse the file passed in, not the global html_path
    with open(path) as file:
        parser.feed(file.read())

    def is_http_link(href):
        return href.startswith("https:") or href.startswith("http:")

    # Drop duplicates, filter out non-http links, sort
    links = list(filter(is_http_link, sorted(set(parser.links))))
    return links
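
# A hypothetical example of what the extractor returns, assuming the page
# contains <a href="https://example.com"> plus a relative <a href="#top">:
#
#     extract_links_from_html_file(html_path)
#     # -> ["https://example.com"]   (non-http(s) hrefs are filtered out)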

def try_fetch_webpage(url):
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries an HTTP status code;
        # a plain URLError carries only a reason (e.g. a DNS failure).
        if hasattr(e, 'code'):
            return {"error": str(e.code)}
        elif hasattr(e, 'reason'):
            return {"error": e.reason}
        return {"error": str(e)}
    else:
        if response.geturl() != url:
            return {"redirect": response.geturl()}
        else:
            return {"success": True}

def get_wayback_machine_archived_page(url):
    # Percent-encode the URL so any query string in it can't break the API call
    api_url = "https://archive.org/wayback/available?url=" + urllib.parse.quote(url, safe="")
    response = urllib.request.urlopen(api_url)
    data = json.loads(response.read())
    if "closest" in data["archived_snapshots"]:
        if data["archived_snapshots"]["closest"]["available"]:
            return data["archived_snapshots"]["closest"]["url"]
    return None
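
# Roughly the JSON the availability API returns when a snapshot exists
# (field values illustrative only):
#
#     {"archived_snapshots": {"closest": {
#         "available": true, "status": "200",
#         "timestamp": "20190101000000",
#         "url": "http://web.archive.org/web/20190101000000/https://example.com/"}}}
#
# When nothing is archived, "archived_snapshots" is an empty object.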

def save_in_wayback_machine(url):
    api_url = "https://web.archive.org/save/" + url
    response = urllib.request.urlopen(api_url)
    # A successful save reports the new snapshot's path in Content-Location
    if "Content-Location" in response.info() and response.info()["Content-Location"].startswith("/web/"):
        return "https://web.archive.org" + response.info()["Content-Location"]
    return None
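
# The check above expects a successful save to answer with a header like
# (timestamp illustrative):
#
#     Content-Location: /web/20190101000000/https://example.com/
#
# which gets prefixed with https://web.archive.org to form the snapshot URL.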

urls = extract_links_from_html_file(html_path)
for url in urls:
    result = try_fetch_webpage(url)

    if "success" in result:
        print("OK {}".format(url))
        # Live page: archive it now if the Wayback Machine has no copy yet
        if get_wayback_machine_archived_page(url) is None:
            try:
                wayback_url = save_in_wayback_machine(url)
                print(" Archived as: {}".format(wayback_url))
            except Exception:
                pass

    elif "redirect" in result:
        print("REDIRECT {}".format(url))
        print(" Redirects to: {}".format(result["redirect"]))

    elif "error" in result:
        print("BROKEN {}".format(url))
        print(" Reason: {}".format(result["error"]))

    if "success" not in result:
        # Broken or redirected: point at an existing archived copy, if any
        try:
            wayback_url = get_wayback_machine_archived_page(url)
            if wayback_url is not None:
                print(" Archived copy: {}".format(wayback_url))
        except Exception:
            pass
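
# Illustrative output (URLs and statuses invented for this sketch):
#
#     OK https://example.com/alive
#     BROKEN https://example.com/gone
#      Reason: 404
#      Archived copy: http://web.archive.org/web/20190101000000/https://example.com/gone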
