Skip to content

Commit 4794110

Browse files
authored
Merge pull request #14 from processing/multithreaded
Added parallel processing (a multiprocessing Pool) to the update script
2 parents e88fea6 + a03e748 commit 4794110

File tree

3 files changed

+27
-14
lines changed

3 files changed

+27
-14
lines changed

scripts/add_new_contribution_to_yaml.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
"""
22
given properties, add a new contribution to the contributions.yaml database file.
33
"""
4-
from datetime import datetime
4+
from datetime import datetime, UTC
55
import json
66
import pathlib
77
from sys import argv
@@ -39,7 +39,7 @@
3939

4040
# append new contribution with next index
4141
# add status, at top
42-
datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z')
42+
datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z')
4343
contribution = {
4444
'id': max_index + 1,
4545
'status': 'VALID',

scripts/fetch_updates.py

Lines changed: 24 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,16 @@
22
Reads in the contributions.yaml file, and updates the entries by hitting the 'source' url.
33
"""
44
import argparse
5-
from datetime import datetime
5+
from datetime import datetime, UTC
66
import pathlib
77
from ruamel.yaml import YAML
8+
from multiprocessing import Pool
89

910
from parse_and_validate_properties_txt import read_properties_txt, parse_text, validate_existing
1011

1112

1213
def update_contribution(contribution, props):
13-
datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z')
14+
datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z')
1415
contribution['lastUpdated'] = datetime_today
1516
if 'previousVersions' not in contribution:
1617
contribution['previousVersions'] = []
@@ -29,6 +30,7 @@ def update_contribution(contribution, props):
2930

3031
if 'download' not in contribution:
3132
contribution['download'] = contribution['source'][:contribution['source'].rfind('.')] + '.zip'
33+
3234

3335
def log_broken(contribution, msg):
3436
if contribution['status'] == 'VALID':
@@ -37,8 +39,10 @@ def log_broken(contribution, msg):
3739
contribution['log'] = []
3840
contribution['log'].append(msg)
3941

40-
def process_contribution(contribution):
41-
date_today = datetime.utcnow().strftime('%Y-%m-%d')
42+
def process_contribution(item):
43+
index, contribution = item
44+
45+
date_today = datetime.now(UTC).strftime('%Y-%m-%d')
4246
this_version = '0'
4347

4448
if contribution['status'] != 'DEPRECATED':
@@ -51,16 +55,16 @@ def process_contribution(contribution):
5155
properties_raw = read_properties_txt(contribution['source'])
5256
except FileNotFoundError as e:
5357
log_broken(contribution, f'file not found, {e}, {date_today}')
54-
return
58+
return index, contribution
5559
except Exception:
5660
log_broken(contribution, f'url timeout, {date_today}')
57-
return
61+
return index, contribution
5862

5963
try:
6064
props = validate_existing(parse_text(properties_raw))
6165
except Exception:
6266
log_broken(contribution, f'invalid file, {date_today}')
63-
return
67+
return index, contribution
6468

6569
# some library files have field lastUpdated. This also exists in the database, but is defined
6670
# by our scripts, so remove this field.
@@ -71,6 +75,7 @@ def process_contribution(contribution):
7175
if props['version'] != this_version:
7276
# update from online
7377
update_contribution(contribution, props)
78+
return index, contribution
7479

7580

7681
if __name__ == "__main__":
@@ -92,14 +97,22 @@ def process_contribution(contribution):
9297
contributions_list = data['contributions']
9398

9499
if index == 'all':
95-
# update all contributions
96-
for contribution in contributions_list:
97-
process_contribution(contribution)
100+
total = len(contributions_list)
101+
completed = 0
102+
print(f"Starting processing of {total} contributions...")
103+
104+
with Pool(processes=256) as pool:
105+
for index, contribution in pool.imap_unordered(process_contribution, enumerate(contributions_list)):
106+
contributions_list[index] = contribution
107+
completed += 1
108+
print(f"Progress: {completed}/{total} ({(completed/total*100):.1f}%)")
109+
110+
print("All processing complete")
98111
else:
99112
# update only contribution with id==index
100113
contribution = next((x for x in contributions_list if x['id'] == int(index)), None)
101114
print(contribution)
102-
process_contribution(contribution)
115+
process_contribution((index, contribution))
103116
print(contribution)
104117

105118
# write all contributions to database file

scripts/parse_and_validate_properties_txt.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ def read_properties_txt(properties_url):
6464
'User-Agent': 'Mozilla/5.0',
6565
'Accept': 'text/html',
6666
}
67-
r = requests.get(properties_url, headers=headers)
67+
r = requests.get(properties_url, headers=headers, timeout=30)
6868

6969
if r.status_code != 200:
7070
raise FileNotFoundError(f"status code {r.status_code} returned for url {r.url}")

0 commit comments

Comments
 (0)