Commit f77b777

fix: recursively download all files correctly
feat: refactor extract_files to use the metadata from the table
1 parent 17f60e3 commit f77b777

File tree

1 file changed: +16 -33 lines changed

main.py

Lines changed: 16 additions & 33 deletions
@@ -16,13 +16,8 @@
 
 # Global variables
 browser = None
-
 BASE_URL = 'https://cas-simsu.grenet.fr/login?service=http%3A%2F%2Fchamilo.univ-grenoble-alpes.fr%2Fmain%2Fauth%2Fcas%2Flogincas.php'
 
-valid_course_re = re.compile(".*courses\/[^UGA.*].*")
-valid_document_re = '/main/document/document.php'
-valid_file_ext = ['.jpg', '.png', '.docx', '.pdf']
-
 
 def get_credentials(username, password):
     '''Ask the user for credentials in case they are not provided.'''
@@ -65,6 +60,8 @@ def get_courses(courses_page, blacklist=[]):
     '''Extract all courses and their url from the main page of chamilo.'''
     # TODO: Allow to blacklist courses
     courses = set()
+    valid_course_re = re.compile(".*courses\/[^UGA.*].*")
+
     courses_soup = bs(courses_page, "html.parser")
     headers = courses_soup.findAll('h4', class_='course-items-title')
     for course in headers:
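
Note: the course regex is now compiled inside get_courses, the only function that uses it. The loop body that applies it falls outside this hunk, so the following is only a sketch of the presumed filtering; the helper name and the (title, url) tuple shape are assumptions, not code from this commit:

    import re

    # Pattern copied from the hunk above
    valid_course_re = re.compile(".*courses\/[^UGA.*].*")

    def pick_course_links(headers):
        '''Hypothetical helper: keep header links whose href matches the course pattern.
        `headers` is assumed to be the bs4 tags returned by findAll above.'''
        courses = set()
        for course in headers:
            a = course.find('a', href=True)
            if a and valid_course_re.match(a['href']):
                courses.add((a.text.strip(), a['href']))
        return courses

    # usage: courses = pick_course_links(courses_soup.findAll('h4', class_='course-items-title'))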
@@ -82,48 +79,33 @@ def get_documents_route(course_page):
     return None if not docs else docs[0]['href']
 
 
-def extract_files(base_path, course_name, page_url):
+def extract_files(base_path, course_name, page_url, files=set()):
     '''Recursively extract all documents names and urls from a course page.'''
-    files = set()
     res = browser.open(page_url)
     course = bs(res, 'html.parser')
-    cdheader = res.get('Content-Disposition', None)
-    if cdheader is not None:
-        value, params = cgi.parse_header(cdheader)
-        if value == 'attachment':
-            files.add((base_path / course_name, page_url))
-            return files
 
     table = course.find('table')
     if table is None:
         return files
 
-    # All file links are inside the table
-    for td in table.findAll('td'):
-        a = td.find('a', href=True)
-        if not a or not a.get('href'):
-            continue
+    rows = table.findAll('tr')[1:]
+    for row in rows:
+        cols = row.find_all('td')
 
-        file_name = a.text.strip()
-        file_url = a['href']
-        # Don't add the `go back` link
-        if file_name == course_name or not file_name:
-            continue
+        # The image in the first column corresponds to the type of link
+        img_desc = cols[0].find('img')
 
-        # Dirty way to know if the url is for a file or for a folder
-        file_extensions = file_name.split('.')
-        file_path = base_path / course_name / file_name
+        file_a = cols[1].find('a')
+        file_name = file_a.text.strip()
+        file_url = file_a['href']
 
-        if len(file_extensions) < 2 and len(file_extensions[-1]) < 3:
-            new_files = extract_files(base_path / course_name, file_name, file_url)
-            files = files.union(new_files)
+        if 'folder' in img_desc['src']:
+            extract_files(base_path / course_name, file_name, file_url, files)
         else:
-            files.add((file_path, file_url))
-
+            files.add((base_path / course_name / file_name, file_url))
 
     return files
 
-
 def run(base_path, username, password):
     '''
     Entry point of the scraper.
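
Note: extract_files now accumulates results into a single set that is threaded through every recursive call, and it tells folders from files by the icon in the table's first column instead of guessing from the file extension. One caveat worth flagging: a default argument like files=set() is evaluated once at function definition, so the same set is reused across separate top-level calls. A minimal standalone sketch of the same accumulator pattern using the usual None-default idiom; the entries structure and all names here are illustrative, not from the diff:

    from pathlib import Path

    def collect_files(base_path, name, entries, files=None):
        '''Gather (path, url) pairs into one set shared by all recursive calls.'''
        if files is None:  # fresh set per top-level call, unlike files=set()
            files = set()
        for entry_name, url, children in entries:
            if children is not None:  # a folder: recurse, passing the same set
                collect_files(base_path / name, entry_name, children, files)
            else:                      # a file: record its target path and url
                files.add((base_path / name / entry_name, url))
        return files

    # Example: one folder containing one file
    tree = [('notes', None, [('week1.pdf', 'http://example.test/w1', None)])]
    print(collect_files(Path('.'), 'course', tree))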
@@ -164,8 +146,9 @@ def run(base_path, username, password):
         else:
             parent = file_name.parent
             parent.mkdir(parents=True, exist_ok=True)
-            print(f"[DOWNLOAD] {file_name.relative_to(base_path)}")
+            print(f"[START] {file_name.relative_to(base_path)}", end='')
             download_file(file_name, file_url)
+            print(f"\r[DOWNLOAD] {file_name.relative_to(base_path)}")
     print()
 
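Note: the replacement print pair uses end='' to suppress the newline and a leading \r to return the cursor to column 0, so the [START] line is overwritten in place by the [DOWNLOAD] line once the transfer finishes. A small sketch of the same terminal trick; flush=True is an addition here, since without a newline the [START] text may sit in the output buffer for the whole download:

    from time import sleep

    for name in ['a.pdf', 'b.png']:
        print(f"[START] {name}", end='', flush=True)  # stay on the current line
        sleep(1)  # stands in for download_file(...)
        print(f"\r[DOWNLOAD] {name}")  # \r rewinds to column 0 and overwrites

This overwrite is clean here because "[DOWNLOAD]" is longer than "[START]"; a shorter replacement string would leave trailing characters from the first message visible.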