 # Global variables
 browser = None
-
 BASE_URL = 'https://cas-simsu.grenet.fr/login?service=http%3A%2F%2Fchamilo.univ-grenoble-alpes.fr%2Fmain%2Fauth%2Fcas%2Flogincas.php'

-valid_course_re = re.compile(".*courses\/[^UGA.*].*")
-valid_document_re = '/main/document/document.php'
-valid_file_ext = ['.jpg', '.png', '.docx', '.pdf']
-

 def get_credentials(username, password):
     '''Ask the user for credentials in case they are not provided.'''
@@ -65,6 +60,8 @@ def get_courses(courses_page, blacklist=[]):
     '''Extract all courses and their url from the main page of chamilo.'''
     # TODO: Allow to blacklist courses
     courses = set()
+    valid_course_re = re.compile(".*courses\/[^UGA.*].*")
+
     courses_soup = bs(courses_page, "html.parser")
     headers = courses_soup.findAll('h4', class_='course-items-title')
     for course in headers:
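
A note on the relocated filter: `[^UGA.*]` is a character class, not a literal-prefix exclusion, so the pattern rejects any link whose path segment right after `courses/` starts with `U`, `G`, `A`, `.`, or `*`, rather than only the `UGA` prefix. A quick self-contained check, using made-up course URLs for illustration:

    import re

    valid_course_re = re.compile(".*courses\/[^UGA.*].*")
    assert valid_course_re.match("http://chamilo.univ-grenoble-alpes.fr/courses/MAT404/")
    assert not valid_course_re.match("http://chamilo.univ-grenoble-alpes.fr/courses/UGA_NEWS/")
    # Side effect of the character class: codes starting with a bare 'A' or 'G'
    # are filtered out as well, e.g. courses/ALG1/ does not match.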
@@ -82,48 +79,33 @@ def get_documents_route(course_page):
     return None if not docs else docs[0]['href']


-def extract_files(base_path, course_name, page_url):
+def extract_files(base_path, course_name, page_url, files=set()):
     '''Recursively extract all documents names and urls from a course page.'''
-    files = set()
     res = browser.open(page_url)
     course = bs(res, 'html.parser')
-    cdheader = res.get('Content-Disposition', None)
-    if cdheader is not None:
-        value, params = cgi.parse_header(cdheader)
-        if value == 'attachment':
-            files.add((base_path / course_name, page_url))
-            return files

     table = course.find('table')
     if table is None:
         return files

-    # All file links are inside the table
-    for td in table.findAll('td'):
-        a = td.find('a', href=True)
-        if not a or not a.get('href'):
-            continue
+    rows = table.findAll('tr')[1:]
+    for row in rows:
+        cols = row.find_all('td')

-        file_name = a.text.strip()
-        file_url = a['href']
-        # Don't add the `go back` link
-        if file_name == course_name or not file_name:
-            continue
+        # The image in the first column corresponds to the type of link
+        img_desc = cols[0].find('img')

-        # Dirty way to know if the url is for a file or for a folder
-        file_extensions = file_name.split('.')
-        file_path = base_path / course_name / file_name
+        file_a = cols[1].find('a')
+        file_name = file_a.text.strip()
+        file_url = file_a['href']

-        if len(file_extensions) < 2 and len(file_extensions[-1]) < 3:
-            new_files = extract_files(base_path / course_name, file_name, file_url)
-            files = files.union(new_files)
+        if 'folder' in img_desc['src']:
+            extract_files(base_path / course_name, file_name, file_url, files)
         else:
-            files.add((file_path, file_url))
-
+            files.add((base_path / course_name / file_name, file_url))

     return files

-
 def run(base_path, username, password):
     '''
     Entry point of the scraper.
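
A caveat on the new signature: `files=set()` is evaluated once, at definition time, so every call that relies on the default shares a single accumulator, and documents collected for one course would leak into the next course scraped in the same run. The usual idiom keeps the explicit accumulator for the recursion but gives each top-level call a fresh set; a minimal sketch, not the committed code:

    def extract_files(base_path, course_name, page_url, files=None):
        '''Recursively extract all documents names and urls from a course page.'''
        if files is None:  # fresh accumulator for each top-level call
            files = set()
        ...

The rewrite also changes how folders are detected: a link is treated as a folder when the icon in the row's first column has "folder" in its `src`, replacing the old filename-extension heuristic.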
@@ -164,8 +146,9 @@ def run(base_path, username, password):
         else:
             parent = file_name.parent
             parent.mkdir(parents=True, exist_ok=True)
-            print(f"[DOWNLOAD] {file_name.relative_to(base_path)}")
+            print(f"[START] {file_name.relative_to(base_path)}", end='')
             download_file(file_name, file_url)
+            print(f"\r[DOWNLOAD] {file_name.relative_to(base_path)}")
         sleep(1)  # Throttle the connection
     print()
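
The paired prints implement a simple single-line progress display: `[START]` is written without a trailing newline, and once `download_file` returns, the `\r` moves the cursor back to the start of the line so `[DOWNLOAD]` overwrites it (complete overwrite, since `[DOWNLOAD]` is longer than `[START]`). A standalone sketch of the same pattern, with a sleep standing in for the download:

    from time import sleep

    print("[START] notes.pdf", end='', flush=True)  # flush so the line shows before the download
    sleep(1)  # stand-in for download_file(...)
    print("\r[DOWNLOAD] notes.pdf")  # \r rewinds to column 0 and overwrites the line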