Skip to content

Commit 17f60e3

Browse files
committed
feat: download all documents from chamilo
feat: change structure of script
1 parent 61861a4 commit 17f60e3

File tree

2 files changed

+164
-147
lines changed

2 files changed

+164
-147
lines changed

main.py

Lines changed: 163 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -2,166 +2,182 @@
22
#
33
# Script to download all documents from chamilo
44

5-
import mechanize
6-
import re
5+
import cgi
76
import getpass
7+
import http.cookiejar as cookielib
8+
import re
89
import sys
9-
import os
1010
from pathlib import Path
11+
from time import sleep
12+
import argparse
1113

14+
import mechanize
15+
from bs4 import BeautifulSoup as bs
1216

1317
# Global variables
14-
BASE_URL = 'https://cas-simsu.grenet.fr/login?service=http%3A%2F%2Fchamilo.univ-grenoble-alpes.fr%2Fmain%2Fauth%2Fcas%2Flogincas.php'
15-
br = None
16-
18+
browser = None
1719

18-
class ChamiloUGAScraper:
20+
BASE_URL = 'https://cas-simsu.grenet.fr/login?service=http%3A%2F%2Fchamilo.univ-grenoble-alpes.fr%2Fmain%2Fauth%2Fcas%2Flogincas.php'
1921

20-
course_re = re.compile('.*courses\/[^UGA.*].*')
22+
valid_course_re = re.compile(".*courses\/[^UGA.*].*")
23+
valid_document_re = '/main/document/document.php'
24+
valid_file_ext = ['.jpg', '.png', '.docx', '.pdf']
2125

22-
def __init__(self):
23-
self.browser = mechanize.Browser()
24-
self.__setup_browser()
25-
self.base_url = BASE_URL
26-
self.courses = []
27-
self.base_path = Path('.').resolve()
2826

29-
def get_credentials(self):
27+
def get_credentials(username, password):
    '''Return a (username, password) pair, prompting for whichever is missing.

    Values already supplied (e.g. from the command line) are passed through
    untouched; only `None` triggers an interactive prompt.
    '''
    # Prompt order matters for UX: username first, then hidden password.
    user = input('User: ') if username is None else username
    pwd = getpass.getpass(prompt='Password: ') if password is None else password
    return (user, pwd)
35+
36+
37+
def setup_browser():
    '''Build a mechanize browser configured for the Chamilo/CAS login flow.

    Returns a ready-to-use `mechanize.Browser` with cookie support, the
    usual HTTP handlers enabled, robots.txt ignored and a desktop
    User-Agent header set.
    '''
    br = mechanize.Browser()
    br.set_cookiejar(cookielib.LWPCookieJar())
    # Toggle the standard handlers; robots.txt must be ignored or the
    # scraper would be refused by the site's robots policy.
    for handler, enabled in (('equiv', True), ('gzip', True),
                             ('redirect', True), ('referer', True),
                             ('robots', False)):
        getattr(br, 'set_handle_' + handler)(enabled)
    # Follow meta-refresh redirects, but wait at most one second.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Present a real desktop UA so the server serves the normal pages.
    br.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36')
    ]
    return br
54+
55+
56+
def download_file(file_path, file_url):
    '''Download `file_url` and store the response body at `file_path`.

    Relies on the module-level `browser` already being logged in.
    `file_path` is a `pathlib.Path` (or anything `open` accepts).
    '''
    res = browser.open(file_url)
    # Context manager guarantees the file handle is closed even if the
    # read or write raises — the original leaked it on error.
    with open(file_path, 'wb') as fh:
        fh.write(res.read())
62+
63+
64+
def get_courses(courses_page, blacklist=None):
    '''Extract every course (name, url) pair from the Chamilo home page.

    Parameters:
        courses_page: HTML of the page listing the user's courses.
        blacklist: optional iterable of course names to skip (implements
            the previous TODO; default None keeps the old behavior).

    Returns:
        A set of (course_name, course_href) tuples.
    '''
    # `None` default instead of a mutable `[]` avoids the shared-default
    # argument pitfall.
    skip = set(blacklist) if blacklist else set()
    courses = set()
    courses_soup = bs(courses_page, "html.parser")
    # Each course is rendered as <h4 class="course-items-title"><a href=...>.
    for header in courses_soup.findAll('h4', class_='course-items-title'):
        a = header.find('a', href=True)
        # Guard against headers with no matching anchor (original crashed
        # with a TypeError on `a['href']` in that case).
        if a is None or not valid_course_re.search(a['href']):
            continue
        course_name = a.text.strip()
        if course_name in skip:
            continue
        courses.add((course_name, a['href']))
    return courses
76+
77+
78+
def get_documents_route(course_page):
    '''Return the URL of the course's Documents section, or None if absent.

    `course_page` is a parsed (BeautifulSoup) course page; the documents
    area is reached through the anchor whose text is exactly 'Documents'.
    '''
    for anchor in course_page.findAll('a', href=True):
        if anchor.text.strip() == 'Documents':
            # First match wins, same as the original filter()[0].
            return anchor['href']
    return None
83+
84+
85+
def extract_files(base_path, course_name, page_url):
    '''Recursively extract all documents names and urls from a course page.

    Opens `page_url` with the module-level `browser` and returns a set of
    (Path, url) tuples for every downloadable file found, recursing into
    sub-folders. `base_path` is a pathlib.Path; `course_name` becomes the
    directory component under which files are placed.
    '''
    files = set()
    res = browser.open(page_url)
    course = bs(res, 'html.parser')
    # If the response itself is a file (Content-Disposition: attachment),
    # the url points directly at a document, not at a folder listing.
    # NOTE(review): cgi.parse_header is deprecated (removed in Python 3.13);
    # email.message.Message would be the modern replacement — confirm before
    # upgrading the interpreter.
    cdheader = res.get('Content-Disposition', None)
    if cdheader is not None:
        value, params = cgi.parse_header(cdheader)
        if value == 'attachment':
            files.add((base_path / course_name, page_url))
            return files

    # Folder listings render their entries inside the page's first <table>;
    # no table means nothing to download here.
    table = course.find('table')
    if table is None:
        return files

    # All file links are inside the table
    for td in table.findAll('td'):
        a = td.find('a', href=True)
        if not a or not a.get('href'):
            continue

        file_name = a.text.strip()
        file_url = a['href']
        # Don't add the `go back` link
        if file_name == course_name or not file_name:
            continue

        # Dirty way to know if the url is for a file or for a folder
        # NOTE(review): with `and`, a dotless name only counts as a folder
        # when it is shorter than 3 characters — folders like 'Slides'
        # would be treated as files. The intent may have been `or`, or
        # `len(file_extensions) < 2` alone; verify against a real course page.
        file_extensions = file_name.split('.')
        file_path = base_path / course_name / file_name

        if len(file_extensions) < 2 and len(file_extensions[-1]) < 3:
            # Recurse into the sub-folder; its files nest one level deeper.
            new_files = extract_files(base_path / course_name, file_name, file_url)
            files = files.union(new_files)
        else:
            files.add((file_path, file_url))

    return files
125+
126+
127+
def run(base_path, username, password):
    '''Log into Chamilo and download every document of every course.

    Missing credentials are asked for interactively; each course found on
    the home page is walked and its documents are stored under `base_path`,
    skipping files that already exist on disk.
    '''
    # Authenticate through the CAS form (the first form on the page).
    username, password = get_credentials(username, password)
    browser.open(BASE_URL)
    browser.select_form(nr=0)
    browser.form['username'] = username
    browser.form['password'] = password
    res = browser.submit()

    # No courses on the landing page means the login was rejected.
    courses = get_courses(res)
    if not courses:
        # TODO: Better error handling
        print("There was a problem with the login")
        sys.exit(1)

    for course_name, course_href in courses:
        print(f'Course: {course_name}')
        course = bs(browser.open(course_href), 'html.parser')
        docs_link = get_documents_route(course)
        if docs_link:
            for file_name, file_url in extract_files(base_path, course_name, docs_link):
                if file_name.exists():
                    print(f'[EXISTS] {file_name.relative_to(base_path)}')
                    continue
                file_name.parent.mkdir(parents=True, exist_ok=True)
                print(f"[DOWNLOAD] {file_name.relative_to(base_path)}")
                download_file(file_name, file_url)
                sleep(1) # Throttle the connection
        # Blank line between courses, whether or not anything was found.
        print()
163171

164172

165173
if __name__ == '__main__':
    # Command-line interface: every option is optional; missing credentials
    # are prompted for interactively at run time.
    parser = argparse.ArgumentParser(description='UGA Chamilo scraper')
    parser.add_argument("-d", "--dir", required=False, default='.',
                        help='''Directory to download all courses. Defaults to the current directory.''')
    parser.add_argument("-u", "--username", required=False, help="Username")
    parser.add_argument("-p", "--password", required=False, help="Password")
    cli = parser.parse_args()

    browser = setup_browser()
    run(Path(cli.dir).absolute(), cli.username, cli.password)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
mechanize==0.4.5
2+
beautifulsoup4==4.8.1

0 commit comments

Comments
 (0)