Skip to content

Commit 61861a4

Browse files
committed
feat: initial payload
1 parent 13300d8 commit 61861a4

File tree

2 files changed

+168
-0
lines changed

2 files changed

+168
-0
lines changed

main.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
#!/usr/bin/env python
2+
#
3+
# Script to download all documents from chamilo
4+
5+
import mechanize
6+
import re
7+
import getpass
8+
import sys
9+
import os
10+
from pathlib import Path
11+
12+
13+
# Module-level configuration.

# CAS login page. The ``service`` query parameter carries the URL-encoded
# Chamilo ``logincas.php`` address.
BASE_URL = (
    'https://cas-simsu.grenet.fr/login'
    '?service=http%3A%2F%2Fchamilo.univ-grenoble-alpes.fr%2Fmain%2Fauth%2Fcas%2Flogincas.php'
)

# Module-wide browser placeholder; starts out unset.
br = None
16+
17+
18+
class ChamiloUGAScraper:
    """Log in to Chamilo through the CAS form and inspect a course's documents.

    Typical use is ``ChamiloUGAScraper().run()``: the user is prompted for
    credentials, the first form of the login page is submitted, the course
    links of the resulting page are collected, and the links of the selected
    course's ``Documents`` page are printed.
    """

    # Matches URLs containing ``courses/`` followed by a character other than
    # ``U``, ``G``, ``A``, ``.`` or ``*``. Written as a raw string with a plain
    # ``/``: the former ``'\/'`` was an invalid escape sequence in a normal
    # string literal. The set of matched URLs is unchanged.
    # NOTE(review): ``[^UGA.*]`` is a character class, not "does not start
    # with UGA". If the intent was to skip course codes beginning with "UGA",
    # ``(?!UGA)`` is the construct to use -- confirm before changing.
    course_re = re.compile(r'.*courses/[^UGA.*].*')

    # Course title processed when ``run`` is called without an argument.
    DEFAULT_COURSE = 'M2-EEA-WICS Wireless Communications'

    # URL fragment a link must contain to be reported by ``run``.
    DOCUMENT_PATH = '/main/document/document.php'

    def __init__(self):
        """Create the browser, configure it and initialise the scraper state."""
        self.browser = mechanize.Browser()
        self.__setup_browser()
        self.base_url = BASE_URL
        # Course links collected by ``get_courses``.
        self.courses = []
        # Directory the process was started in; ``run`` returns to it.
        self.base_path = Path('.').resolve()

    def get_credentials(self):
        """Prompt for the user name and (without echo) the password.

        Returns:
            A ``(username, password)`` tuple of strings.
        """
        username = input('User: ')
        password = getpass.getpass(prompt='Password: ')
        return (username, password)

    def __setup_browser(self):
        """Configure cookie handling, robots/refresh policy and the user agent."""
        cookie_jar = mechanize.LWPCookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookie_jar))
        mechanize.install_opener(opener)
        self.browser.set_handle_robots(False)
        self.browser.set_handle_refresh(False)
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        self.browser.addheaders = [('User-agent', 'Firefox')]

    def download_file(self):
        """Placeholder for the file download step; not implemented yet."""
        pass

    def get_courses(self):
        """Append every link of the current page matching ``course_re`` to ``self.courses``."""
        self.courses.extend(
            link for link in self.browser.links()
            if self.course_re.search(link.url)
        )

    @staticmethod
    def _find_link_by_text(links, text):
        """Return the last link in *links* whose ``text`` equals *text*, or ``None``."""
        found = None
        for link in links:
            if link.text == text:
                found = link
        return found

    @classmethod
    def _is_document_link(cls, link):
        """Return True when *link* targets ``DOCUMENT_PATH`` and has a non-empty title."""
        # ``.get`` rather than indexing: not every anchor has a ``title``.
        title = dict(link.attrs).get('title')
        return cls.DOCUMENT_PATH in link.url and bool(title)

    def run(self, course_title=None):
        """Log in, open one course and print the links of its ``Documents`` page.

        Args:
            course_title: Exact link text of the course to open. Defaults to
                ``DEFAULT_COURSE``.

        Raises:
            RuntimeError: If the course link or its ``Documents`` link cannot
                be found on the page.
        """
        if course_title is None:
            course_title = self.DEFAULT_COURSE

        # Log the user in to access all courses
        username, password = self.get_credentials()

        self.browser.open(self.base_url)
        self.browser.select_form(nr=0)
        self.browser.form['username'] = username
        self.browser.form['password'] = password

        # TODO: Handle errors
        self.browser.submit()

        self.get_courses()
        course = self._find_link_by_text(self.courses, course_title)
        if course is None:
            raise RuntimeError(
                f'Course {course_title!r} not found among {len(self.courses)} '
                'course link(s); the login may have failed'
            )

        # TODO: Download courses asynchronously
        course_path = Path(course.text)
        course_path.mkdir(parents=True, exist_ok=True)

        # Enter into `Documents` directory and download everything
        print(f'Accessing course: {course.text}')
        os.chdir(course_path)
        try:
            self.browser.follow_link(course)

            docs_link = self._find_link_by_text(self.browser.links(), 'Documents')
            if docs_link is None:
                raise RuntimeError(
                    f"No 'Documents' link found for course {course.text!r}"
                )
            self.browser.follow_link(docs_link)

            # TODO: Useful links contain title
            for link in self.browser.links():
                print(dict(link.attrs))
                # Test the link being iterated, not a leftover from the
                # previous loop.
                if self._is_document_link(link):
                    print(link)
        finally:
            # Always return to the start directory, even on failure.
            os.chdir(self.base_path)
96+
97+
98+
99+
# def download_file(doc_url, filename='', referer=''):
100+
# '''
101+
# Download the given file
102+
# '''
103+
# res = br.click_link(doc_url)
104+
# if referer:
105+
# res.add_header("Referer", referer)
106+
107+
# doc_res = br.open(res)
108+
109+
# # TODO: Check if file exists
110+
# if Path(filename).exists():
111+
# print(f'File {filename} already exists')
112+
# else:
113+
# f = open(filename, 'wb')
114+
# f.write(doc_res.read())
115+
# f.close()
116+
# print(f'File {filename} has been downloaded')
117+
# br.back()
118+
119+
120+
# # TODO: Handle all courses
121+
# for course in courses:
122+
# print(f'Course: {course.text}')
123+
# course_dir = Path(course.text)
124+
# course_dir.mkdir(parents=True, exist_ok=True)
125+
126+
# sys.exit(0)
127+
# request = br.click_link(link)
128+
# response = br.follow_link(link)
129+
130+
# print('--------------------')
131+
# print('Entered in Signal integrity course')
132+
133+
# # Open the documents pages
134+
# docs = None
135+
# for l in br.links():
136+
# if l.text == 'Documents':
137+
# docs = l
138+
139+
140+
# request = br.click_link(docs)
141+
# response = br.follow_link(docs)
142+
143+
# print('--------------------')
144+
# print('Entered in Signal integrity documents')
145+
146+
# # This page should display the documents list
147+
# slides = None
148+
# slides_re = re.compile('^Slides.*')
149+
# for l in br.links():
150+
# if slides_re.search(l.text):
151+
# slides = l
152+
153+
# request = br.click_link(slides)
154+
# response = br.follow_link(slides)
155+
156+
# print('--------------------')
157+
# print('Entered in Signal integrity slides listing')
158+
159+
# for l in br.links():
160+
# if 'pdf' in l.url and l.text: # Is it necessary to check for pdf?
161+
# name = l.text + '.pdf'
162+
# download_file(l, filename=name, referer=br.geturl())
163+
164+
165+
if __name__ == '__main__':
    # Script entry point: build the scraper and start its run.
    ChamiloUGAScraper().run()

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
mechanize==0.4.5

0 commit comments

Comments
 (0)