
Commit fd59b11

Merge pull request #1 from abshk-jr/main
Generator for Offline Documentation
2 parents 42cc8ed + 0ad9b4f commit fd59b11

File tree

5 files changed: +336 -0 lines changed


.github/workflows/release.yml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
name: release

on:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Run .py script for Doc Generation
        run: |
          cmake . -Bbuild
          cmake --build build

      - name: Upload zipped doc file to release
        uses: svenstaro/upload-release-action@v2
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: build/docs.zip
          asset_name: Generated-Offline-Manual.zip
          tag: Docs-Release
          overwrite: true
          body: "This is a release containing the Docs Generated using Github Actions."
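Since the only trigger above is workflow_dispatch, the release has to be started by hand from the repository's Actions tab, or programmatically through GitHub's workflow-dispatch REST endpoint. Below is a minimal sketch of the latter (not part of this commit), reusing the urllib module the generator script already depends on; owner, repo, and token are placeholders:

import json, urllib.request

def trigger_release(owner, repo, token, ref='main'):
    # POST /repos/{owner}/{repo}/actions/workflows/release.yml/dispatches
    url = f'https://api.github.com/repos/{owner}/{repo}/actions/workflows/release.yml/dispatches'
    request = urllib.request.Request(url, data=json.dumps({'ref': ref}).encode(), method='POST')
    request.add_header('Authorization', f'token {token}')
    request.add_header('Accept', 'application/vnd.github.v3+json')
    urllib.request.urlopen(request)  # the API answers 204 No Content on success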

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
cmake_minimum_required(VERSION 3.16)

find_package(Python3 COMPONENTS Interpreter)

set(Doc_Script ${CMAKE_CURRENT_SOURCE_DIR}/scripts/offline-doc-generator.py)

execute_process(COMMAND ${Python3_EXECUTABLE} -m pip install -r ${CMAKE_CURRENT_SOURCE_DIR}/scripts/requirements.txt)

add_custom_target(doc ALL COMMAND ${Python3_EXECUTABLE} ${Doc_Script})
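For orientation (an illustrative sketch, not part of this commit), the build file above behaves roughly like the following Python: the requirements are pip-installed once at configure time, and the `doc` target reruns the generator on every build:

import subprocess, sys

# configure step: execute_process(... pip install -r scripts/requirements.txt)
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'scripts/requirements.txt'])
# build step: add_custom_target(doc ALL COMMAND ${Python3_EXECUTABLE} ${Doc_Script})
subprocess.check_call([sys.executable, 'scripts/offline-doc-generator.py'])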

scripts/config.yml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
---
url : 'https://en.wikibooks.org/wiki/OpenSCAD_User_Manual'
url_css : 'https://en.wikipedia.org/w/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint,shared|skins.vector.styles&only=styles&skin=vector&*'
url_wiki : 'https://en.wikibooks.org'
url_api : 'https://en.wikibooks.org/w/api.php?action=parse&format=xml&prop=text&page='

pages_for_exclusion : ['https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/Example/Strandbeest']

user_agent_val : 'Generator-for-Offline-Documentation (https://github.com/abshk-jr ; https://github.com/opencax/GSoC/issues/6 ; https://summerofcode.withgoogle.com/projects/#6746958066089984) urllib/3.9.0 [BeautifulSoup/4.9.0]'
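For illustration (not part of this commit), this is how the generator script below combines these keys into a single MediaWiki API request for one page:

import yaml

with open('scripts/config.yml') as fh:
    cfg = yaml.safe_load(fh)

page = 'https://en.wikibooks.org/wiki/OpenSCAD_User_Manual'
api_request = cfg['url_api'] + page.replace(cfg['url_wiki'] + '/wiki/', '')
# -> 'https://en.wikibooks.org/w/api.php?action=parse&format=xml&prop=text&page=OpenSCAD_User_Manual'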

scripts/offline-doc-generator.py

Lines changed: 287 additions & 0 deletions
@@ -0,0 +1,287 @@
'''
This is the program for the Generator for Offline Documentation,
more about which can be found at https://github.com/opencax/GSoC/issues/6
and the GSoC project details for the same are present at
https://summerofcode.withgoogle.com/projects/#6746958066089984

'''
import os
import shutil
import urllib.request, urllib.parse, urllib.error
import yaml
from bs4 import BeautifulSoup as bs, Comment, Doctype

with open(os.path.join(os.path.dirname(__file__), 'config.yml'), 'r') as file:
    config = yaml.safe_load(file)

globals().update(config)

'''
The line above does the same work as the code given below
---------------------------------------------------------
url = config['url']
url_css = config['url_css']
url_wiki = config['url_wiki']
url_api = config['url_api']
pages_for_exclusion = config['pages_for_exclusion']
user_agent_val = config['user_agent_val']
---------------------------------------------------------
'''

dir_docs = 'openscad_docs'
dir_imgs = os.path.join(dir_docs, 'imgs')
dir_maths = os.path.join(dir_docs, 'imgs', 'maths')
dir_styles = os.path.join(dir_docs, 'styles')

# Create the directories to save the docs in if they don't exist
if not os.path.exists(dir_docs): os.makedirs(dir_docs)
if not os.path.exists(dir_imgs): os.makedirs(dir_imgs)
if not os.path.exists(dir_maths): os.makedirs(dir_maths)
if not os.path.exists(dir_styles): os.makedirs(dir_styles)

pages = []
pages += pages_for_exclusion
imgs = []
maths = []

def getUrl(url):
    '''
    This function generates the complete url after getting urls from src attributes:
    /wiki/OpenSCAD_User_Manual gets converted to https://en.wikibooks.org/wiki/OpenSCAD_User_Manual

    '''
    if url.startswith('//'):
        url = 'https:' + url
    elif not url.startswith(url_wiki):
        url = urllib.parse.urljoin(url_wiki, url[1:] if url[0] == '/' else url)
    return url
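# Illustrative examples (not part of the commit) of what getUrl returns, given
# url_wiki = 'https://en.wikibooks.org' from config.yml; the first path is hypothetical:
#   getUrl('//upload.wikimedia.org/logo.png') -> 'https://upload.wikimedia.org/logo.png'
#   getUrl('/wiki/OpenSCAD_User_Manual')      -> 'https://en.wikibooks.org/wiki/OpenSCAD_User_Manual'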
def getTags(soup):
    '''
    This function handles the different tags present in the HTML document,
    for example the image tags

    '''
    for a in soup.find_all('a'):
        href = a.get('href')
        if href:
            if href[0] != '#':
                href = getUrl(href)
                if (href.startswith('/wiki/OpenSCAD_User_Manual') or href.startswith(url_wiki + '/wiki/OpenSCAD_User_Manual')):
                    # turn the wiki path into a local filename, keeping any fragment
                    newhref = (href.replace('#', '.html#') if '#' in href else href + '.html').split('/')[-1]

                    if 'Print_version.html' not in newhref:
                        getPages(url=href)
                    a['href'] = newhref

        if a.img:
            getImages(a)
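# Illustrative example (not part of the commit) of the href rewriting above:
#   'https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/String_Functions#str'
#   becomes the local link 'String_Functions.html#str', and getPages() is
#   called on the original href so that the target page gets downloaded too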
def getMaths(soup):
    '''
    This function downloads the image versions of the math formulas
    displayed in various HTML files, for example
    https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/Mathematical_Operators
    and saves them to the directory /openscad_docs/imgs/maths

    '''
    for img in soup.find_all('img'):
        try:
            for cls in img['class']:
                if 'math' in cls:
                    mathname = img['src'].split("/")[-1].split("\\")[-1] + '.svg'
                    savepath = os.path.join(dir_maths, mathname)
                    if mathname not in maths:
                        opener = urllib.request.build_opener()
                        opener.addheaders = [('User-Agent', user_agent_val)]
                        urllib.request.install_opener(opener)
                        urllib.request.urlretrieve(img['src'], savepath)
                        maths.append(mathname)
                    linkurl = os.path.join('.', 'imgs/maths', mathname).replace('\\', '/')
                    img['src'] = linkurl

        except Exception:
            # images without a class attribute, or failed downloads, are skipped
            pass
def getImages(tag):
    '''
    This function downloads the images present in the HTML documents
    and saves them to the directory /openscad_docs/imgs

    '''
    src = getUrl(tag.img['src'])
    imgname = src.split("/")[-1]
    imgname = imgname.replace('%', '_')
    imgpath = os.path.join(dir_imgs, imgname)

    # The following is to download the image if it hasn't already been downloaded
    if imgpath not in imgs:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', user_agent_val)]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(src, imgpath)
        imgs.append(imgpath)

    del tag.img['srcset']
    imgpath = os.path.join('.', 'imgs', imgname).replace('\\', '/')
    tag.img['src'] = imgpath
    tag['href'] = imgpath
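# Illustrative example (not part of the commit), with a hypothetical filename:
#   an <a><img src=".../Example%20cube.png"></a> pair is saved to
#   openscad_docs/imgs/Example_20cube.png, and both src and href are rewritten
#   to './imgs/Example_20cube.png' ('%' is replaced by '_' to keep names portable)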
def cleanSoup(soup):
    '''
    This function cleans the soup by removing the redundant HTML tags
    and the parts that are unrelated to the User Manual
    '''

    # The following deletes the tags which aren't required in the User Manual
    red_div_cls = ["printfooter", "catlinks", "noprint", "magnify"]
    red_table_cls = ['noprint', 'ambox']
    red_input_cls = ['toctogglecheckbox']
    for cls in red_div_cls:
        for tag in soup.findAll('div', {'class': cls}):
            tag.decompose()
    for cls in red_table_cls:
        for tag in soup.findAll('table', {'class': cls}):
            tag.decompose()
    for cls in red_input_cls:
        for tag in soup.findAll('input', {'class': cls}):
            tag.decompose()
    for tag in soup.findAll('style'):
        tag.decompose()

    # The following removes the comments present in the HTML document
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # The following replaces the redundant div tags with the content inside them
    rep_div_cls = ["mw-highlight"]
    for kls in rep_div_cls:
        for tag in soup.findAll('div', kls):
            tag.replaceWithChildren()

    # The following cleans some redundant li tags
    for level in range(0, 7):
        for tag in soup.findAll('li', {'class': f'toclevel-{level}'}):
            del tag['class']

    # The following cleans up or removes some redundant span tags
    for tag in soup.findAll('span'):
        try:
            if len(tag.text) == 0:
                tag.decompose()
            for cls in tag['class']:
                if len(cls) <= 2:
                    tag.replaceWithChildren()
                if 'mathml' in cls:
                    tag.decompose()
                if cls in ['toctext']:
                    tag.replaceWithChildren()
                if cls in ['mw-headline']:
                    del tag['class']
                if cls in ['mw-editsection', 'toctogglespan', 'noprint']:
                    tag.decompose()

        except Exception:
            # spans without a class attribute, or already-decomposed tags
            pass

    for tag in soup.findAll('ul'):
        tag['style'] = 'list-style-image:none'
def getFooter(url, name):
    '''
    This function generates the footer with the license attribution for all the pages

    '''
    footer = (f'''<footer class='mw-body' style="font-size:13px;color:darkgray;text-align:center;margin-bottom:-1px">
    From the WikiBooks article <a style="color:black" href="{url}">{name}</a>
    (provided under <a style="color:black" href="https://creativecommons.org/licenses/by-sa/3.0/">
    CC-BY-SA-3.0</a>)</footer>''')

    return bs(footer, 'html.parser')
def getStyled(soup, title):
    '''
    This function adds the basic page scaffolding: a doctype, a charset,
    the stylesheet link, and a heading carrying the page title

    '''
    tag = Doctype('html')
    soup.insert(0, tag)
    soup.html['lang'] = 'en'
    meta_tag = soup.new_tag('meta')
    meta_tag['charset'] = 'UTF-8'
    soup.head.insert(0, meta_tag)
    css_tag = bs('<link rel="stylesheet" href="./styles/style.css">', 'html.parser')
    soup.head.append(css_tag)
    soup.body['class'] = 'mw-body'
    soup.body['style'] = ['height:auto']
    del soup.body.div['class']
    soup.body.div['id'] = 'bodyContent'
    h1_tag = bs(f'<h1 class="firstHeading" id="firstHeading">{title}</h1>', 'html.parser')
    soup.body.insert(0, h1_tag)
def getPages(url=url, folder=dir_docs):
    '''
    This is the main function of the program:
    it fetches the HTML document for the given url
    and calls the different functions that generate the offline
    version of the page and save it under the directory /openscad_docs

    '''
    url = getUrl(url)
    if url.split("#")[0] not in pages:
        pages.append(url.split("#")[0])  # add the url to the `pages` list so that it doesn't get downloaded again
        wiki_url = url
        url = url.replace(url_wiki + '/wiki/', "")
        url = url_api + url

        request = urllib.request.Request(url)
        request.add_header('User-Agent', user_agent_val)
        response = urllib.request.urlopen(request)
        xml = response.read()
        soup = bs(xml, 'lxml')
        soup = soup.text
        soup = bs(soup, 'html5lib')

        name = url.split("=")[-1]
        name = name.split("/")[-1].split('#')[0]  # to convert OpenSCAD_User_Manual/String_Functions#str to String_Functions

        title = soup.new_tag("title")  # to add a title to the page
        title.string = name.replace("_", " ")
        soup.html.head.append(title)

        name = name + ".html"
        filepath = os.path.join(folder, name)

        print("Saving: ", filepath)

        getStyled(soup, title.string)
        cleanSoup(soup)
        getMaths(soup)
        getTags(soup)

        soup.body.append(getFooter(wiki_url, title.text))

        open(filepath, "w", encoding="utf-8").write(str(soup))
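# Illustrative trace (not part of the commit) of a single call:
#   getPages('https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/String_Functions')
#   requests url_api + 'OpenSCAD_User_Manual/String_Functions' and writes the
#   cleaned, styled page to openscad_docs/String_Functions.html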
def getCSS():
    '''
    This function runs once after the HTML files have been downloaded;
    it downloads the CSS given at https://www.mediawiki.org/wiki/API:Styling_content
    and saves it to openscad_docs/styles

    '''
    request = urllib.request.Request(url_css)
    request.add_header('User-Agent', user_agent_val)
    response = urllib.request.urlopen(request)
    css_soup = response.read()
    css = bs(css_soup, 'html5lib')
    csspath = os.path.join(dir_styles, 'style.css')
    open(csspath, "w", encoding="utf-8").write(css.body.text)
if __name__ == '__main__':
    print(f'Started Offline Generator.py\nNow downloading the User-Manual from {url}')
    getPages(url)
    getCSS()
    print("Total number of pages generated is \t:\t", len(pages) - len(pages_for_exclusion))
    print("Total number of images generated is \t:\t", len(imgs))
    print("Total number of math-images generated is:\t", len(maths))
    shutil.make_archive('docs', 'zip', dir_docs)

scripts/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
beautifulsoup4==4.9.3
PyYAML==5.4.1
lxml==4.6.3
html5lib==1.1
