'''
This is the program for the Generator for offline documentation,
more about which can be found at https://github.com/opencax/GSoC/issues/6
and the GSoC project details for the same are available at
https://summerofcode.withgoogle.com/projects/#6746958066089984
'''
import os
import shutil
import urllib.error
import urllib.parse
import urllib.request

import yaml
from bs4 import BeautifulSoup as bs, Comment, Doctype

with open(os.path.join(os.path.dirname(__file__), 'config.yml'), 'r') as file:
    config = yaml.safe_load(file)

globals().update(config)

'''
The line above does the same work as the code given below
---------------------------------------------------------
url = config['url']
url_css = config['url_css']
url_wiki = config['url_wiki']
url_api = config['url_api']
pages_for_exclusion = config['pages_for_exclusion']
user_agent_val = config['user_agent_val']
---------------------------------------------------------
'''
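
# For reference, a config.yml along these lines is expected; the values below
# are an illustrative sketch only (url and url_wiki follow the examples used in
# the docstrings further down, url_css and url_api are assumptions here):
#
#   url: https://en.wikibooks.org/wiki/OpenSCAD_User_Manual
#   url_wiki: https://en.wikibooks.org
#   url_css: <stylesheet URL, see https://www.mediawiki.org/wiki/API:Styling_content>
#   url_api: <MediaWiki API prefix used to fetch the parsed pages>
#   pages_for_exclusion: []
#   user_agent_val: <User-Agent string sent with every request>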

dir_docs = 'openscad_docs'
dir_imgs = os.path.join(dir_docs, 'imgs')
dir_maths = os.path.join(dir_docs, 'imgs', 'maths')
dir_styles = os.path.join(dir_docs, 'styles')

# Create the directories to save the docs into if they don't already exist
os.makedirs(dir_docs, exist_ok=True)
os.makedirs(dir_imgs, exist_ok=True)
os.makedirs(dir_maths, exist_ok=True)
os.makedirs(dir_styles, exist_ok=True)

pages = []
pages += pages_for_exclusion
imgs = []
maths = []

def getUrl(url):
    '''
    This function generates the complete url from the urls found in src/href attributes:
    /wiki/OpenSCAD_User_Manual gets converted to https://en.wikibooks.org/wiki/OpenSCAD_User_Manual
    '''
    if url.startswith('//'):
        url = 'https:' + url
    elif not url.startswith(url_wiki):
        url = urllib.parse.urljoin(url_wiki, url[1:] if url.startswith('/') else url)
    return url
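
# A quick sketch of the expected behaviour (assuming url_wiki is
# 'https://en.wikibooks.org', as in the docstring example above):
#   getUrl('/wiki/OpenSCAD_User_Manual')   -> 'https://en.wikibooks.org/wiki/OpenSCAD_User_Manual'
#   getUrl('//upload.wikimedia.org/x.png') -> 'https://upload.wikimedia.org/x.png'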

def getTags(soup):
    '''
    This function handles the different tags present in the HTML document,
    for example the anchor and image tags
    '''
    for a in soup.find_all('a'):
        href = a.get('href')
        if href:
            if href[0] != '#':
                href = getUrl(href)
                if href.startswith('/wiki/OpenSCAD_User_Manual') or href.startswith(url_wiki + '/wiki/OpenSCAD_User_Manual'):
                    newhref = (href.replace('#', '.html#') if '#' in href else href + '.html').split('/')[-1]
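                    # For illustration (assuming url_wiki = 'https://en.wikibooks.org'):
                    #   https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/String_Functions#str
                    # becomes the local link
                    #   String_Functions.html#str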

                    if 'Print_version.html' not in newhref:
                        getPages(url=href)
                    a['href'] = newhref

        if a.img:
            getImages(a)

def getMaths(soup):
    '''
    This function generates the image version of the math formulas
    to be displayed in various HTML files, for example
    https://en.wikibooks.org/wiki/OpenSCAD_User_Manual/Mathematical_Operators
    and saves them to the directory /openscad_docs/imgs/maths
    '''
    for img in soup.find_all('img'):
        try:
            for cls in img['class']:
                if 'math' in cls:
                    mathname = img['src'].split("/")[-1].split("\\")[-1] + '.svg'
                    savepath = os.path.join(dir_maths, mathname)
                    if mathname not in maths:
                        opener = urllib.request.build_opener()
                        opener.addheaders = [('User-Agent', user_agent_val)]
                        urllib.request.install_opener(opener)
                        urllib.request.urlretrieve(img['src'], savepath)
                        maths.append(mathname)
                    linkurl = os.path.join('.', 'imgs/maths', mathname).replace('\\', '/')
                    img['src'] = linkurl
        except Exception:
            # Images without a class attribute (or failed downloads) are skipped
            pass
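
# Note: the build_opener/install_opener dance above sets a custom User-Agent on
# urlretrieve because Wikimedia servers typically reject urllib's default one;
# the same pattern is repeated in getImages() below.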

def getImages(tag):
    '''
    This function downloads the images present in the HTML documents
    and saves them to the directory /openscad_docs/imgs
    '''
    src = getUrl(tag.img['src'])
    imgname = src.split("/")[-1]
    imgname = imgname.replace('%', '_')
    imgpath = os.path.join(dir_imgs, imgname)

    # The following is to download the image if it hasn't already been downloaded
    if imgpath not in imgs:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', user_agent_val)]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(src, imgpath)
        imgs.append(imgpath)

    del tag.img['srcset']
    imgpath = os.path.join('.', 'imgs', imgname).replace('\\', '/')
    tag.img['src'] = imgpath
    tag['href'] = imgpath

def cleanSoup(soup):
    '''
    This function cleans the soup by removing the redundant HTML tags
    and the parts that are unrelated to the User Manual
    '''

    # The following deletes the tags which aren't required in the User Manual
    red_div_cls = ["printfooter", "catlinks", "noprint", "magnify"]
    red_table_cls = ['noprint', 'ambox']
    red_input_cls = ['toctogglecheckbox']
    for cls in red_div_cls:
        for tag in soup.find_all('div', {'class': cls}):
            tag.decompose()
    for cls in red_table_cls:
        for tag in soup.find_all('table', {'class': cls}):
            tag.decompose()
    for cls in red_input_cls:
        for tag in soup.find_all('input', {'class': cls}):
            tag.decompose()
    for tag in soup.find_all('style'):
        tag.decompose()

    # The following removes the comments present in the HTML document
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # The following replaces the redundant div tags with the content present inside of them
    rep_div_cls = ["mw-highlight"]
    for cls in rep_div_cls:
        for tag in soup.find_all('div', cls):
            tag.replaceWithChildren()

    # The following is for the cleaning of some redundant li tags
    for level in range(0, 7):
        for tag in soup.find_all('li', {'class': f'toclevel-{level}'}):
            del tag['class']

    # The following is for the cleaning/removal of some redundant span tags
    for tag in soup.find_all('span'):
        try:
            if len(tag.text) == 0:
                tag.decompose()
            for cls in tag['class']:
                if len(cls) <= 2:
                    tag.replaceWithChildren()
                if 'mathml' in cls:
                    tag.decompose()
                if cls in ['toctext']:
                    tag.replaceWithChildren()
                if cls in ['mw-headline']:
                    del tag['class']
                if cls in ['mw-editsection', 'toctogglespan', 'noprint']:
                    tag.decompose()
        except Exception:
            # Spans without a class attribute (or already decomposed above) end up here
            pass

    for tag in soup.find_all('ul'):
        tag['style'] = 'list-style-image:none'

def getFooter(url, name):
    '''
    This function generates the footer with the license attribution for all the pages
    '''
    footer = (f'''<footer class='mw-body' style="font-size:13px;color:darkgray;text-align:center;margin-bottom:-1px">
                From the Wikibooks article <a style="color:black" href="{url}">{name}</a>
                (provided under <a style="color:black" href="https://creativecommons.org/licenses/by-sa/3.0/">
                CC-BY-SA-3.0</a>)</footer>''')

    return bs(footer, 'html.parser')

def getStyled(soup, title):
    '''
    This function adds the basic HTML boilerplate to the page: the doctype,
    charset, a link to the downloaded style.css, and the page heading
    '''
    tag = Doctype('html')
    soup.insert(0, tag)
    soup.html['lang'] = 'en'
    meta_tag = soup.new_tag('meta')
    meta_tag['charset'] = 'UTF-8'
    soup.head.insert(0, meta_tag)
    css_tag = bs('<link rel="stylesheet" href="./styles/style.css">', 'html.parser')
    soup.head.append(css_tag)
    soup.body['class'] = 'mw-body'
    soup.body['style'] = 'height:auto'
    del soup.body.div['class']
    soup.body.div['id'] = 'bodyContent'
    h1_tag = bs(f'<h1 class="firstHeading" id="firstHeading">{title}</h1>', 'html.parser')
    soup.body.insert(0, h1_tag)

def getPages(url=url, folder=dir_docs):
    '''
    This is the main function of the program,
    which generates the HTML document from the given url
    and calls the different functions that generate the offline
    version of the page and save it under the directory /openscad_docs
    '''
    url = getUrl(url)
    if url.split("#")[0] not in pages:
        pages.append(url.split("#")[0])   # remember the url so that the page doesn't get downloaded again
        wiki_url = url
        url = url.replace(url_wiki + '/wiki/', "")
        url = url_api + url

        request = urllib.request.Request(url)
        request.add_header('User-Agent', user_agent_val)
        response = urllib.request.urlopen(request)
        xml = response.read()
        soup = bs(xml, 'lxml')
        soup = soup.text
        soup = bs(soup, 'html5lib')

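        # The soup is built twice above because the API response is XML whose
        # text payload is the rendered HTML of the page: the first parse pulls
        # out that text, the second parses it as HTML. (This reading of the
        # two-step parse is inferred from the code, not stated by the author.)
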
        name = url.split("=")[-1]
        name = name.split("/")[-1].split('#')[0]   # to convert OpenSCAD_User_Manual/String_Functions#str to String_Functions

        title = soup.new_tag("title")   # to add a title to the page
        title.string = name.replace("_", " ")
        soup.html.head.append(title)

        name = name + ".html"
        filepath = os.path.join(folder, name)

        print("Saving: ", filepath)

        getStyled(soup, title.string)
        cleanSoup(soup)
        getMaths(soup)
        getTags(soup)

        soup.body.append(getFooter(wiki_url, title.text))

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(str(soup))

def getCSS():
    '''
    This function runs once after the HTML files have been downloaded;
    it downloads the CSS given at https://www.mediawiki.org/wiki/API:Styling_content
    and saves it to openscad_docs/styles
    '''
    request = urllib.request.Request(url_css)
    request.add_header('User-Agent', user_agent_val)
    response = urllib.request.urlopen(request)
    css_soup = response.read()
    css = bs(css_soup, 'html5lib')   # the stylesheet text is extracted from the body of the fetched page
    csspath = os.path.join(dir_styles, 'style.css')
    with open(csspath, "w", encoding="utf-8") as f:
        f.write(css.body.text)

if __name__ == '__main__':
    print(f'Started Offline Generator.py\nNow downloading the User-Manual from {url}')
    getPages(url)
    getCSS()
    print("Total number of pages generated is \t:\t", len(pages) - len(pages_for_exclusion))
    print("Total number of images generated is \t:\t", len(imgs))
    print("Total number of math-images generated is:\t", len(maths))
    shutil.make_archive('docs', 'zip', dir_docs)
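
# A typical run (the script name here follows the startup message above and is
# otherwise an assumption), which also leaves a docs.zip archive behind:
#
#   python3 "Offline Generator.py"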