diff --git a/README.rst b/README.rst
index d9ed40a..78eb63a 100644
--- a/README.rst
+++ b/README.rst
@@ -1,22 +1,5 @@
-=============================
-sphinxcontrib-serializinghtml
-=============================
+This is a fork of https://github.com/sphinx-doc/sphinxcontrib-serializinghtml
 
-sphinxcontrib-serializinghtml is a sphinx extension which outputs
-"serialized" HTML files (json and pickle).
+Changes made to this fork are to facilitate the creation of JSON files suitable for consumption by React.
 
-For more details, please visit http://www.sphinx-doc.org/.
-
-Installing
-==========
-
-Install from PyPI::
-
-   pip install -U sphinxcontrib-serializinghtml
-
-Contributing
-============
-
-See `CONTRIBUTING.rst`__
-
-.. __: https://github.com/sphinx-doc/sphinx/blob/master/CONTRIBUTING.rst
+Since those changes are very specific, they have not been contributed back to the original repo.
diff --git a/pyproject.toml b/pyproject.toml
index f14054e..a8a4329 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
     "Topic :: Text Processing",
     "Topic :: Utilities",
 ]
-dependencies = []
+dependencies = ["beautifulsoup4"]
 dynamic = ["version"]
 
 [project.optional-dependencies]
diff --git a/sphinxcontrib/serializinghtml/__init__.py b/sphinxcontrib/serializinghtml/__init__.py
index bdbeb6f..c44e1f7 100644
--- a/sphinxcontrib/serializinghtml/__init__.py
+++ b/sphinxcontrib/serializinghtml/__init__.py
@@ -11,7 +11,7 @@
 from sphinx.locale import get_translation
 from sphinx.util.osutil import SEP, copyfile, ensuredir, os_path
 
-from sphinxcontrib.serializinghtml import jsonimpl
+from sphinxcontrib.serializinghtml import html_assists, jsonimpl
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -23,7 +23,7 @@ def dumps(self, obj: Any, *args: Any, **kwargs: Any) -> str | bytes: ...
     def load(self, file: Any, *args: Any, **kwargs: Any) -> Any: ...
     def loads(self, data: Any, *args: Any, **kwargs: Any) -> Any: ...
 
-__version__ = '2.0.0'
+__version__ = '2.0.0+Linaro-241028'
 __version_info__ = (2, 0, 0)
 
 package_dir = path.abspath(path.dirname(__file__))
@@ -55,7 +55,24 @@ class SerializingHTMLBuilder(StandaloneHTMLBuilder):
 
     def init(self) -> None:
         self.build_info = BuildInfo(self.config, self.tags)
-        self.imagedir = '_images'
+        # Cope with whether or not Sphinx has the required configuration variables
+        # set.
+        # See HTML Builder comments for explanation of image setup & handling
+        html_image_dir = None
+        try:
+            html_image_dir = self.get_builder_config('image_dir', 'html')
+        except AttributeError:
+            pass
+        if html_image_dir is not None:
+            self.imagedir = html_image_dir
+        else:
+            self.imagedir = '_images'
+        html_image_path = None
+        try:
+            html_image_path = self.get_builder_config('image_path', 'html')
+        except AttributeError:
+            pass
+        self.imagepath = html_image_path
         self.current_docname = ''
         self.theme = None  # type: ignore[assignment]  # no theme necessary
         self.templates = None  # no template bridge necessary
@@ -64,13 +81,22 @@ def init(self) -> None:
         self.init_css_files()
         self.init_js_files()
         self.use_index = self.get_builder_config('use_index', 'html')
+        #
+        # PJC: New configuration to allow mapping of external links to
+        # relative Hub links.
+        link_mappings = None
+        try:
+            link_mappings = self.get_builder_config('link_mappings', 'html')
+        except AttributeError:
+            pass
+        self.link_mappings = link_mappings
 
     def get_target_uri(self, docname: str, typ: str | None = None) -> str:
         if docname == 'index':
-            return ''
+            return ""
         if docname.endswith(SEP + 'index'):
             return docname[:-5]  # up to sep
-        return docname + SEP
+        return docname
 
     def dump_context(self, context: dict[str, Any], filename: str | os.PathLike[str]) -> None:
         context = context.copy()
@@ -91,9 +117,24 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
         ctx.setdefault('pathto', lambda p: p)
         self.add_sidebars(pagename, ctx)
 
+        # Add the toc tree as a JSON dictionary
+        ctx['toctree'] = html_assists.convert_nav_html_to_json(self._get_local_toctree(pagename))
+
         if not outfilename:
+            # PJC: Ensure that index files are actually written under the name of the
+            # directory leafname.
+            parts = pagename.split(SEP)
+            if parts[len(parts)-1] == "index":
+                if len(parts) == 1:
+                    # Use the project name
+                    page_filename = self.get_builder_config('project_name', 'html')
+                else:
+                    page_filename = SEP.join(parts[:-1])
+                ctx['current_page_name'] = page_filename
+            else:
+                page_filename = pagename
             outfilename = path.join(self.outdir,
-                                    os_path(pagename) + self.out_suffix)
+                                    os_path(page_filename) + self.out_suffix)
 
         # we're not taking the return value here, since no template is
         # actually rendered
@@ -104,6 +145,20 @@ def handle_page(self, pagename: str, ctx: dict[str, Any], templatename: str = 'p
             if isinstance(ctx[key], types.FunctionType):
                 del ctx[key]
 
+        if "body" in ctx:
+            # PJC: Some Linaro documentation has encoded attributes in image ALT text
+            # which then gets decoded when the HTML is loaded into the DOM, so
+            # we need to alter it by "escaping" the ampersands with &amp; to
+            # prevent the decoding.
+            ctx['body'] = html_assists.escape_encoded_alt_text(ctx['body'])
+            # PJC: Furthermore, if there is any formatted code with encoded attributes,
+            # e.g. < changed to &lt; then that also needs to be escaped because it is
+            # also getting decoded.
+            ctx['body'] = html_assists.escape_encoded_pre_text(ctx['body'])
+            # PJC: Go through the body, looking for any <a> tags to see if they
+            # need to be re-mapped to a local Hub path.
+            ctx['body'] = html_assists.rewrite_hub_links(ctx['body'], self.link_mappings)
+
         ensuredir(path.dirname(outfilename))
         self.dump_context(ctx, outfilename)
 
@@ -161,7 +216,7 @@ class JSONHTMLBuilder(SerializingHTMLBuilder):
     implementation_dumps_unicode = True
     indexer_format = jsonimpl
     indexer_dumps_unicode = True
-    out_suffix = '.fjson'
+    out_suffix = '.json'
     globalcontext_filename = 'globalcontext.json'
     searchindex_filename = 'searchindex.json'
 
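Note: get_builder_config(name, 'html') looks the option up as html_<name> on the
Sphinx config object and raises AttributeError when no such value has been
registered, which is why every lookup above is wrapped in try/except. Below is a
minimal sketch of what a consuming project's conf.py might contain; the option
names follow from the calls above, but all of the values (and whichever
extension registers these options with add_config_value) are hypothetical:

    # conf.py (illustrative values only)
    html_image_dir = 'static/images'    # stored on the builder as imagedir
    html_image_path = 'images'          # stored on the builder as imagepath
    html_project_name = 'example-hub'   # output filename for the root index page
    html_link_mappings = {
        # URL prefix to match: relative Hub path to substitute
        'https://docs.example.com/': 'docs/',
    }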
diff --git a/sphinxcontrib/serializinghtml/html_assists.py b/sphinxcontrib/serializinghtml/html_assists.py
new file mode 100644
index 0000000..1a1940e
--- /dev/null
+++ b/sphinxcontrib/serializinghtml/html_assists.py
@@ -0,0 +1,135 @@
+from bs4 import BeautifulSoup, element
+from html import escape
+
+def clean_href(href: str) -> str:
+    """ Make sure the href doesn't start or end with a / """
+    if href[0] == "/":
+        href = href[1:]
+    if href[-1] == "/":
+        href = href[:-1]
+    return href
+
+def section_links(parent_entry: element.Tag, list_entry: element.Tag) -> dict:
+    section_result = []
+    for child in list_entry.children:
+        if type(child) is element.Tag and child.name == "li":
+            section_result.append(convert_tag_to_link(child))
+    return {
+        "type": "expandable-link-group",
+        "text": parent_entry.contents[0].contents[0],
+        "href": clean_href(parent_entry.contents[0]["href"]),
+        "items": section_result
+    }
+
+def convert_tag_to_link(item_entry: element.Tag) -> dict:
+    # The a tag is a child of the li tag
+    a_tag = item_entry.contents[0]
+    return {
+        "type": "link",
+        "text": a_tag.contents[0],
+        "href": clean_href(a_tag["href"])
+    }
+
+def process_section(result, child, section, pending_divider) -> bool:
+    if section != []:
+        # Yes, there is, so we have a sub-section. If we've got some content
+        # already, add a divider.
+        if result != []:
+            result.append({ "type": "divider" })
+        # Now append the current page and the section links. The
+        # ul tag is the only child returned, hence [0]
+        result.append(section_links(child, section[0]))
+        # If there are any "normal" entries after this section
+        # add a divider first
+        pending_divider = True
+    else:
+        if pending_divider:
+            result.append({ "type": "divider" })
+            pending_divider = False
+        result.append(convert_tag_to_link(child))
+    return pending_divider
+
+def process_ul_children(result, ul):
+    pending_divider = False
+    for child in ul.children:
+        if type(child) is element.Tag and child.name == "li":
+            # Is there a new unordered list within this section?
+            section = child.find_all("ul", limit=1)
+            pending_divider = process_section(result, child, section, pending_divider)
+
+def convert_nav_html_to_json(html: str) -> list:
+    result = []
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Start with the unordered list
+    ul = soup.ul
+    # Iterate through list items
+    while ul is not None:
+        process_ul_children(result, ul)
+        while True:
+            ul = ul.next_sibling
+            if ul is None or type(ul) is element.Tag:
+                break
+            # Not an acceptable type - loop and get the next sibling
+    return result
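+
+# A hypothetical example of the conversion above: the input HTML is invented,
+# but the dictionary shapes come from the helper functions in this module.
+#
+#   convert_nav_html_to_json('<ul><li><a href="/intro.html">Intro</a></li></ul>')
+#   -> [{"type": "link", "text": "Intro", "href": "intro.html"}]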
+
+def escape_encoded_alt_text(html: str) -> str:
+    edited = False
+    soup = BeautifulSoup(html, "html.parser")
+    images = soup.find_all('img')
+    for img in images:
+        if img['alt'] != "":
+            # At this point, Beautiful Soup has done what a browser does - decode
+            # any encoded attributes. So we need to re-encode the string, see if
+            # there are any ampersands and, if so, re-encode them again.
+            interim = escape(img['alt'])
+            if interim.find("&") != -1:
+                img['alt'] = escape(interim)
+                edited = True
+
+    if edited:
+        html = str(soup)
+    return html
+
+def escape_encoded_pre_text(html: str) -> str:
+    edited = False
+    soup = BeautifulSoup(html, "html.parser")
+    spans = soup.find_all('span', class_="pre")
+    for span in spans:
+        # At this point, Beautiful Soup has done what a browser does - decode
+        # any encoded attributes. So we need to re-encode the string, see if
+        # there are any ampersands and, if so, re-encode them again.
+        interim = escape(span.string)
+        if interim.find("&") != -1:
+            span.string = escape(interim)
+            edited = True
+
+    if edited:
+        html = str(soup)
+    return html
+
+def rewrite_hub_links(html: str, link_mappings: dict) -> str:
+    # If no mappings have been configured, there is nothing to rewrite
+    if not link_mappings:
+        return html
+    edited = False
+    soup = BeautifulSoup(html, "html.parser")
+    links = soup.find_all('a')
+    for link in links:
+        for key in link_mappings:
+            if link['href'].startswith(key):
+                # We have a match, so replace the href with the new one
+                link['href'] = link['href'].replace(key, link_mappings[key])
+                # We also have to remove ".html" from the end of the link
+                link['href'] = link['href'].replace(".html", "")
+                edited = True
+
+    if edited:
+        html = str(soup)
+    return html
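
A hypothetical illustration of the rewriting that rewrite_hub_links() performs
(the mapping and the anchor are invented; the behaviour follows the code above):

    from sphinxcontrib.serializinghtml import html_assists

    body = '<a href="https://docs.example.com/setup.html">Setup</a>'
    mappings = {'https://docs.example.com/': 'docs/'}
    print(html_assists.rewrite_hub_links(body, mappings))
    # prints: <a href="docs/setup">Setup</a>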