dungeon-sheets/dungeonsheets/epub.py

from typing import Mapping
from html.parser import HTMLParser
import re

from ebooklib import epub, ITEM_STYLE
from docutils import core
from sphinx.util.docstrings import prepare_docstring
from docutils.writers.html5_polyglot import Writer as HTMLWriter

from dungeonsheets.forms import dice_re, jinja_environment


def create_epub(
    chapters: Mapping, title: str, basename: str, use_dnd_decorations: bool = False
):
    """Prepare an EPUB file from the list of chapters.

    Parameters
    ==========
    chapters
      A mapping where the keys are chapter names (spines) and the
      values are strings of HTML to be rendered as the chapter
      contents.
    basename
      The basename for saving files (PDFs, etc). The resulting epub
      file will be "{basename}.epub".
    use_dnd_decorations
      If true, style sheets will be included to produce D&D stylized
      stat blocks, etc.

    """
    # Create a new epub book
    book = epub.EpubBook()
    book.set_identifier("id123456")
    book.set_title(title)
    book.set_language("en")
    # Add the css files
    css_template = jinja_env.get_template("dungeonsheets_epub.css")
    style = css_template.render(use_dnd_decorations=use_dnd_decorations)
    css = epub.EpubItem(
        uid="style_default",
        file_name="style/gm_sheet.css",
        media_type="text/css",
        content=style,
    )
    book.add_item(css)
    toc = ["nav"]
    # Create the separate chapters
    html_chapters = []
    for chap_title, content in chapters.items():
        chap_fname = chap_title.replace(" - ", "-").replace(" ", "_").lower()
        chap_fname = "{}.html".format(chap_fname)
        chapter = epub.EpubHtml(
            title=chap_title,
            file_name=chap_fname,
            lang="en",
            media_type="application/xhtml+xml",
        )
        chapter.set_content(content)
        chapter.add_item(css)
        book.add_item(chapter)
        html_chapters.append(chapter)
        # Add entries for the table of contents
        toc.append(
            toc_from_headings(
                html=content, filename=chap_fname, chapter_title=chap_title
            )
        )
    # Add the table of contents
    book.toc = toc
    book.spine = ("nav", *html_chapters)
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # Save the file
    epub_fname = f"{basename}.epub"
    epub.write_epub(epub_fname, book)


class HeadingParser(HTMLParser):
    tag_re = re.compile("h(\d+)")
    _curr_level = None
    _curr_id = None
    _curr_title = None

    def __init__(self, *args, **kwargs):
        self.headings = []
        super().__init__(*args, **kwargs)

    def heading_level(self, tag):
        match = self.tag_re.match(tag)
        if match:
            return int(match.group(1))
        else:
            return None

    def handle_starttag(self, tag, attrs):
        this_level = self.heading_level(tag)
        if this_level is not None:
            # Found a heading, so process the properties
            self._curr_level = this_level
            attrs = {k: v for k, v in attrs}
            self._curr_id = attrs.get("id")

    def handle_endtag(self, tag):
        this_level = self.heading_level(tag)
        if this_level is not None and this_level == self._curr_level:
            heading = {
                "level": this_level,
                "id": self._curr_id,
                "title": self._curr_title,
            }
            self.headings.append(heading)

    def handle_data(self, data):
        # Save the title
        if self._curr_level is not None:
            self._curr_title = data


def toc_from_headings(
    html: str, filename: str = "", chapter_title: str = "Sheet"
) -> list:
    """Accept a chapter of HTML, and extract a table of contents segment.

    Parameters
    ----------
    html
      The HTML block to be parsed.
    filename
      The name of this file to be used for hrefs. E.g.
      "index.html#heading_1".

    Returns
    -------
    toc
      A sequence of table-of-contents links.

    """
    # [(<ebooklib.epub.Section at 0x7fdf903595d0>,
    #   [(<ebooklib.epub.Section at 0x7fdf90359310>,
    #     [<ebooklib.epub.Link at 0x7fdf90359bd0>,
    #      <ebooklib.epub.Link at 0x7fdf90359c50>])])]
    # Parse the HTML
    parser = HeadingParser()
    parser.feed(html)
    headings = parser.headings
    # Parse into a table of contents
    if len(headings) == 0:
        # No headings found, so just the chapter link
        toc = epub.Link(href=filename, title=chapter_title, uid=filename)
    else:
        # Add a section for the chapter as a whole
        toc = (epub.Section(href=filename, title=chapter_title), [])
        sections_stack = [toc]
        # Parse all the headings
        for idx, heading in enumerate(headings):
            # Determine where we are in the tree
            href = f"{filename}#{heading['id']}"
            parent_section = sections_stack[-1]
            is_last = idx == (len(headings) - 1)
            is_leaf = is_last or heading["level"] >= headings[idx + 1]["level"]
            # Add a leaf or branch depending on the heading structure
            if is_leaf:
                parent_section[1].append(
                    epub.Link(href=href, title=heading["title"], uid=href)
                )
            else:
                new_section = (epub.Section(href=href, title=heading["title"]), [])
                parent_section[1].append(new_section)
                sections_stack.append(new_section)
            # Walk back up the stack
            if not is_last:
                for idx in range(max(0, heading["level"] - headings[idx + 1]["level"])):
                    sections_stack.pop()

    return toc


def html_parts(
    input_string,
    source_path=None,
    destination_path=None,
    input_encoding="unicode",
    doctitle=True,
    initial_header_level=1,
):
    """
    Given an input string, returns a dictionary of HTML document parts.

    Dictionary keys are the names of parts, and values are Unicode strings;
    encoding is up to the client.

    Parameters:

    - `input_string`: A multi-line text string; required.
    - `source_path`: Path to the source file or object.  Optional, but useful
      for diagnostic output (system messages).
    - `destination_path`: Path to the file or object which will receive the
      output; optional.  Used for determining relative paths (stylesheets,
      source links, etc.).
    - `input_encoding`: The encoding of `input_string`.  If it is an encoded
      8-bit string, provide the correct encoding.  If it is a Unicode string,
      use "unicode", the default.
    - `doctitle`: Disable the promotion of a lone top-level section title to
      document title (and subsequent section title to document subtitle
      promotion); enabled by default.
    - `initial_header_level`: The initial level for header elements (e.g. 1
      for "<h1>").
    """
    # Remove indentation, etc
    input_string = "\n".join(prepare_docstring(input_string))
    # Parse from rst to TeX
    overrides = {
        "input_encoding": input_encoding,
        "doctitle_xform": doctitle,
        "initial_header_level": initial_header_level,
    }
    writer = HTMLWriter()
    parts = core.publish_parts(
        source=input_string,
        source_path=source_path,
        destination_path=destination_path,
        writer=writer,
        settings_overrides=overrides,
    )
    return parts


def rst_to_html(rst, top_heading_level=0):
    """Basic markup of reST to HTML code.

    The translation between reST headings and LaTeX headings is
    modified by the *top_heading_level* parameter. A value of 0
    (default) translates "# Heading" -> "<h1>{Heading}</h1>". A value
    of 1 translates "# Heading" -> "<h2>{Heading}</h2>", etc.

    Note: heading translation is currently broken.

    Parameters
    ==========
    rst
      reStructured text input to be parsed.
    top_heading_level : optional
      The highest level heading that will be added to the HTML as
      described above.

    Returns
    =======
    html : str
      The reST text parsed into HTML markup.

    """
    if rst is None:
        # No reST, so return an empty string
        html = ""
    else:
        # Mark hit dice in monospace font
        rst = dice_re.sub(r"``\1``", rst)
        _html_parts = html_parts(rst)
        html = _html_parts["body"]
    return html


def to_heading_id(inpt: str) -> str:
    """Take a string and make it suitable for use as an HTML header id."""
    return inpt.replace(" ", "-")


# Prepare the jinja environment
jinja_env = jinja_environment()
jinja_env.filters["rst_to_html"] = rst_to_html
jinja_env.filters["to_heading_id"] = to_heading_id