Source code for rst_to_myst.markdownit

"""Convert to markdown-it tokens, which can then be rendered by mdformat."""

from io import StringIO
from textwrap import indent
from typing import IO, Any, NamedTuple, Optional

from docutils import nodes
from markdown_it.token import Token
from mdit_py_plugins import __version__ as mdit_plug_version


[docs]class RenderOutput(NamedTuple):
    tokens: list[Token]
    env: dict[str, Any]


[docs]class MarkdownItRenderer(nodes.GenericNodeVisitor):
    """Render docutils AST to Markdown-It token stream."""

    def __init__(
        self,
        document: nodes.document,
        *,
        warning_stream: Optional[IO] = None,
        raise_on_warning: bool = False,
        cite_prefix: str = "cite_",
        default_role: Optional[str] = None,
        colon_fences: bool = True,
        dollar_math: bool = True,
    ):
        self._document = document
        self._warning_stream = warning_stream or StringIO()
        self.raise_on_warning = raise_on_warning
        # prefix added to citation labels
        self.cite_prefix = cite_prefix
        # if no default role, convert to literal
        self.default_role = default_role
        self.colon_fences = colon_fences
        self.dollar_math = dollar_math

        self.reset_state()

    def reset_state(self):
        # record current state, that can affect children tokens
        self._tokens: list[Token] = []
        self._env = {"references": {}, "duplicate_refs": []}
        self._inline: Optional[Token] = None
        self.parent_tokens: dict[str, int] = {}
        # [(key path, tokens), ...]
        self._front_matter_tokens: list[tuple[list[str], list[Token]]] = []
        self._tight_list = True

    @property
    def document(self) -> nodes.document:
        return self._document

    def warning(self, message: str, line: Optional[int]):
        if line is not None:
            self._warning_stream.write(f"RENDER WARNING:{line}: {message}\n")
        else:
            self._warning_stream.write(f"RENDER WARNING: {message}\n")

[docs]    def to_tokens(self) -> RenderOutput:
        """Reset tokens and convert full document."""
        self.reset_state()
        self._document.walkabout(self)

        # add front-matter that should be nested parsed
        if self._front_matter_tokens:
            fm_tokens = []
            fm_tokens.append(Token("front_matter_tokens_open", "", 1))
            for key_path, tokens in self._front_matter_tokens:
                fm_tokens.append(
                    Token("front_matter_key_open", "", 1, meta={"key_path": key_path})
                )
                fm_tokens.extend(tokens)
                fm_tokens.append(Token("front_matter_key_close", "", -1))
            fm_tokens.append(Token("front_matter_tokens_close", "", -1))
            self._tokens = fm_tokens + self._tokens

        return RenderOutput(self._tokens[:], self._env)

    def nested_parse(self, nodes: list[nodes.Element]) -> list[Token]:
        new_inst = MarkdownItRenderer(
            document=self._document,
            warning_stream=self._warning_stream,
            cite_prefix=self.cite_prefix,
            default_role=self.default_role,
            colon_fences=self.colon_fences,
            dollar_math=self.dollar_math,
        )
        for node in nodes:
            node.walkabout(new_inst)
        return new_inst._tokens

[docs]    def add_token(
        self, ttype: str, tag: str, nesting: int, *, content: str = "", **kwargs: Any
    ) -> Token:
        """A markdown-it token to the stream, handling inline tokens and children."""
        token = Token(ttype, tag, nesting, content=content, **kwargs)
        # record entries and exits
        if ttype.endswith("_open"):
            self.parent_tokens.setdefault(ttype[:-5], 0)
            self.parent_tokens[ttype[:-5]] += 1
        if ttype.endswith("_close"):
            self.parent_tokens.setdefault(ttype[:-6], 0)
            self.parent_tokens[ttype[:-6]] -= 1
            if self.parent_tokens[ttype[:-6]] <= 0:
                self.parent_tokens.pop(ttype[:-6])
        # decide whether we should be adding as an inline child
        if ttype in {"paragraph_open", "heading_open", "th_open", "td_open", "dt_open"}:
            self._tokens.append(token)
            self._inline = Token("inline", "", 0, children=[])
            self._tokens.append(self._inline)
        elif ttype in {
            "paragraph_close",
            "heading_close",
            "th_close",
            "td_close",
            "dt_close",
        }:
            self._tokens.append(token)
            self._inline = None
        elif self._inline:
            self._inline.children.append(token)
        else:
            self._tokens.append(token)
        return token

[docs]    def default_visit(self, node):
        self.unknown_visit(node)

[docs]    def default_departure(self, node):
        self.unknown_departure(node)

[docs]    def unknown_visit(self, node):
        message = f"no visit method for: {node.__class__}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)

[docs]    def unknown_departure(self, node):
        message = f"no depart method for: {node.__class__}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)

    # Skipped components

    def visit_document(self, node):
        pass

    def depart_document(self, node):
        pass

    def visit_Element(self, node):
        pass

    def depart_Element(self, node):
        pass

    def visit_system_message(self, node):
        # ignore
        raise nodes.SkipNode

    def visit_problematic(self, node):
        # ignore
        raise nodes.SkipNode

    # CommonMark components

    def visit_section(self, node):
        pass  # handled by title

    def depart_section(self, node):
        pass

    def visit_title(self, node):
        token = self.add_token("heading_open", f"h{node['level']}", 1)
        token.markup = "#" * node["level"]

    def depart_title(self, node):
        token = self.add_token("heading_close", f"h{node['level']}", -1)
        token.markup = "#" * node["level"]

    def visit_paragraph(self, node):
        if self.parent_tokens.get("th") or self.parent_tokens.get("td"):
            # table cells are treated as paragraphs already
            return
        token = self.add_token("paragraph_open", "p", 1)
        if self.parent_tokens.get("list_item") and self._tight_list:
            # paragraphs in tight lists are hidden
            token.hidden = True

    def depart_paragraph(self, node):
        if self.parent_tokens.get("th") or self.parent_tokens.get("td"):
            # table cells are treated as paragraphs already
            return
        self.add_token("paragraph_close", "p", -1)

    def visit_Text(self, node):
        self.add_token("text", "", 0, content=node.astext())
        raise nodes.SkipNode

    def visit_UnprocessedText(self, node):
        self.add_token("unprocessed", "", 0, content=node.astext())
        raise nodes.SkipNode

    def visit_emphasis(self, node):
        self.add_token("em_open", "em", 1, markup="*")

    def depart_emphasis(self, node):
        self.add_token("em_close", "em", -1, markup="*")

    def visit_strong(self, node):
        self.add_token("strong_open", "strong", 1, markup="**")

    def depart_strong(self, node):
        self.add_token("strong_close", "strong", -1, markup="**")

    def visit_transition(self, node):
        self.add_token("hr", "hr", 0, markup="---")
        raise nodes.SkipNode

    def visit_bullet_list(self, node):
        self.add_token("bullet_list_open", "ul", 1, markup=node["bullet"])

    def depart_bullet_list(self, node):
        self.add_token("bullet_list_close", "ul", -1, markup=node["bullet"])

    def visit_enumerated_list(self, node):
        token = self.add_token("ordered_list_open", "ol", 1, markup=".")
        if "start" in node:
            token.attrs["start"] = node["start"]

    def depart_enumerated_list(self, node):
        self.add_token("ordered_list_close", "ol", -1, markup=".")

    def visit_list_item(self, node):
        token = self.add_token("list_item_open", "li", 1)
        if "style" in node:
            if node["style"] == "bullet":
                token.markup = node["prefix"].strip()
            elif node["style"] == "enumerated":
                token.markup = "."
        # a list is loose if any of its list items directly contain
        # two block-level elements, otherwise tight. In this case paragraphs are hidden
        self._tight_list = len(node.children) < 2

    def depart_list_item(self, node):
        self.add_token("list_item_close", "li", -1)

    def visit_literal(self, node):
        self.add_token("code_inline", "code", 0, markup="`", content=node.astext())
        raise nodes.SkipNode

    def visit_literal_block(self, node):
        text = node.astext()
        if not text.endswith("\n"):
            text += "\n"
        self.add_token("code_block", "code", 0, content=text)
        raise nodes.SkipNode

    def visit_doctest_block(self, node):
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#doctest-blocks
        # https://pygments.org/docs/lexers/#pygments.lexers.python.PythonConsoleLexer
        self.warning("Treating doctest block as pycon literal block", node.line)
        text = node.astext()
        if not text.endswith("\n"):
            text += "\n"
        self.add_token(
            "fence",
            "code",
            0,
            content=text,
            markup="```",
            info="pycon",
        )
        raise nodes.SkipNode

    def visit_block_quote(self, node):
        self.add_token("blockquote_open", "blockquote", 1, markup=">")

    def depart_block_quote(self, node):
        self.add_token("blockquote_close", "blockquote", -1, markup=">")

    def visit_attribution(self, node):
        # Markdown block quotes do not have an attribution syntax,
        # so we add a best approximation
        token = self.add_token("html_inline", "", 0)
        token.content = f'<p class="attribution">-{node.astext()}</p>'
        raise nodes.SkipNode

    def visit_reference(self, node):
        # we assume all reference names are plain text
        text = node.astext()

        if "standalone_uri" in node:
            # autolink
            token = self.add_token("link_open", "a", 1, markup="autolink", info="auto")
            token.attrs["href"] = node["refuri"]
            self.add_token("text", "", 0, content=node["refuri"])
            self.add_token("link_close", "a", -1, markup="autolink", info="auto")
        elif "refname" in node:
            # reference a link definition `[refname]: url`, or a target `(refname)=`
            # TODO ensure mdformat does not wrap in <>
            token = self.add_token(
                "link_open",
                "a",
                1,
                attrs={"href": node["refname"]},
                # TODO should only add label if target found?
                meta={"label": node["refname"]},
            )
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        elif "refuri" in node:
            # external link
            # TODO ensure prefixed with http://?
            token = self.add_token("link_open", "a", 1, attrs={"href": node["refuri"]})
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        elif "refid" in node:
            # anonymous links, pointing to internal targets
            # TODO ensure mdformat does not wrap in <>
            token = self.add_token(
                "link_open",
                "a",
                1,
                attrs={"href": node["refid"]},
            )
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        else:
            message = f"unknown reference type: {node.rawsource}"
            self.warning(message, node.line)
            if self.raise_on_warning:
                raise NotImplementedError(message)

        raise nodes.SkipNode

    def visit_target(self, node):
        if node.get("inline"):
            # TODO inline targets
            message = f"inline targets not implemented: {node.rawsource}"
            self.warning(message, node.line)
            if self.raise_on_warning:
                raise NotImplementedError(message)
            self.add_token(
                "code_inline", "code", 0, markup="`", content=str(node.rawsource)
            )
            raise nodes.SkipNode

        if "refuri" in node:
            for name in node["names"]:
                # TODO warn about name starting ^ (clashes with footnotes)
                if name not in self._env["references"]:
                    self._env["references"][name] = {
                        "title": "",
                        "href": node["refuri"],
                        "map": [node.line, node.line],
                    }
                else:
                    self._env["duplicate_refs"].append(
                        {
                            "label": name,
                            "title": "",
                            "href": node["refuri"],
                            "map": [node.line, node.line],
                        }
                    )
        elif "names" in node:
            for name in node["names"]:
                self.add_token(
                    "myst_target", "", 0, attrs={"class": "myst-target"}, content=name
                )
        if "refid" in node:
            self.add_token(
                "myst_target",
                "",
                0,
                attrs={"class": "myst-target"},
                content=node["refid"],
            )

        # TODO check for content?
        raise nodes.SkipNode

    # Standard CommonMark extensions

[docs]    def parse_gfm_table(self, node) -> bool:
        """Check whether an RST table can be converted to a GFM one.

        RST tables can have e.g. cells spanning multiple columns/rows,
        which the GitHub Flavoured Markdown (GFM) table variant does not support.
        """
        # must have one child tgroup
        if len(node.children) != 1 or not isinstance(node.children[0], nodes.tgroup):
            return False
        # tgroup should contain the number of columns
        tgroup = node.children[0]
        if "cols" not in tgroup:
            return False
        ncolumns = tgroup["cols"]
        # trgoup should contain children: (colspec)*, thead, tbody
        if len(tgroup.children) < 2:
            return False
        if not isinstance(tgroup.children[-2], nodes.thead):
            return False
        if not isinstance(tgroup.children[-1], nodes.tbody):
            return False
        thead = tgroup.children[-2]
        tbody = tgroup.children[-1]
        # the header can only have one row with the full amount of columns
        if len(thead.children) != 1 or len(thead.children[0]) != ncolumns:
            return False
        # each body row should have the full amount of columns
        return all(len(row.children) == ncolumns for row in tbody.children)

    def visit_table(self, node):
        if not self.parse_gfm_table(node):
            text = node.rawsource
            if not text.endswith("\n"):
                text += "\n"
            self.add_token(
                "fence", "code", 0, content=text, markup="```", info="{eval-rst}"
            )
            raise nodes.SkipNode

        self.add_token("table_open", "table", 1)

    def depart_table(self, node):
        self.add_token("table_close", "table", -1)

    def visit_tgroup(self, node):
        pass

    def depart_tgroup(self, node):
        pass

    def visit_colspec(self, node):
        raise nodes.SkipNode

    def visit_thead(self, node):
        self.add_token("thead_open", "thead", 1)

    def depart_thead(self, node):
        self.add_token("thead_close", "thead", -1)

    def visit_tbody(self, node):
        self.add_token("tbody_open", "tbody", 1)

    def depart_tbody(self, node):
        self.add_token("tbody_close", "tbody", -1)

    def visit_row(self, node):
        self.add_token("tr_open", "tr", 1)

    def depart_row(self, node):
        self.add_token("tr_close", "tr", -1)

    def visit_entry(self, node):
        tag = "th" if self.parent_tokens.get("thead") else "td"
        self.add_token(f"{tag}_open", tag, 1)

    def depart_entry(self, node):
        tag = "th" if self.parent_tokens.get("thead") else "td"
        # Markdown cells can not include newlines
        # TODO improve or upstream this "fix"
        # maybe replace with html_inline <br> tokens (text will be escaped)
        if self._inline:
            for child in self._inline.children:
                child.content = child.content.replace("\n", " ")
        self.add_token(f"{tag}_close", tag, -1)

    # TODO check if handling of is/subId required for footnotes

    def visit_footnote(self, node, refname=None):
        refname = refname or node["ids"][0]  # assume there is only one id
        self.add_token("footnote_block_open", "", 1)
        self.add_token("footnote_open", "", 1, meta={"label": refname, "id": 0})

    def depart_footnote(self, node):
        self.add_token("footnote_close", "", -1)
        self.add_token("footnote_block_close", "", -1)

    def visit_citation(self, node):
        # treated same as for visit_footnote, but with specific prefix
        # TODO fails if duplicate refname, since names is empty
        refname = node["names"][0]  # assume there is only one name
        refname = f"{self.cite_prefix}{refname}"
        return self.visit_footnote(node, refname=refname)

    def depart_citation(self, node):
        # treated same as for depart_footnote
        return self.depart_footnote(node)

    def visit_footnote_reference(self, node):
        if "refname" in node:
            refname = node["refname"]
        elif "refid" in node:
            refname = node["refid"]
        else:
            message = f"unknown footnote reference type: {node.rawsource}"
            self.warning(message, node.line)
            if self.raise_on_warning:
                raise NotImplementedError(message)

        self.add_token(
            "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0}
        )

        raise nodes.SkipNode

    def visit_citation_reference(self, node):
        refname = node["refname"] if "refname" in node else node["refid"]
        # for compatibility we treat citations the same as footnotes, with a prefix
        refname = f"{self.cite_prefix}{refname}"
        self.add_token(
            "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0}
        )
        # the node also contains the refname as text, but we don't need that
        raise nodes.SkipNode

    def visit_definition_list(self, node):
        self.add_token("dl_open", "dl", 1)

    def depart_definition_list(self, node):
        self.add_token("dl_close", "dl", -1)

    def visit_definition_list_item(self, node):
        pass

    def depart_definition_list_item(self, node):
        pass

    def visit_term(self, node):
        self.add_token("dt_open", "dt", 1)

    def depart_term(self, node):
        self.add_token("dt_close", "dt", -1)

    def visit_classifier(self, node):
        # classifiers can follow a term, e.g. `term : classifier`
        # TODO record term classifiers?
        raise nodes.SkipNode

    def visit_definition(self, node):
        self.add_token("dd_open", "dd", 1)

    def depart_definition(self, node):
        self.add_token("dd_close", "dd", -1)

    def visit_FrontMatterNode(self, node):
        for field in node:
            if len(field) != 2:
                continue
            key = field[0][0].astext()
            tokens = self.nested_parse(field[1].children)
            self._front_matter_tokens.append(([key], tokens))

        raise nodes.SkipNode

    def visit_field_list(self, node):
        if node.rawsource:
            text = "\n" + node.rawsource.strip() + "\n"
            self.add_token("fence", "code", 0, content=text, info="{eval-rst}")
        raise nodes.SkipNode

    # MyST Markdown specific

    def visit_RoleNode(self, node):
        # TODO nested parse of specific roles
        role = node["role"] or self.default_role
        if role:
            if self.dollar_math and role == "math":
                self.add_token(
                    "math_inline", "math", 0, markup="$", content=node["text"].strip()
                )
            else:
                self.add_token(
                    "myst_role", "", 0, meta={"name": role}, content=node["text"]
                )
        else:
            self.add_token("code_inline", "code", 0, markup="`", content=node["text"])
        raise nodes.SkipNode

    def visit_comment(self, node):
        # TODO alternately use <!-- -->
        self.add_token(
            "myst_line_comment",
            "hr",
            0,
            attrs={"class": "myst-line-comment"},
            content=indent(node.astext(), " "),
        )
        raise nodes.SkipNode

    def visit_substitution_reference(self, node):
        self.add_token("substitution_inline", "span", 0, content=node["refname"])
        # the node also contains the refname as text, but we don't need that
        raise nodes.SkipNode

    def visit_substitution_definition(self, node):
        if "names" not in node or not node["names"]:
            raise nodes.SkipNode
        key = node["names"][0]
        # substitution definition should always be a single directive node
        tokens = self.nested_parse(node.children)
        self._front_matter_tokens.append((["substitutions", key], tokens))
        raise nodes.SkipNode

    def visit_EvalRstNode(self, node):
        text = node.astext()
        if not text.endswith("\n"):
            text += "\n"
        self.add_token("fence", "code", 0, content=text, info="{eval-rst}")
        raise nodes.SkipNode

    def visit_DirectiveNode(self, node):
        markup = "`"
        if self.colon_fences and node["conversion"] in (
            "parse_content",
            "parse_content_titles",
            "parse_all",
        ):
            markup = ":"
        if (
            (
                node["name"] == "code-block"
                or node["module"] == "sphinx.directives.patches.Code"
            )
            and not node["options_list"]
            and len(node.children) == 2
        ):
            # special case, where we can use standard Markdown fences
            argument, content = node.children
            self.add_token(
                "fence",
                "code",
                0,
                content=content.astext() + "\n",
                markup="```",
                info=argument.astext().strip(),
            )
            raise nodes.SkipNode
        elif (
            (
                node["name"] == "math"
                or node["module"] == "docutils.parsers.rst.directives.body.MathBlock"
            )
            and self.dollar_math
            and (
                not node["options_list"]
                or (
                    len(node["options_list"]) == 1
                    and node["options_list"][0][0] == "label"
                )
            )
            and len(node.children) == 2
        ):
            # special case where we use dollarmath
            argument, content = node.children
            text = ""
            if argument.astext().strip():
                text += "\n" + argument.astext().strip() + "\n"
            if content.astext().strip():
                text += "\n" + content.astext().strip() + "\n"
            if node["options_list"]:
                label = node["options_list"][0][1]
                major, minor, patch = (int(i) for i in mdit_plug_version.split("."))
                name = "math_block_label"
                if major == 0 and minor < 3:
                    name = "math_block_eqno"
                self.add_token(name, "math", 0, markup="$$", content=text, info=label)
            else:
                self.add_token("math_block", "math", 0, markup="$$", content=text)
            raise nodes.SkipNode
        else:
            self.add_token(
                "directive_open",
                "",
                1,
                meta={
                    key: node[key]
                    for key in ["name", "module", "conversion", "options_list"]
                },
                markup=markup,
            )

    def depart_DirectiveNode(self, node):
        self.add_token("directive_close", "", -1)

    def visit_ArgumentNode(self, node):
        # TODO might be a better construct to have this as children of inline
        self.add_token("directive_arg_open", "", 1)

    def depart_ArgumentNode(self, node):
        self.add_token("directive_arg_close", "", -1)

    def visit_ContentNode(self, node):
        self.add_token("directive_content_open", "", 1)

    def depart_ContentNode(self, node):
        self.add_token("directive_content_close", "", -1)

    # TODO https://docutils.sourceforge.io/docs/user/rst/quickref.htm
    # line block, option list