"""Convert to markdown-it tokens, which can then be rendered by mdformat."""
from io import StringIO
from textwrap import indent
from typing import IO, Any, NamedTuple, Optional
from docutils import nodes
from markdown_it.token import Token
from mdit_py_plugins import __version__ as mdit_plug_version
class RenderOutput(NamedTuple):
    """Result of a render pass: the token stream and the environment."""

    # markdown-it tokens produced for the document
    tokens: list[Token]
    # environment mapping collected while rendering (e.g. link references)
    env: dict[str, Any]
[docs]class MarkdownItRenderer(nodes.GenericNodeVisitor):
"""Render docutils AST to Markdown-It token stream."""
def __init__(
    self,
    document: nodes.document,
    *,
    warning_stream: Optional[IO] = None,
    raise_on_warning: bool = False,
    cite_prefix: str = "cite_",
    default_role: Optional[str] = None,
    colon_fences: bool = True,
    dollar_math: bool = True,
):
    """Store the rendering options and initialise the render state.

    :param document: the docutils document to render
    :param warning_stream: stream that render warnings are written to;
        an in-memory ``StringIO`` is used when none is given
    :param raise_on_warning: raise ``NotImplementedError`` after writing
        a warning, instead of only recording it
    :param cite_prefix: prefix added to citation labels
    :param default_role: role name used for roles without a name;
        if no default role is set, such roles are converted to literals
    :param colon_fences: use ``:`` as markup for directives whose
        content is parsed
    :param dollar_math: emit dollar-math tokens for the math role/directive
    """
    self._document = document
    self._warning_stream = warning_stream if warning_stream else StringIO()
    self.raise_on_warning = raise_on_warning

    # conversion options
    self.cite_prefix = cite_prefix
    self.default_role = default_role
    self.colon_fences = colon_fences
    self.dollar_math = dollar_math

    self.reset_state()
def reset_state(self):
    """Reset all per-render state to its initial, empty value."""
    # output collected so far
    self._tokens: list[Token] = []
    self._env = {"references": {}, "duplicate_refs": []}
    # front-matter entries, stored as [(key path, tokens), ...]
    self._front_matter_tokens: list[tuple[list[str], list[Token]]] = []

    # current state, which can affect how child tokens are added
    self._inline: Optional[Token] = None
    self.parent_tokens: dict[str, int] = {}
    self._tight_list = True
@property
def document(self) -> nodes.document:
    """The docutils document passed to the constructor (read-only)."""
    return self._document
def warning(self, message: str, line: Optional[int]):
    """Write a render warning to the warning stream.

    :param message: the warning text
    :param line: source line number; omitted from the output when ``None``
    """
    report = (
        f"RENDER WARNING: {message}\n"
        if line is None
        else f"RENDER WARNING:{line}: {message}\n"
    )
    self._warning_stream.write(report)
def to_tokens(self) -> RenderOutput:
    """Reset the state, convert the full document and return the result.

    Front-matter entries collected during the walk are wrapped in
    ``front_matter_*`` tokens and placed before the document tokens.

    :return: a copy of the token list together with the environment
    """
    self.reset_state()
    self._document.walkabout(self)

    # add front-matter that should be nested parsed
    if self._front_matter_tokens:
        header = [Token("front_matter_tokens_open", "", 1)]
        for key_path, key_tokens in self._front_matter_tokens:
            header.append(
                Token("front_matter_key_open", "", 1, meta={"key_path": key_path})
            )
            header += key_tokens
            header.append(Token("front_matter_key_close", "", -1))
        header.append(Token("front_matter_tokens_close", "", -1))
        self._tokens = header + self._tokens

    return RenderOutput(list(self._tokens), self._env)
def nested_parse(self, nodes: list[nodes.Element]) -> list[Token]:
    """Render the given nodes with a fresh renderer and return its tokens.

    The nested renderer shares this renderer's document, warning stream
    and options, including ``raise_on_warning`` so that strict mode also
    applies to nested content.  Only the nested token list is returned.

    :param nodes: the nodes to walk (note: shadows the ``nodes`` module
        inside this method)
    :return: the tokens produced for these nodes
    """
    renderer = MarkdownItRenderer(
        document=self._document,
        warning_stream=self._warning_stream,
        # previously omitted, so nested warnings never raised
        raise_on_warning=self.raise_on_warning,
        cite_prefix=self.cite_prefix,
        default_role=self.default_role,
        colon_fences=self.colon_fences,
        dollar_math=self.dollar_math,
    )
    for child in nodes:
        child.walkabout(renderer)
    return renderer._tokens
def add_token(
    self, ttype: str, tag: str, nesting: int, *, content: str = "", **kwargs: Any
) -> Token:
    """Create a markdown-it token and add it to the stream.

    ``*_open`` and ``*_close`` token types update the ``parent_tokens``
    counters.  While an ``inline`` container is active, new tokens are
    appended to its children instead of to the top-level stream.

    :param ttype: token type, e.g. ``"paragraph_open"``
    :param tag: tag name stored on the token
    :param nesting: nesting change stored on the token
        (this class passes 1 for open, 0 for standalone, -1 for close)
    :param content: token content
    :param kwargs: further keyword arguments forwarded to ``Token``
    :return: the created token
    """
    inline_openers = {
        "paragraph_open",
        "heading_open",
        "th_open",
        "td_open",
        "dt_open",
    }
    inline_closers = {
        "paragraph_close",
        "heading_close",
        "th_close",
        "td_close",
        "dt_close",
    }
    new_token = Token(ttype, tag, nesting, content=content, **kwargs)

    # record entries and exits
    if ttype.endswith("_open"):
        family = ttype[:-5]
        self.parent_tokens[family] = self.parent_tokens.get(family, 0) + 1
    if ttype.endswith("_close"):
        family = ttype[:-6]
        depth = self.parent_tokens.get(family, 0) - 1
        if depth > 0:
            self.parent_tokens[family] = depth
        else:
            # no open parent of this type remains
            self.parent_tokens.pop(family, None)

    # decide whether we should be adding as an inline child
    if ttype in inline_openers:
        self._tokens.append(new_token)
        self._inline = Token("inline", "", 0, children=[])
        self._tokens.append(self._inline)
    elif ttype in inline_closers:
        self._tokens.append(new_token)
        self._inline = None
    elif self._inline:
        self._inline.children.append(new_token)
    else:
        self._tokens.append(new_token)
    return new_token
def default_visit(self, node):
    """Delegate nodes without a specific visit method to ``unknown_visit``."""
    self.unknown_visit(node)
def default_departure(self, node):
    """Delegate nodes without a specific depart method to ``unknown_departure``."""
    self.unknown_departure(node)
def unknown_visit(self, node):
    """Warn that the node type has no visit method.

    :raises NotImplementedError: if ``raise_on_warning`` is set
    """
    problem = f"no visit method for: {node.__class__}"
    self.warning(problem, node.line)
    if not self.raise_on_warning:
        return
    raise NotImplementedError(problem)
def unknown_departure(self, node):
    """Warn that the node type has no depart method.

    :raises NotImplementedError: if ``raise_on_warning`` is set
    """
    problem = f"no depart method for: {node.__class__}"
    self.warning(problem, node.line)
    if not self.raise_on_warning:
        return
    raise NotImplementedError(problem)
# Skipped components
def visit_document(self, node):
    """Emit nothing when entering the document root."""
    return None
def depart_document(self, node):
    """Emit nothing when leaving the document root."""
    return None
def visit_Element(self, node):
    """Emit nothing when entering a generic element."""
    return None
def depart_Element(self, node):
    """Emit nothing when leaving a generic element."""
    return None
def visit_system_message(self, node):
    """Ignore system messages: no token is emitted and the node is skipped."""
    raise nodes.SkipNode
def visit_problematic(self, node):
    """Ignore problematic nodes: no token is emitted and the node is skipped."""
    raise nodes.SkipNode
# CommonMark components
def visit_section(self, node):
    """Emit nothing for a section; its heading is produced by the title."""
    return None
def depart_section(self, node):
    """Emit nothing when leaving a section."""
    return None
def visit_title(self, node):
    """Open a heading whose tag and ``#`` markup follow ``node["level"]``."""
    level = node["level"]
    heading = self.add_token("heading_open", f"h{level}", 1)
    heading.markup = "#" * level
def depart_title(self, node):
    """Close a heading whose tag and ``#`` markup follow ``node["level"]``."""
    level = node["level"]
    heading = self.add_token("heading_close", f"h{level}", -1)
    heading.markup = "#" * level
def visit_paragraph(self, node):
    """Open a paragraph, unless the paragraph sits inside a table cell.

    Paragraphs inside a list item of a tight list are marked as hidden.
    """
    inside_cell = self.parent_tokens.get("th") or self.parent_tokens.get("td")
    if not inside_cell:
        # table cells are treated as paragraphs already, so only open one here
        opener = self.add_token("paragraph_open", "p", 1)
        if self.parent_tokens.get("list_item") and self._tight_list:
            # paragraphs in tight lists are hidden
            opener.hidden = True
def depart_paragraph(self, node):
    """Close a paragraph, unless the paragraph sits inside a table cell."""
    inside_cell = self.parent_tokens.get("th") or self.parent_tokens.get("td")
    if not inside_cell:
        # table cells are treated as paragraphs already, so nothing was opened
        self.add_token("paragraph_close", "p", -1)
def visit_Text(self, node):
    """Emit a ``text`` token holding the node's text, then skip the node."""
    content = node.astext()
    self.add_token("text", "", 0, content=content)
    raise nodes.SkipNode
def visit_UnprocessedText(self, node):
    """Emit an ``unprocessed`` token holding the node's text, then skip the node."""
    content = node.astext()
    self.add_token("unprocessed", "", 0, content=content)
    raise nodes.SkipNode
def visit_emphasis(self, node):
    """Open an emphasis span (``*`` markup)."""
    self.add_token("em_open", "em", 1, markup="*")
def depart_emphasis(self, node):
    """Close an emphasis span (``*`` markup)."""
    self.add_token("em_close", "em", -1, markup="*")
def visit_strong(self, node):
    """Open a strong span (``**`` markup)."""
    self.add_token("strong_open", "strong", 1, markup="**")
def depart_strong(self, node):
    """Close a strong span (``**`` markup)."""
    self.add_token("strong_close", "strong", -1, markup="**")
def visit_transition(self, node):
    """Emit an ``hr`` token (``---`` markup) for a transition, then skip it."""
    self.add_token("hr", "hr", 0, markup="---")
    raise nodes.SkipNode
def visit_bullet_list(self, node):
    """Open a bullet list, using the node's bullet character as markup."""
    bullet = node["bullet"]
    self.add_token("bullet_list_open", "ul", 1, markup=bullet)
def depart_bullet_list(self, node):
    """Close a bullet list, using the node's bullet character as markup."""
    bullet = node["bullet"]
    self.add_token("bullet_list_close", "ul", -1, markup=bullet)
def visit_enumerated_list(self, node):
    """Open an ordered list, copying the ``start`` attribute when present."""
    opener = self.add_token("ordered_list_open", "ol", 1, markup=".")
    if "start" not in node:
        return
    opener.attrs["start"] = node["start"]
def depart_enumerated_list(self, node):
    """Close an ordered list (``.`` markup)."""
    self.add_token("ordered_list_close", "ol", -1, markup=".")
def visit_list_item(self, node):
    """Open a list item and record whether the list is tight.

    The markup is taken from the stripped ``prefix`` for bullet items and
    is ``.`` for enumerated items; other styles leave it untouched.
    """
    item = self.add_token("list_item_open", "li", 1)
    style = node["style"] if "style" in node else None
    if style == "bullet":
        item.markup = node["prefix"].strip()
    elif style == "enumerated":
        item.markup = "."
    # a list is loose if any of its list items directly contain
    # two block-level elements, otherwise tight. In this case paragraphs are hidden
    child_count = len(node.children)
    self._tight_list = child_count < 2
def depart_list_item(self, node):
    """Close a list item."""
    self.add_token("list_item_close", "li", -1)
def visit_literal(self, node):
    """Emit inline code holding the node's text, then skip the node."""
    code = node.astext()
    self.add_token("code_inline", "code", 0, markup="`", content=code)
    raise nodes.SkipNode
def visit_literal_block(self, node):
    """Emit a ``code_block`` token for a literal block, then skip the node.

    The content always ends with a newline.
    """
    code = node.astext()
    content = code if code.endswith("\n") else code + "\n"
    self.add_token("code_block", "code", 0, content=content)
    raise nodes.SkipNode
def visit_doctest_block(self, node):
    """Emit a doctest block as a ``pycon`` code fence, then skip the node.

    A warning is recorded because the block type is changed.
    """
    # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#doctest-blocks
    # https://pygments.org/docs/lexers/#pygments.lexers.python.PythonConsoleLexer
    self.warning("Treating doctest block as pycon literal block", node.line)
    session = node.astext()
    # the fence content always ends with a newline
    content = session if session.endswith("\n") else session + "\n"
    self.add_token("fence", "code", 0, content=content, markup="```", info="pycon")
    raise nodes.SkipNode
def visit_block_quote(self, node):
    """Open a block quote (``>`` markup)."""
    self.add_token("blockquote_open", "blockquote", 1, markup=">")
def depart_block_quote(self, node):
    """Close a block quote (``>`` markup)."""
    self.add_token("blockquote_close", "blockquote", -1, markup=">")
def visit_attribution(self, node):
    """Emit a block quote attribution as inline HTML, then skip the node.

    Markdown block quotes do not have an attribution syntax, so a
    ``<p class="attribution">`` element is used as a best approximation.
    NOTE(review): the attribution text is inserted into the HTML unescaped.
    """
    html_token = self.add_token("html_inline", "", 0)
    html_token.content = f'<p class="attribution">-{node.astext()}</p>'
    raise nodes.SkipNode
def visit_reference(self, node):
    """Emit link tokens for a reference, then skip the node.

    Handled in order of precedence: standalone URIs (autolinks), named
    references (``refname``), external links (``refuri``) and internal
    targets (``refid``).  Any other reference produces a warning only.

    :raises NotImplementedError: for an unknown reference type, if
        ``raise_on_warning`` is set
    """
    # we assume all reference names are plain text
    label = node.astext()

    if "standalone_uri" in node:
        # autolink: the URI is both the target and the link text
        opener = self.add_token("link_open", "a", 1, markup="autolink", info="auto")
        opener.attrs["href"] = node["refuri"]
        self.add_token("text", "", 0, content=node["refuri"])
        self.add_token("link_close", "a", -1, markup="autolink", info="auto")
        raise nodes.SkipNode

    if "refname" in node:
        # reference a link definition `[refname]: url`, or a target `(refname)=`
        # TODO ensure mdformat does not wrap in <>
        # TODO should only add label if target found?
        link_options = {
            "attrs": {"href": node["refname"]},
            "meta": {"label": node["refname"]},
        }
    elif "refuri" in node:
        # external link
        # TODO ensure prefixed with http://?
        link_options = {"attrs": {"href": node["refuri"]}}
    elif "refid" in node:
        # anonymous links, pointing to internal targets
        # TODO ensure mdformat does not wrap in <>
        link_options = {"attrs": {"href": node["refid"]}}
    else:
        message = f"unknown reference type: {node.rawsource}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)
        raise nodes.SkipNode

    self.add_token("link_open", "a", 1, **link_options)
    self.add_token("text", "", 0, content=label)
    self.add_token("link_close", "a", -1)
    raise nodes.SkipNode
def visit_target(self, node):
    """Record or emit a link target, then skip the node.

    * inline targets are not implemented: a warning is recorded and the
      raw source is emitted as inline code
    * targets with a ``refuri`` are stored in the environment as link
      reference definitions (repeated names go to ``duplicate_refs``)
    * otherwise each name is emitted as a ``myst_target`` token
    * a ``refid``, when present, is also emitted as a ``myst_target`` token

    :raises NotImplementedError: for inline targets, if
        ``raise_on_warning`` is set
    """
    if node.get("inline"):
        # TODO inline targets
        message = f"inline targets not implemented: {node.rawsource}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)
        self.add_token(
            "code_inline", "code", 0, markup="`", content=str(node.rawsource)
        )
        raise nodes.SkipNode

    if "refuri" in node:
        references = self._env["references"]
        for name in node["names"]:
            # TODO warn about name starting ^ (clashes with footnotes)
            definition = {
                "title": "",
                "href": node["refuri"],
                "map": [node.line, node.line],
            }
            if name in references:
                self._env["duplicate_refs"].append({"label": name, **definition})
            else:
                references[name] = definition
    elif "names" in node:
        for name in node["names"]:
            self.add_token(
                "myst_target", "", 0, attrs={"class": "myst-target"}, content=name
            )

    if "refid" in node:
        self.add_token(
            "myst_target",
            "",
            0,
            attrs={"class": "myst-target"},
            content=node["refid"],
        )
    # TODO check for content?
    raise nodes.SkipNode
# Standard CommonMark extensions
def parse_gfm_table(self, node) -> bool:
    """Check whether an RST table can be converted to a GFM one.

    RST tables can have e.g. cells spanning multiple columns/rows,
    which the GitHub Flavoured Markdown (GFM) table variant does not support.

    :param node: the table node to inspect
    :return: True if the table has a single ``tgroup`` with a one-row
        header and every header/body row has exactly ``cols`` children
    """
    # must have one child tgroup
    table_children = node.children
    if len(table_children) != 1:
        return False
    tgroup = table_children[0]
    if not isinstance(tgroup, nodes.tgroup):
        return False

    # tgroup should contain the number of columns
    if "cols" not in tgroup:
        return False
    ncolumns = tgroup["cols"]

    # tgroup should contain children: (colspec)*, thead, tbody
    parts = tgroup.children
    if len(parts) < 2:
        return False
    thead, tbody = parts[-2], parts[-1]
    if not (isinstance(thead, nodes.thead) and isinstance(tbody, nodes.tbody)):
        return False

    # the header can only have one row with the full amount of columns
    header_rows = thead.children
    if len(header_rows) != 1 or len(header_rows[0]) != ncolumns:
        return False

    # each body row should have the full amount of columns
    for row in tbody.children:
        if len(row.children) != ncolumns:
            return False
    return True
def visit_table(self, node):
    """Open a table, or fall back to an ``{eval-rst}`` fence.

    Tables rejected by ``parse_gfm_table`` are emitted verbatim (raw
    source, newline-terminated) inside a code fence and then skipped.
    """
    if self.parse_gfm_table(node):
        self.add_token("table_open", "table", 1)
        return

    source = node.rawsource
    content = source if source.endswith("\n") else source + "\n"
    self.add_token(
        "fence", "code", 0, content=content, markup="```", info="{eval-rst}"
    )
    raise nodes.SkipNode
def depart_table(self, node):
    """Close a table."""
    self.add_token("table_close", "table", -1)
def visit_tgroup(self, node):
    """Emit nothing when entering a table group."""
    return None
def depart_tgroup(self, node):
    """Emit nothing when leaving a table group."""
    return None
def visit_colspec(self, node):
    """Skip column specifications: no token is emitted for them."""
    raise nodes.SkipNode
def visit_thead(self, node):
    """Open the table header."""
    self.add_token("thead_open", "thead", 1)
def depart_thead(self, node):
    """Close the table header."""
    self.add_token("thead_close", "thead", -1)
def visit_tbody(self, node):
    """Open the table body."""
    self.add_token("tbody_open", "tbody", 1)
def depart_tbody(self, node):
    """Close the table body."""
    self.add_token("tbody_close", "tbody", -1)
def visit_row(self, node):
    """Open a table row."""
    self.add_token("tr_open", "tr", 1)
def depart_row(self, node):
    """Close a table row."""
    self.add_token("tr_close", "tr", -1)
def visit_entry(self, node):
    """Open a table cell: ``th`` inside the table header, ``td`` otherwise."""
    if self.parent_tokens.get("thead"):
        cell_tag = "th"
    else:
        cell_tag = "td"
    self.add_token(f"{cell_tag}_open", cell_tag, 1)
def depart_entry(self, node):
    """Close a table cell, replacing newlines in its inline content.

    Markdown cells can not include newlines, so each newline in the
    content of the cell's inline children is replaced by a space.
    """
    # TODO improve or upstream this "fix"
    # maybe replace with html_inline <br> tokens (text will be escaped)
    inline = self._inline
    if inline:
        for inline_child in inline.children:
            inline_child.content = inline_child.content.replace("\n", " ")

    cell_tag = "th" if self.parent_tokens.get("thead") else "td"
    self.add_token(f"{cell_tag}_close", cell_tag, -1)
# TODO check whether handling of id/subId is required for footnotes
def visit_footnote(self, node, refname=None):
    """Open a footnote block and the footnote itself.

    :param refname: label to use; defaults to the node's first id
    """
    if not refname:
        refname = node["ids"][0]  # assume there is only one id
    self.add_token("footnote_block_open", "", 1)
    self.add_token("footnote_open", "", 1, meta={"label": refname, "id": 0})
def depart_footnote(self, node):
    """Close the footnote and then its surrounding footnote block."""
    for closing_type in ("footnote_close", "footnote_block_close"):
        self.add_token(closing_type, "", -1)
def visit_citation(self, node):
    """Open a citation, rendered as a footnote with a prefixed label.

    The label is the citation's first name.  When ``names`` is empty the
    first entry of ``dupnames`` and then of ``ids`` is used instead, so a
    citation with a duplicated name no longer fails with ``IndexError``.
    If no label can be found a warning is recorded and the node skipped.

    :raises NotImplementedError: if no label is found and
        ``raise_on_warning`` is set
    """
    # NOTE(review): docutils is understood to move a clashing name from
    # "names" to "dupnames"; confirm against the docutils version in use.
    labels = node.get("names") or node.get("dupnames") or node.get("ids")
    if not labels:
        message = f"citation without a name or id: {node.rawsource}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)
        # skipping here also skips depart_citation, so tokens stay balanced
        raise nodes.SkipNode
    # treated same as for visit_footnote, but with specific prefix
    refname = f"{self.cite_prefix}{labels[0]}"  # assume there is only one name
    return self.visit_footnote(node, refname=refname)
def depart_citation(self, node):
    """Close a citation; handled exactly like leaving a footnote."""
    result = self.depart_footnote(node)
    return result
def visit_footnote_reference(self, node):
    """Emit a ``footnote_ref`` token for a footnote reference, then skip it.

    The label is the node's ``refname`` or, failing that, its ``refid``.
    If neither is present a warning is recorded and no token is emitted
    (previously this path failed with ``UnboundLocalError``).

    :raises NotImplementedError: for a reference without ``refname`` or
        ``refid``, if ``raise_on_warning`` is set
    """
    if "refname" in node:
        label = node["refname"]
    elif "refid" in node:
        label = node["refid"]
    else:
        message = f"unknown footnote reference type: {node.rawsource}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)
        # no label is available, so a token can not be built
        raise nodes.SkipNode
    self.add_token("footnote_ref", "", 0, meta={"label": label, "id": 0, "subId": 0})
    raise nodes.SkipNode
def visit_citation_reference(self, node):
    """Emit a ``footnote_ref`` token for a citation reference, then skip it.

    For compatibility citations are treated the same as footnotes, with
    ``cite_prefix`` prepended to the label.  The label is the node's
    ``refname`` or, failing that, its ``refid``; if neither is present a
    warning is recorded and no token is emitted (previously ``KeyError``).

    :raises NotImplementedError: for a reference without ``refname`` or
        ``refid``, if ``raise_on_warning`` is set
    """
    if "refname" in node:
        target = node["refname"]
    elif "refid" in node:
        target = node["refid"]
    else:
        message = f"unknown citation reference type: {node.rawsource}"
        self.warning(message, node.line)
        if self.raise_on_warning:
            raise NotImplementedError(message)
        raise nodes.SkipNode
    label = f"{self.cite_prefix}{target}"
    self.add_token("footnote_ref", "", 0, meta={"label": label, "id": 0, "subId": 0})
    # the node also contains the refname as text, but we don't need that
    raise nodes.SkipNode
def visit_definition_list(self, node):
    """Open a definition list."""
    self.add_token("dl_open", "dl", 1)
def depart_definition_list(self, node):
    """Close a definition list."""
    self.add_token("dl_close", "dl", -1)
def visit_definition_list_item(self, node):
    """Emit nothing when entering a definition list item."""
    return None
def depart_definition_list_item(self, node):
    """Emit nothing when leaving a definition list item."""
    return None
def visit_term(self, node):
    """Open a definition term."""
    self.add_token("dt_open", "dt", 1)
def depart_term(self, node):
    """Close a definition term."""
    self.add_token("dt_close", "dt", -1)
def visit_classifier(self, node):
    """Skip term classifiers: no token is emitted for them.

    Classifiers can follow a term, e.g. ``term : classifier``.
    """
    # TODO record term classifiers?
    raise nodes.SkipNode
def visit_definition(self, node):
    """Open a definition description."""
    self.add_token("dd_open", "dd", 1)
def depart_definition(self, node):
    """Close a definition description."""
    self.add_token("dd_close", "dd", -1)
def visit_FrontMatterNode(self, node):
    """Collect front-matter fields for later output, then skip the node.

    Each field with exactly two children contributes one entry: the text
    of the first child's first child is the key and the children of the
    second child are nested-parsed into tokens.  Other fields are ignored.
    """
    for field in node:
        if len(field) == 2:
            name_part, body_part = field[0], field[1]
            key = name_part[0].astext()
            body_tokens = self.nested_parse(body_part.children)
            self._front_matter_tokens.append(([key], body_tokens))
    raise nodes.SkipNode
def visit_field_list(self, node):
    """Emit a field list verbatim in an ``{eval-rst}`` fence, then skip it.

    Nothing is emitted when the node has no raw source.
    """
    source = node.rawsource
    if source:
        content = "\n" + source.strip() + "\n"
        self.add_token("fence", "code", 0, content=content, info="{eval-rst}")
    # NOTE(review): the node is skipped whether or not a fence was emitted;
    # confirm this matches the intended nesting of the original statement.
    raise nodes.SkipNode
# MyST Markdown specific
def visit_RoleNode(self, node):
    """Emit a token for a role, then skip the node.

    The role name falls back to ``default_role``.  Without any role name
    the text becomes inline code; the ``math`` role becomes inline
    dollar-math when ``dollar_math`` is enabled; every other role becomes
    a ``myst_role`` token.
    """
    # TODO nested parse of specific roles
    role = node["role"] or self.default_role
    if not role:
        self.add_token("code_inline", "code", 0, markup="`", content=node["text"])
    elif self.dollar_math and role == "math":
        self.add_token(
            "math_inline", "math", 0, markup="$", content=node["text"].strip()
        )
    else:
        self.add_token("myst_role", "", 0, meta={"name": role}, content=node["text"])
    raise nodes.SkipNode
def visit_comment(self, node):
    """Emit a comment as a ``myst_line_comment`` token, then skip the node.

    Every line of the comment text is prefixed with a space.
    """
    # TODO alternately use <!-- -->
    comment_text = indent(node.astext(), " ")
    self.add_token(
        "myst_line_comment",
        "hr",
        0,
        attrs={"class": "myst-line-comment"},
        content=comment_text,
    )
    raise nodes.SkipNode
def visit_substitution_reference(self, node):
    """Emit a ``substitution_inline`` token for the reference, then skip it."""
    key = node["refname"]
    self.add_token("substitution_inline", "span", 0, content=key)
    # the node also contains the refname as text, but we don't need that
    raise nodes.SkipNode
def visit_substitution_definition(self, node):
    """Store a substitution definition as front-matter, then skip the node.

    The definition's children are nested-parsed and recorded under the
    key path ``["substitutions", <first name>]``.  Definitions without a
    name are skipped without recording anything.
    """
    has_name = "names" in node and node["names"]
    if has_name:
        key = node["names"][0]
        # substitution definition should always be a single directive node
        definition_tokens = self.nested_parse(node.children)
        self._front_matter_tokens.append((["substitutions", key], definition_tokens))
    raise nodes.SkipNode
def visit_EvalRstNode(self, node):
    """Emit the node's text in an ``{eval-rst}`` fence, then skip the node.

    The fence content always ends with a newline.
    """
    source = node.astext()
    content = source if source.endswith("\n") else source + "\n"
    self.add_token("fence", "code", 0, content=content, info="{eval-rst}")
    raise nodes.SkipNode
def visit_DirectiveNode(self, node):
    """Emit tokens for a directive.

    Two special cases are rendered directly and the node is then skipped:

    * a ``code-block`` directive without options becomes a standard
      Markdown code fence, with the argument as info string
    * a ``math`` directive with no option, or only a ``label`` option,
      becomes a dollar-math block when ``dollar_math`` is enabled

    Every other directive opens a ``directive_open`` token carrying the
    directive's name, module, conversion and options; its children are
    then visited normally.  With ``colon_fences`` enabled, directives
    whose content is parsed use ``:`` as markup instead of a backtick.
    """
    markup = "`"
    if self.colon_fences and node["conversion"] in (
        "parse_content",
        "parse_content_titles",
        "parse_all",
    ):
        markup = ":"

    if (
        (
            node["name"] == "code-block"
            or node["module"] == "sphinx.directives.patches.Code"
        )
        and not node["options_list"]
        and len(node.children) == 2
    ):
        # special case, where we can use standard Markdown fences
        argument, content = node.children
        self.add_token(
            "fence",
            "code",
            0,
            content=content.astext() + "\n",
            markup="```",
            info=argument.astext().strip(),
        )
        raise nodes.SkipNode
    elif (
        (
            node["name"] == "math"
            or node["module"] == "docutils.parsers.rst.directives.body.MathBlock"
        )
        and self.dollar_math
        and (
            not node["options_list"]
            or (
                len(node["options_list"]) == 1
                and node["options_list"][0][0] == "label"
            )
        )
        and len(node.children) == 2
    ):
        # special case where we use dollarmath
        argument, content = node.children
        text = ""
        if argument.astext().strip():
            text += "\n" + argument.astext().strip() + "\n"
        if content.astext().strip():
            text += "\n" + content.astext().strip() + "\n"
        if node["options_list"]:
            label = node["options_list"][0][1]
            # mdit-py-plugins before 0.3 gets the "math_block_eqno" token name.
            # Only major/minor are parsed, so versions with extra or
            # non-numeric parts (e.g. "0.4.0rc1", "0.4") no longer raise.
            name = "math_block_label"
            version_parts = mdit_plug_version.split(".")
            try:
                major, minor = int(version_parts[0]), int(version_parts[1])
            except (IndexError, ValueError):
                # unparseable version: keep the current token name
                pass
            else:
                if major == 0 and minor < 3:
                    name = "math_block_eqno"
            self.add_token(name, "math", 0, markup="$$", content=text, info=label)
        else:
            self.add_token("math_block", "math", 0, markup="$$", content=text)
        raise nodes.SkipNode
    else:
        self.add_token(
            "directive_open",
            "",
            1,
            meta={
                key: node[key]
                for key in ["name", "module", "conversion", "options_list"]
            },
            markup=markup,
        )
def depart_DirectiveNode(self, node):
    """Close a directive."""
    self.add_token("directive_close", "", -1)
def visit_ArgumentNode(self, node):
    """Open a directive argument."""
    # TODO might be a better construct to have this as children of inline
    self.add_token("directive_arg_open", "", 1)
def depart_ArgumentNode(self, node):
    """Close a directive argument."""
    self.add_token("directive_arg_close", "", -1)
def visit_ContentNode(self, node):
    """Open the content of a directive."""
    self.add_token("directive_content_open", "", 1)
def depart_ContentNode(self, node):
    """Close the content of a directive."""
    self.add_token("directive_content_close", "", -1)
# TODO support the remaining constructs listed in
# https://docutils.sourceforge.io/docs/user/rst/quickref.html
# e.g. line block, option list