Source code for rst_to_myst.markdownit

"""Convert to markdown-it tokens, which can then be rendered by mdformat."""

from io import StringIO
from textwrap import indent
from typing import IO, Any, NamedTuple, Optional

from docutils import nodes
from markdown_it.token import Token
from mdit_py_plugins import __version__ as mdit_plug_version


class RenderOutput(NamedTuple):
    """Result of rendering a docutils document to a markdown-it token stream."""

    # the rendered token stream (front-matter tokens first, when present)
    tokens: list[Token]
    # the environment collected while rendering,
    # holding the "references" and "duplicate_refs" entries
    env: dict[str, Any]
class MarkdownItRenderer(nodes.GenericNodeVisitor):
    """Render docutils AST to Markdown-It token stream.

    The renderer walks a docutils document and emits ``Token`` objects that
    can subsequently be rendered to Markdown text (e.g. by mdformat).

    :param document: the docutils document to render
    :param warning_stream: stream that render warnings are written to
        (an in-memory ``StringIO`` is used when not given)
    :param raise_on_warning: raise ``NotImplementedError`` for unsupported
        constructs, instead of only writing a warning
    :param cite_prefix: prefix added to citation labels
    :param default_role: role name used for roles without an explicit name;
        if not set, such roles are converted to inline literals
    :param colon_fences: use ``:`` markup for directives whose content is parsed
    :param dollar_math: render math roles/directives with dollar syntax
    """

    # token types that open a container whose content is collected as the
    # children of a single ``inline`` token, and the types that close it again
    _INLINE_OPEN_TYPES = frozenset(
        {"paragraph_open", "heading_open", "th_open", "td_open", "dt_open"}
    )
    _INLINE_CLOSE_TYPES = frozenset(
        {"paragraph_close", "heading_close", "th_close", "td_close", "dt_close"}
    )

    def __init__(
        self,
        document: nodes.document,
        *,
        warning_stream: Optional[IO] = None,
        raise_on_warning: bool = False,
        cite_prefix: str = "cite_",
        default_role: Optional[str] = None,
        colon_fences: bool = True,
        dollar_math: bool = True,
    ):
        # NOTE: the document is stored privately and exposed through the
        # read-only ``document`` property below
        self._document = document
        self._warning_stream = warning_stream or StringIO()
        self.raise_on_warning = raise_on_warning
        # prefix added to citation labels
        self.cite_prefix = cite_prefix
        # if no default role, convert to literal
        self.default_role = default_role
        self.colon_fences = colon_fences
        self.dollar_math = dollar_math
        self.reset_state()

    def reset_state(self):
        """Reset all state that is accumulated during a document walk."""
        # record current state, that can affect children tokens
        self._tokens: list[Token] = []
        self._env = {"references": {}, "duplicate_refs": []}
        # the inline token currently collecting children (None outside of one)
        self._inline: Optional[Token] = None
        # number of currently open tokens, keyed by type without "_open"/"_close"
        self.parent_tokens: dict[str, int] = {}
        # [(key path, tokens), ...]
        self._front_matter_tokens: list[tuple[list[str], list[Token]]] = []
        self._tight_list = True

    @property
    def document(self) -> nodes.document:
        """The docutils document being rendered."""
        return self._document

    def warning(self, message: str, line: Optional[int]):
        """Write a render warning to the warning stream.

        :param message: the warning text
        :param line: the source line number, included in the output when known
        """
        if line is not None:
            self._warning_stream.write(f"RENDER WARNING:{line}: {message}\n")
        else:
            self._warning_stream.write(f"RENDER WARNING: {message}\n")

    def _warn_or_raise(self, message: str, line: Optional[int]) -> None:
        """Write a warning, then raise if ``raise_on_warning`` is set.

        :raises NotImplementedError: if ``raise_on_warning`` is true
        """
        self.warning(message, line)
        if self.raise_on_warning:
            raise NotImplementedError(message)

    @staticmethod
    def _with_trailing_newline(text: str) -> str:
        """Return ``text``, terminated by a newline if it is not already."""
        return text if text.endswith("\n") else text + "\n"

    @staticmethod
    def _labelled_math_token_type() -> str:
        """Return the token type used for labelled math blocks.

        ``math_block_eqno`` is used for mdit-py-plugins versions below 0.3,
        ``math_block_label`` otherwise.
        """
        # only major/minor are required, so versions with more (or fewer) than
        # three components, or with suffixes on later components, are accepted
        parts = mdit_plug_version.split(".")
        try:
            major, minor = int(parts[0]), int(parts[1])
        except (IndexError, ValueError):
            # assumption: a version that cannot be parsed is a recent release
            return "math_block_label"
        if major == 0 and minor < 3:
            return "math_block_eqno"
        return "math_block_label"

    def to_tokens(self) -> RenderOutput:
        """Reset tokens and convert full document.

        :return: a copy of the token stream, and the collected environment
        """
        self.reset_state()
        self._document.walkabout(self)
        # add front-matter that should be nested parsed
        if self._front_matter_tokens:
            fm_tokens = [Token("front_matter_tokens_open", "", 1)]
            for key_path, tokens in self._front_matter_tokens:
                fm_tokens.append(
                    Token("front_matter_key_open", "", 1, meta={"key_path": key_path})
                )
                fm_tokens.extend(tokens)
                fm_tokens.append(Token("front_matter_key_close", "", -1))
            fm_tokens.append(Token("front_matter_tokens_close", "", -1))
            self._tokens = fm_tokens + self._tokens
        return RenderOutput(self._tokens[:], self._env)

    def nested_parse(self, nodes: list[nodes.Element]) -> list[Token]:
        """Render the given nodes with a fresh renderer sharing this configuration.

        :param nodes: the nodes to render
        :return: the tokens produced for the nodes
        """
        new_inst = MarkdownItRenderer(
            document=self._document,
            warning_stream=self._warning_stream,
            # nested content must honour strict mode in the same way
            raise_on_warning=self.raise_on_warning,
            cite_prefix=self.cite_prefix,
            default_role=self.default_role,
            colon_fences=self.colon_fences,
            dollar_math=self.dollar_math,
        )
        for node in nodes:
            node.walkabout(new_inst)
        return new_inst._tokens

    def add_token(
        self, ttype: str, tag: str, nesting: int, *, content: str = "", **kwargs: Any
    ) -> Token:
        """Add a markdown-it token to the stream, handling inline tokens and children.

        :param ttype: the token type
        :param tag: the HTML tag name of the token
        :param nesting: 1 for opening, 0 for self-closing, -1 for closing
        :param content: the token content
        :param kwargs: additional keyword arguments passed to ``Token``
        :return: the created token
        """
        token = Token(ttype, tag, nesting, content=content, **kwargs)
        # record entries and exits
        if ttype.endswith("_open"):
            name = ttype[:-5]
            self.parent_tokens[name] = self.parent_tokens.get(name, 0) + 1
        elif ttype.endswith("_close"):
            name = ttype[:-6]
            depth = self.parent_tokens.get(name, 0) - 1
            if depth <= 0:
                # no longer inside this token type
                self.parent_tokens.pop(name, None)
            else:
                self.parent_tokens[name] = depth
        # decide whether we should be adding as an inline child
        if ttype in self._INLINE_OPEN_TYPES:
            self._tokens.append(token)
            # subsequent tokens become children of this inline token
            self._inline = Token("inline", "", 0, children=[])
            self._tokens.append(self._inline)
        elif ttype in self._INLINE_CLOSE_TYPES:
            self._tokens.append(token)
            self._inline = None
        elif self._inline is not None:
            self._inline.children.append(token)
        else:
            self._tokens.append(token)
        return token

    def default_visit(self, node):
        """Treat nodes without a specific visit method as unknown."""
        self.unknown_visit(node)

    def default_departure(self, node):
        """Treat nodes without a specific depart method as unknown."""
        self.unknown_departure(node)

    def unknown_visit(self, node):
        """Warn (or raise, if ``raise_on_warning``) about a missing visit method."""
        self._warn_or_raise(f"no visit method for: {node.__class__}", node.line)

    def unknown_departure(self, node):
        """Warn (or raise, if ``raise_on_warning``) about a missing depart method."""
        self._warn_or_raise(f"no depart method for: {node.__class__}", node.line)

    # Skipped components

    def visit_document(self, node):
        pass

    def depart_document(self, node):
        pass

    def visit_Element(self, node):
        pass

    def depart_Element(self, node):
        pass

    def visit_system_message(self, node):
        # ignore
        raise nodes.SkipNode

    def visit_problematic(self, node):
        # ignore
        raise nodes.SkipNode

    # CommonMark components

    def visit_section(self, node):
        pass  # handled by title

    def depart_section(self, node):
        pass

    def visit_title(self, node):
        token = self.add_token("heading_open", f"h{node['level']}", 1)
        token.markup = "#" * node["level"]

    def depart_title(self, node):
        token = self.add_token("heading_close", f"h{node['level']}", -1)
        token.markup = "#" * node["level"]

    def visit_paragraph(self, node):
        if self.parent_tokens.get("th") or self.parent_tokens.get("td"):
            # table cells are treated as paragraphs already
            return
        token = self.add_token("paragraph_open", "p", 1)
        if self.parent_tokens.get("list_item") and self._tight_list:
            # paragraphs in tight lists are hidden
            token.hidden = True

    def depart_paragraph(self, node):
        if self.parent_tokens.get("th") or self.parent_tokens.get("td"):
            # table cells are treated as paragraphs already
            return
        self.add_token("paragraph_close", "p", -1)

    def visit_Text(self, node):
        self.add_token("text", "", 0, content=node.astext())
        raise nodes.SkipNode

    def visit_UnprocessedText(self, node):
        self.add_token("unprocessed", "", 0, content=node.astext())
        raise nodes.SkipNode

    def visit_emphasis(self, node):
        self.add_token("em_open", "em", 1, markup="*")

    def depart_emphasis(self, node):
        self.add_token("em_close", "em", -1, markup="*")

    def visit_strong(self, node):
        self.add_token("strong_open", "strong", 1, markup="**")

    def depart_strong(self, node):
        self.add_token("strong_close", "strong", -1, markup="**")

    def visit_transition(self, node):
        self.add_token("hr", "hr", 0, markup="---")
        raise nodes.SkipNode

    def visit_bullet_list(self, node):
        self.add_token("bullet_list_open", "ul", 1, markup=node["bullet"])

    def depart_bullet_list(self, node):
        self.add_token("bullet_list_close", "ul", -1, markup=node["bullet"])

    def visit_enumerated_list(self, node):
        token = self.add_token("ordered_list_open", "ol", 1, markup=".")
        if "start" in node:
            token.attrs["start"] = node["start"]

    def depart_enumerated_list(self, node):
        self.add_token("ordered_list_close", "ol", -1, markup=".")

    def visit_list_item(self, node):
        token = self.add_token("list_item_open", "li", 1)
        if "style" in node:
            if node["style"] == "bullet":
                token.markup = node["prefix"].strip()
            elif node["style"] == "enumerated":
                token.markup = "."
        # a list is loose if any of its list items directly contain
        # two block-level elements, otherwise tight. In this case paragraphs are hidden
        self._tight_list = len(node.children) < 2

    def depart_list_item(self, node):
        self.add_token("list_item_close", "li", -1)

    def visit_literal(self, node):
        self.add_token("code_inline", "code", 0, markup="`", content=node.astext())
        raise nodes.SkipNode

    def visit_literal_block(self, node):
        text = self._with_trailing_newline(node.astext())
        self.add_token("code_block", "code", 0, content=text)
        raise nodes.SkipNode

    def visit_doctest_block(self, node):
        # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#doctest-blocks
        # https://pygments.org/docs/lexers/#pygments.lexers.python.PythonConsoleLexer
        self.warning("Treating doctest block as pycon literal block", node.line)
        text = self._with_trailing_newline(node.astext())
        self.add_token(
            "fence",
            "code",
            0,
            content=text,
            markup="```",
            info="pycon",
        )
        raise nodes.SkipNode

    def visit_block_quote(self, node):
        self.add_token("blockquote_open", "blockquote", 1, markup=">")

    def depart_block_quote(self, node):
        self.add_token("blockquote_close", "blockquote", -1, markup=">")

    def visit_attribution(self, node):
        # Markdown block quotes do not have an attribution syntax,
        # so we add a best approximation
        token = self.add_token("html_inline", "", 0)
        token.content = f'<p class="attribution">-{node.astext()}</p>'
        raise nodes.SkipNode

    def visit_reference(self, node):
        """Render a reference as a link; the link kind depends on the node attributes."""
        # we assume all reference names are plain text
        text = node.astext()
        if "standalone_uri" in node:
            # autolink
            token = self.add_token("link_open", "a", 1, markup="autolink", info="auto")
            token.attrs["href"] = node["refuri"]
            self.add_token("text", "", 0, content=node["refuri"])
            self.add_token("link_close", "a", -1, markup="autolink", info="auto")
        elif "refname" in node:
            # reference a link definition `[refname]: url`, or a target `(refname)=`
            # TODO ensure mdformat does not wrap in <>
            self.add_token(
                "link_open",
                "a",
                1,
                attrs={"href": node["refname"]},
                # TODO should only add label if target found?
                meta={"label": node["refname"]},
            )
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        elif "refuri" in node:
            # external link
            # TODO ensure prefixed with http://?
            self.add_token("link_open", "a", 1, attrs={"href": node["refuri"]})
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        elif "refid" in node:
            # anonymous links, pointing to internal targets
            # TODO ensure mdformat does not wrap in <>
            self.add_token(
                "link_open",
                "a",
                1,
                attrs={"href": node["refid"]},
            )
            self.add_token("text", "", 0, content=text)
            self.add_token("link_close", "a", -1)
        else:
            self._warn_or_raise(
                f"unknown reference type: {node.rawsource}", node.line
            )
        raise nodes.SkipNode

    def visit_target(self, node):
        """Render a target as a link definition (with URI) or a MyST target."""
        if node.get("inline"):
            # TODO inline targets
            self._warn_or_raise(
                f"inline targets not implemented: {node.rawsource}", node.line
            )
            self.add_token(
                "code_inline", "code", 0, markup="`", content=str(node.rawsource)
            )
            raise nodes.SkipNode
        if "refuri" in node:
            # link definitions are recorded in the environment, not as tokens
            for name in node["names"]:
                # TODO warn about name starting ^ (clashes with footnotes)
                if name not in self._env["references"]:
                    self._env["references"][name] = {
                        "title": "",
                        "href": node["refuri"],
                        "map": [node.line, node.line],
                    }
                else:
                    self._env["duplicate_refs"].append(
                        {
                            "label": name,
                            "title": "",
                            "href": node["refuri"],
                            "map": [node.line, node.line],
                        }
                    )
        elif "names" in node:
            for name in node["names"]:
                self.add_token(
                    "myst_target", "", 0, attrs={"class": "myst-target"}, content=name
                )
        if "refid" in node:
            self.add_token(
                "myst_target",
                "",
                0,
                attrs={"class": "myst-target"},
                content=node["refid"],
            )
        # TODO check for content?
        raise nodes.SkipNode

    # Standard CommonMark extensions

    def parse_gfm_table(self, node) -> bool:
        """Check whether an RST table can be converted to a GFM one.

        RST tables can have e.g. cells spanning multiple columns/rows,
        which the GitHub Flavoured Markdown (GFM) table variant does not support.
        """
        # must have one child tgroup
        if len(node.children) != 1 or not isinstance(node.children[0], nodes.tgroup):
            return False
        # tgroup should contain the number of columns
        tgroup = node.children[0]
        if "cols" not in tgroup:
            return False
        ncolumns = tgroup["cols"]
        # tgroup should contain children: (colspec)*, thead, tbody
        if len(tgroup.children) < 2:
            return False
        if not isinstance(tgroup.children[-2], nodes.thead):
            return False
        if not isinstance(tgroup.children[-1], nodes.tbody):
            return False
        thead = tgroup.children[-2]
        tbody = tgroup.children[-1]
        # the header can only have one row with the full amount of columns
        if len(thead.children) != 1 or len(thead.children[0]) != ncolumns:
            return False
        # each body row should have the full amount of columns
        return all(len(row.children) == ncolumns for row in tbody.children)

    def visit_table(self, node):
        """Open a GFM table, or fall back to an ``eval-rst`` fence of the raw source."""
        if not self.parse_gfm_table(node):
            text = self._with_trailing_newline(node.rawsource)
            self.add_token(
                "fence", "code", 0, content=text, markup="```", info="{eval-rst}"
            )
            raise nodes.SkipNode
        self.add_token("table_open", "table", 1)

    def depart_table(self, node):
        self.add_token("table_close", "table", -1)

    def visit_tgroup(self, node):
        pass

    def depart_tgroup(self, node):
        pass

    def visit_colspec(self, node):
        raise nodes.SkipNode

    def visit_thead(self, node):
        self.add_token("thead_open", "thead", 1)

    def depart_thead(self, node):
        self.add_token("thead_close", "thead", -1)

    def visit_tbody(self, node):
        self.add_token("tbody_open", "tbody", 1)

    def depart_tbody(self, node):
        self.add_token("tbody_close", "tbody", -1)

    def visit_row(self, node):
        self.add_token("tr_open", "tr", 1)

    def depart_row(self, node):
        self.add_token("tr_close", "tr", -1)

    def visit_entry(self, node):
        # entries inside the table head are header cells
        tag = "th" if self.parent_tokens.get("thead") else "td"
        self.add_token(f"{tag}_open", tag, 1)

    def depart_entry(self, node):
        tag = "th" if self.parent_tokens.get("thead") else "td"
        # Markdown cells can not include newlines
        # TODO improve or upstream this "fix"
        # maybe replace with html_inline <br> tokens (text will be escaped)
        if self._inline is not None:
            for child in self._inline.children:
                child.content = child.content.replace("\n", " ")
        self.add_token(f"{tag}_close", tag, -1)

    # TODO check if handling of id/subId required for footnotes

    def visit_footnote(self, node, refname=None):
        """Open a footnote definition.

        :param refname: the footnote label; defaults to the first id of the node
        """
        refname = refname or node["ids"][0]  # assume there is only one id
        self.add_token("footnote_block_open", "", 1)
        self.add_token("footnote_open", "", 1, meta={"label": refname, "id": 0})

    def depart_footnote(self, node):
        self.add_token("footnote_close", "", -1)
        self.add_token("footnote_block_close", "", -1)

    def visit_citation(self, node):
        """Open a citation; treated as a footnote with a prefixed label."""
        # ``names`` is empty when the citation name is a duplicate; docutils
        # then records the name under ``dupnames``, so fall back to that
        # (and finally to the ids) instead of failing on an empty list
        names = node["names"] or node.get("dupnames") or node["ids"]
        refname = f"{self.cite_prefix}{names[0]}"  # assume there is only one name
        return self.visit_footnote(node, refname=refname)

    def depart_citation(self, node):
        # treated same as for depart_footnote
        return self.depart_footnote(node)

    def visit_footnote_reference(self, node):
        """Render a footnote reference, labelled by its ``refname`` or ``refid``."""
        if "refname" in node:
            refname = node["refname"]
        elif "refid" in node:
            refname = node["refid"]
        else:
            self._warn_or_raise(
                f"unknown footnote reference type: {node.rawsource}", node.line
            )
            # there is no label to reference, so no token can be created
            raise nodes.SkipNode
        self.add_token(
            "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0}
        )
        raise nodes.SkipNode

    def visit_citation_reference(self, node):
        refname = node["refname"] if "refname" in node else node["refid"]
        # for compatibility we treat citations the same as footnotes, with a prefix
        refname = f"{self.cite_prefix}{refname}"
        self.add_token(
            "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0}
        )
        # the node also contains the refname as text, but we don't need that
        raise nodes.SkipNode

    def visit_definition_list(self, node):
        self.add_token("dl_open", "dl", 1)

    def depart_definition_list(self, node):
        self.add_token("dl_close", "dl", -1)

    def visit_definition_list_item(self, node):
        pass

    def depart_definition_list_item(self, node):
        pass

    def visit_term(self, node):
        self.add_token("dt_open", "dt", 1)

    def depart_term(self, node):
        self.add_token("dt_close", "dt", -1)

    def visit_classifier(self, node):
        # classifiers can follow a term, e.g. `term : classifier`
        # TODO record term classifiers?
        raise nodes.SkipNode

    def visit_definition(self, node):
        self.add_token("dd_open", "dd", 1)

    def depart_definition(self, node):
        self.add_token("dd_close", "dd", -1)

    def visit_FrontMatterNode(self, node):
        """Record each two-child field as front-matter, keyed by the field name."""
        for field in node:
            if len(field) != 2:
                continue
            key = field[0][0].astext()
            tokens = self.nested_parse(field[1].children)
            self._front_matter_tokens.append(([key], tokens))
        raise nodes.SkipNode

    def visit_field_list(self, node):
        # field lists are passed through as raw RST
        if node.rawsource:
            text = "\n" + node.rawsource.strip() + "\n"
            self.add_token("fence", "code", 0, content=text, info="{eval-rst}")
        raise nodes.SkipNode

    # MyST Markdown specific

    def visit_RoleNode(self, node):
        """Render a role as dollar math, a MyST role, or an inline literal."""
        # TODO nested parse of specific roles
        role = node["role"] or self.default_role
        if not role:
            # no role name available: convert to a literal
            self.add_token("code_inline", "code", 0, markup="`", content=node["text"])
        elif self.dollar_math and role == "math":
            self.add_token(
                "math_inline", "math", 0, markup="$", content=node["text"].strip()
            )
        else:
            self.add_token(
                "myst_role", "", 0, meta={"name": role}, content=node["text"]
            )
        raise nodes.SkipNode

    def visit_comment(self, node):
        # TODO alternately use <!-- -->
        self.add_token(
            "myst_line_comment",
            "hr",
            0,
            attrs={"class": "myst-line-comment"},
            content=indent(node.astext(), " "),
        )
        raise nodes.SkipNode

    def visit_substitution_reference(self, node):
        self.add_token("substitution_inline", "span", 0, content=node["refname"])
        # the node also contains the refname as text, but we don't need that
        raise nodes.SkipNode

    def visit_substitution_definition(self, node):
        """Record the definition as front-matter, under the ``substitutions`` key."""
        if "names" not in node or not node["names"]:
            raise nodes.SkipNode
        key = node["names"][0]
        # substitution definition should always be a single directive node
        tokens = self.nested_parse(node.children)
        self._front_matter_tokens.append((["substitutions", key], tokens))
        raise nodes.SkipNode

    def visit_EvalRstNode(self, node):
        text = self._with_trailing_newline(node.astext())
        self.add_token("fence", "code", 0, content=text, info="{eval-rst}")
        raise nodes.SkipNode

    def visit_DirectiveNode(self, node):
        """Render a directive.

        Option-less code blocks become standard Markdown fences and, when
        ``dollar_math`` is set, math blocks (with at most a ``label`` option)
        become dollar-math blocks. Any other directive is opened as a
        ``directive_open`` token, with its children rendered subsequently.
        """
        markup = "`"
        if self.colon_fences and node["conversion"] in (
            "parse_content",
            "parse_content_titles",
            "parse_all",
        ):
            markup = ":"
        if (
            (
                node["name"] == "code-block"
                or node["module"] == "sphinx.directives.patches.Code"
            )
            and not node["options_list"]
            and len(node.children) == 2
        ):
            # special case, where we can use standard Markdown fences
            argument, content = node.children
            self.add_token(
                "fence",
                "code",
                0,
                content=content.astext() + "\n",
                markup="```",
                info=argument.astext().strip(),
            )
            raise nodes.SkipNode
        elif (
            (
                node["name"] == "math"
                or node["module"] == "docutils.parsers.rst.directives.body.MathBlock"
            )
            and self.dollar_math
            and (
                not node["options_list"]
                or (
                    len(node["options_list"]) == 1
                    and node["options_list"][0][0] == "label"
                )
            )
            and len(node.children) == 2
        ):
            # special case where we use dollarmath
            argument, content = node.children
            text = ""
            if argument.astext().strip():
                text += "\n" + argument.astext().strip() + "\n"
            if content.astext().strip():
                text += "\n" + content.astext().strip() + "\n"
            if node["options_list"]:
                label = node["options_list"][0][1]
                name = self._labelled_math_token_type()
                self.add_token(name, "math", 0, markup="$$", content=text, info=label)
            else:
                self.add_token("math_block", "math", 0, markup="$$", content=text)
            raise nodes.SkipNode
        else:
            self.add_token(
                "directive_open",
                "",
                1,
                meta={
                    key: node[key]
                    for key in ["name", "module", "conversion", "options_list"]
                },
                markup=markup,
            )

    def depart_DirectiveNode(self, node):
        self.add_token("directive_close", "", -1)

    def visit_ArgumentNode(self, node):
        # TODO might be a better construct to have this as children of inline
        self.add_token("directive_arg_open", "", 1)

    def depart_ArgumentNode(self, node):
        self.add_token("directive_arg_close", "", -1)

    def visit_ContentNode(self, node):
        self.add_token("directive_content_open", "", 1)

    def depart_ContentNode(self, node):
        self.add_token("directive_content_close", "", -1)
# TODO support the remaining constructs listed in
# https://docutils.sourceforge.io/docs/user/rst/quickref.html
# e.g. line blocks and option lists