Source code for rst_to_myst.markdownit

"""Convert to markdown-it tokens, which can then be rendered by mdformat."""
from io import StringIO
from textwrap import indent
from typing import IO, Any, Dict, List, NamedTuple, Optional, Tuple

from docutils import nodes
from markdown_it.token import Token
from mdit_py_plugins import __version__ as mdit_plug_version

class RenderOutput(NamedTuple):
    """Pair returned by ``MarkdownItRenderer.to_tokens``."""

    # copy of the rendered markdown-it token stream
    tokens: List[Token]
    # environment collected while rendering ("references", "duplicate_refs")
    env: Dict[str, Any]
class MarkdownItRenderer(nodes.GenericNodeVisitor):
    """Render docutils AST to Markdown-It token stream."""

    def __init__(
        self,
        document: nodes.document,
        *,
        warning_stream: Optional[IO] = None,
        raise_on_warning: bool = False,
        cite_prefix: str = "cite_",
        default_role: Optional[str] = None,
        colon_fences: bool = True,
        dollar_math: bool = True,
    ):
        """Store the rendering options and initialise an empty render state.

        :param document: the docutils document to render
        :param warning_stream: stream that render warnings are written to;
            an in-memory ``StringIO`` is used when none is given
        :param raise_on_warning: raise ``NotImplementedError`` in addition to
            writing a warning, for unsupported content
        :param cite_prefix: prefix added to citation labels
        :param default_role: role name used for roles without a name;
            if no default role, these are converted to literals
        :param colon_fences: use ``:`` rather than a backtick as fence markup
            for directives whose content is parsed
        :param dollar_math: render the math role/directive as dollar math tokens
        """
        # rendering options
        self.raise_on_warning = raise_on_warning
        self.cite_prefix = cite_prefix
        self.default_role = default_role
        self.colon_fences = colon_fences
        self.dollar_math = dollar_math
        # rendering inputs/outputs
        self._document = document
        self._warning_stream = warning_stream or StringIO()
        self.reset_state()

    def reset_state(self):
        """Discard all tokens and state recorded by a previous render."""
        # output token stream and environment
        self._tokens: List[Token] = []
        self._env = {"references": {}, "duplicate_refs": []}
        # the inline token currently collecting children (if any)
        self._inline: Optional[Token] = None
        # number of currently open tokens, keyed by type (without "_open")
        self.parent_tokens: Dict[str, int] = {}
        # tokens destined for the front-matter: [(key path, tokens), ...]
        self._front_matter_tokens: List[Tuple[List[str], List[Token]]] = []
        # whether paragraphs of the current list item should be hidden
        self._tight_list = True

    @property
    def document(self) -> nodes.document:
        """The docutils document being rendered."""
        return self._document

    def warning(self, message: str, line: Optional[int]):
        """Write a render warning to the warning stream.

        :param message: the warning text
        :param line: source line to report, or ``None`` if unknown
        """
        prefix = "RENDER WARNING:" if line is None else f"RENDER WARNING:{line}:"
        self._warning_stream.write(f"{prefix} {message}\n")
[docs] def to_tokens(self) -> RenderOutput: """Reset tokens and convert full document.""" self.reset_state() self._document.walkabout(self) # add front-matter that should be nested parsed if self._front_matter_tokens: fm_tokens = [] fm_tokens.append(Token("front_matter_tokens_open", "", 1)) for key_path, tokens in self._front_matter_tokens: fm_tokens.append( Token("front_matter_key_open", "", 1, meta={"key_path": key_path}) ) fm_tokens.extend(tokens) fm_tokens.append(Token("front_matter_key_close", "", -1)) fm_tokens.append(Token("front_matter_tokens_close", "", -1)) self._tokens = fm_tokens + self._tokens return RenderOutput(self._tokens[:], self._env)
def nested_parse(self, nodes: List[nodes.Element]) -> List[Token]: new_inst = MarkdownItRenderer( document=self._document, warning_stream=self._warning_stream, cite_prefix=self.cite_prefix, default_role=self.default_role, colon_fences=self.colon_fences, dollar_math=self.dollar_math, ) for node in nodes: node.walkabout(new_inst) return new_inst._tokens
[docs] def add_token( self, ttype: str, tag: str, nesting: int, *, content: str = "", **kwargs: Any ) -> Token: """A markdown-it token to the stream, handling inline tokens and children.""" token = Token(ttype, tag, nesting, content=content, **kwargs) # record entries and exits if ttype.endswith("_open"): self.parent_tokens.setdefault(ttype[:-5], 0) self.parent_tokens[ttype[:-5]] += 1 if ttype.endswith("_close"): self.parent_tokens.setdefault(ttype[:-6], 0) self.parent_tokens[ttype[:-6]] -= 1 if self.parent_tokens[ttype[:-6]] <= 0: self.parent_tokens.pop(ttype[:-6]) # decide whether we should be adding as an inline child if ttype in {"paragraph_open", "heading_open", "th_open", "td_open", "dt_open"}: self._tokens.append(token) self._inline = Token("inline", "", 0, children=[]) self._tokens.append(self._inline) elif ttype in { "paragraph_close", "heading_close", "th_close", "td_close", "dt_close", }: self._tokens.append(token) self._inline = None elif self._inline: self._inline.children.append(token) else: self._tokens.append(token) return token
[docs] def default_visit(self, node): self.unknown_visit(node)
[docs] def default_departure(self, node): self.unknown_departure(node)
[docs] def unknown_visit(self, node): message = f"no visit method for: {node.__class__}" self.warning(message, node.line) if self.raise_on_warning: raise NotImplementedError(message)
[docs] def unknown_departure(self, node): message = f"no depart method for: {node.__class__}" self.warning(message, node.line) if self.raise_on_warning: raise NotImplementedError(message)
# Skipped components def visit_document(self, node): pass def depart_document(self, node): pass def visit_Element(self, node): pass def depart_Element(self, node): pass def visit_system_message(self, node): # ignore raise nodes.SkipNode def visit_problematic(self, node): # ignore raise nodes.SkipNode # CommonMark components def visit_section(self, node): pass # handled by title def depart_section(self, node): pass def visit_title(self, node): token = self.add_token("heading_open", f"h{node['level']}", 1) token.markup = "#" * node["level"] def depart_title(self, node): token = self.add_token("heading_close", f"h{node['level']}", -1) token.markup = "#" * node["level"] def visit_paragraph(self, node): if self.parent_tokens.get("th") or self.parent_tokens.get("td"): # table cells are treated as paragraphs already return token = self.add_token("paragraph_open", "p", 1) if self.parent_tokens.get("list_item") and self._tight_list: # paragraphs in tight lists are hidden token.hidden = True def depart_paragraph(self, node): if self.parent_tokens.get("th") or self.parent_tokens.get("td"): # table cells are treated as paragraphs already return self.add_token("paragraph_close", "p", -1) def visit_Text(self, node): self.add_token("text", "", 0, content=node.astext()) raise nodes.SkipNode def visit_UnprocessedText(self, node): self.add_token("unprocessed", "", 0, content=node.astext()) raise nodes.SkipNode def visit_emphasis(self, node): self.add_token("em_open", "em", 1, markup="*") def depart_emphasis(self, node): self.add_token("em_close", "em", -1, markup="*") def visit_strong(self, node): self.add_token("strong_open", "strong", 1, markup="**") def depart_strong(self, node): self.add_token("strong_close", "strong", -1, markup="**") def visit_transition(self, node): self.add_token("hr", "hr", 0, markup="---") raise nodes.SkipNode def visit_bullet_list(self, node): self.add_token("bullet_list_open", "ul", 1, markup=node["bullet"]) def depart_bullet_list(self, node): 
self.add_token("bullet_list_close", "ul", -1, markup=node["bullet"]) def visit_enumerated_list(self, node): token = self.add_token("ordered_list_open", "ol", 1, markup=".") if "start" in node: token.attrs["start"] = node["start"] def depart_enumerated_list(self, node): self.add_token("ordered_list_close", "ol", -1, markup=".") def visit_list_item(self, node): token = self.add_token("list_item_open", "li", 1) if "style" in node: if node["style"] == "bullet": token.markup = node["prefix"].strip() elif node["style"] == "enumerated": token.markup = "." # a list is loose if any of its list items directly contain # two block-level elements, otherwise tight. In this case paragraphs are hidden self._tight_list = len(node.children) < 2 def depart_list_item(self, node): self.add_token("list_item_close", "li", -1) def visit_literal(self, node): self.add_token("code_inline", "code", 0, markup="`", content=node.astext()) raise nodes.SkipNode def visit_literal_block(self, node): text = node.astext() if not text.endswith("\n"): text += "\n" self.add_token("code_block", "code", 0, content=text) raise nodes.SkipNode def visit_doctest_block(self, node): # # self.warning("Treating doctest block as pycon literal block", node.line) text = node.astext() if not text.endswith("\n"): text += "\n" self.add_token( "fence", "code", 0, content=text, markup="```", info="pycon", ) raise nodes.SkipNode def visit_block_quote(self, node): self.add_token("blockquote_open", "blockquote", 1, markup=">") def depart_block_quote(self, node): self.add_token("blockquote_close", "blockquote", -1, markup=">") def visit_attribution(self, node): # Markdown block quotes do not have an attribution syntax, # so we add a best approximation token = self.add_token("html_inline", "", 0) token.content = f'<p class="attribution">-{node.astext()}</p>' raise nodes.SkipNode def visit_reference(self, node): # we assume all reference names are plain text text = node.astext() if "standalone_uri" in node: # autolink token = 
self.add_token("link_open", "a", 1, markup="autolink", info="auto") token.attrs["href"] = node["refuri"] self.add_token("text", "", 0, content=node["refuri"]) self.add_token("link_close", "a", -1, markup="autolink", info="auto") elif "refname" in node: # reference a link definition `[refname]: url`, or a target `(refname)=` # TODO ensure mdformat does not wrap in <> token = self.add_token( "link_open", "a", 1, attrs={"href": node["refname"]}, # TODO should only add label if target found? meta={"label": node["refname"]}, ) self.add_token("text", "", 0, content=text) self.add_token("link_close", "a", -1) elif "refuri" in node: # external link # TODO ensure prefixed with http://? token = self.add_token("link_open", "a", 1, attrs={"href": node["refuri"]}) self.add_token("text", "", 0, content=text) self.add_token("link_close", "a", -1) elif "refid" in node: # anonymous links, pointing to internal targets # TODO ensure mdformat does not wrap in <> token = self.add_token( "link_open", "a", 1, attrs={"href": node["refid"]}, ) self.add_token("text", "", 0, content=text) self.add_token("link_close", "a", -1) else: message = f"unknown reference type: {node.rawsource}" self.warning(message, node.line) if self.raise_on_warning: raise NotImplementedError(message) raise nodes.SkipNode def visit_target(self, node): if "inline" in node and node["inline"]: # TODO inline targets message = f"inline targets not implemented: {node.rawsource}" self.warning(message, node.line) if self.raise_on_warning: raise NotImplementedError(message) self.add_token( "code_inline", "code", 0, markup="`", content=str(node.rawsource) ) raise nodes.SkipNode if "refuri" in node: for name in node["names"]: # TODO warn about name starting ^ (clashes with footnotes) if name not in self._env["references"]: self._env["references"][name] = { "title": "", "href": node["refuri"], "map": [node.line, node.line], } else: self._env["duplicate_refs"].append( { "label": name, "title": "", "href": node["refuri"], "map": 
[node.line, node.line], } ) elif "names" in node: for name in node["names"]: self.add_token( "myst_target", "", 0, attrs={"class": "myst-target"}, content=name ) if "refid" in node: self.add_token( "myst_target", "", 0, attrs={"class": "myst-target"}, content=node["refid"], ) # TODO check for content? raise nodes.SkipNode # Standard CommonMark extensions
[docs] def parse_gfm_table(self, node) -> bool: """Check whether an RST table can be converted to a GFM one. RST tables can have e.g. cells spanning multiple columns/rows, which the GitHub Flavoured Markdown (GFM) table variant does not support. """ # must have one child tgroup if len(node.children) != 1 or not isinstance(node.children[0], nodes.tgroup): return False # tgroup should contain the number of columns tgroup = node.children[0] if "cols" not in tgroup: return False ncolumns = tgroup["cols"] # trgoup should contain children: (colspec)*, thead, tbody if len(tgroup.children) < 2: return False if not isinstance(tgroup.children[-2], nodes.thead): return False if not isinstance(tgroup.children[-1], nodes.tbody): return False thead = tgroup.children[-2] tbody = tgroup.children[-1] # the header can only have one row with the full amount of columns if len(thead.children) != 1 or len(thead.children[0]) != ncolumns: return False # each body row should have the full amount of columns for row in tbody.children: if len(row.children) != ncolumns: return False return True
def visit_table(self, node): if not self.parse_gfm_table(node): text = node.rawsource if not text.endswith("\n"): text += "\n" self.add_token( "fence", "code", 0, content=text, markup="```", info="{eval-rst}" ) raise nodes.SkipNode self.add_token("table_open", "table", 1) def depart_table(self, node): self.add_token("table_close", "table", -1) def visit_tgroup(self, node): pass def depart_tgroup(self, node): pass def visit_colspec(self, node): raise nodes.SkipNode def visit_thead(self, node): self.add_token("thead_open", "thead", 1) def depart_thead(self, node): self.add_token("thead_close", "thead", -1) def visit_tbody(self, node): self.add_token("tbody_open", "tbody", 1) def depart_tbody(self, node): self.add_token("tbody_close", "tbody", -1) def visit_row(self, node): self.add_token("tr_open", "tr", 1) def depart_row(self, node): self.add_token("tr_close", "tr", -1) def visit_entry(self, node): tag = "th" if self.parent_tokens.get("thead") else "td" self.add_token(f"{tag}_open", tag, 1) def depart_entry(self, node): tag = "th" if self.parent_tokens.get("thead") else "td" # Markdown cells can not include newlines # TODO improve or upstream this "fix" # maybe replace with html_inline <br> tokens (text will be escaped) if self._inline: for child in self._inline.children: child.content = child.content.replace("\n", " ") self.add_token(f"{tag}_close", tag, -1) # TODO check if handling of is/subId required for footnotes def visit_footnote(self, node, refname=None): refname = refname or node["ids"][0] # assume there is only one id self.add_token("footnote_block_open", "", 1) self.add_token("footnote_open", "", 1, meta={"label": refname, "id": 0}) def depart_footnote(self, node): self.add_token("footnote_close", "", -1) self.add_token("footnote_block_close", "", -1) def visit_citation(self, node): # treated same as for visit_footnote, but with specific prefix # TODO fails if duplicate refname, since names is empty refname = node["names"][0] # assume there is only one 
name refname = f"{self.cite_prefix}{refname}" return self.visit_footnote(node, refname=refname) def depart_citation(self, node): # treated same as for depart_footnote return self.depart_footnote(node) def visit_footnote_reference(self, node): if "refname" in node: refname = node["refname"] elif "refid" in node: refname = node["refid"] else: message = f"unknown footnote reference type: {node.rawsource}" self.warning(message, node.line) if self.raise_on_warning: raise NotImplementedError(message) self.add_token( "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0} ) raise nodes.SkipNode def visit_citation_reference(self, node): refname = node["refname"] if "refname" in node else node["refid"] # for compatibility we treat citations the same as footnotes, with a prefix refname = f"{self.cite_prefix}{refname}" self.add_token( "footnote_ref", "", 0, meta={"label": refname, "id": 0, "subId": 0} ) # the node also contains the refname as text, but we don't need that raise nodes.SkipNode def visit_definition_list(self, node): self.add_token("dl_open", "dl", 1) def depart_definition_list(self, node): self.add_token("dl_close", "dl", -1) def visit_definition_list_item(self, node): pass def depart_definition_list_item(self, node): pass def visit_term(self, node): self.add_token("dt_open", "dt", 1) def depart_term(self, node): self.add_token("dt_close", "dt", -1) def visit_classifier(self, node): # classifiers can follow a term, e.g. `term : classifier` # TODO record term classifiers? 
raise nodes.SkipNode def visit_definition(self, node): self.add_token("dd_open", "dd", 1) def depart_definition(self, node): self.add_token("dd_close", "dd", -1) def visit_FrontMatterNode(self, node): for field in node: if not len(field) == 2: continue key = field[0][0].astext() tokens = self.nested_parse(field[1].children) self._front_matter_tokens.append(([key], tokens)) raise nodes.SkipNode def visit_field_list(self, node): if node.rawsource: text = "\n" + node.rawsource.strip() + "\n" self.add_token("fence", "code", 0, content=text, info="{eval-rst}") raise nodes.SkipNode # MyST Markdown specific def visit_RoleNode(self, node): # TODO nested parse of specific roles role = node["role"] or self.default_role if role: if self.dollar_math and role == "math": self.add_token( "math_inline", "math", 0, markup="$", content=node["text"].strip() ) else: self.add_token( "myst_role", "", 0, meta={"name": role}, content=node["text"] ) else: self.add_token("code_inline", "code", 0, markup="`", content=node["text"]) raise nodes.SkipNode def visit_comment(self, node): # TODO alternately use <!-- --> self.add_token( "myst_line_comment", "hr", 0, attrs={"class": "myst-line-comment"}, content=indent(node.astext(), " "), ) raise nodes.SkipNode def visit_substitution_reference(self, node): self.add_token("substitution_inline", "span", 0, content=node["refname"]) # the node also contains the refname as text, but we don't need that raise nodes.SkipNode def visit_substitution_definition(self, node): if "names" not in node or not node["names"]: raise nodes.SkipNode key = node["names"][0] # substitution definition should always be a single directive node tokens = self.nested_parse(node.children) self._front_matter_tokens.append((["substitutions", key], tokens)) raise nodes.SkipNode def visit_EvalRstNode(self, node): text = node.astext() if not text.endswith("\n"): text += "\n" self.add_token("fence", "code", 0, content=text, info="{eval-rst}") raise nodes.SkipNode def 
visit_DirectiveNode(self, node): markup = "`" if self.colon_fences and node["conversion"] in ( "parse_content", "parse_content_titles", "parse_all", ): markup = ":" if ( ( node["name"] == "code-block" or node["module"] == "sphinx.directives.patches.Code" ) and not node["options_list"] and len(node.children) == 2 ): # special case, where we can use standard Markdown fences argument, content = node.children self.add_token( "fence", "code", 0, content=content.astext() + "\n", markup="```", info=argument.astext().strip(), ) raise nodes.SkipNode elif ( ( node["name"] == "math" or node["module"] == "docutils.parsers.rst.directives.body.MathBlock" ) and self.dollar_math and ( not node["options_list"] or ( len(node["options_list"]) == 1 and node["options_list"][0][0] == "label" ) ) and len(node.children) == 2 ): # special case where we use dollarmath argument, content = node.children text = "" if argument.astext().strip(): text += "\n" + argument.astext().strip() + "\n" if content.astext().strip(): text += "\n" + content.astext().strip() + "\n" if node["options_list"]: label = node["options_list"][0][1] major, minor, patch = [int(i) for i in mdit_plug_version.split(".")] name = "math_block_label" if major == 0 and minor < 3: name = "math_block_eqno" self.add_token(name, "math", 0, markup="$$", content=text, info=label) else: self.add_token("math_block", "math", 0, markup="$$", content=text) raise nodes.SkipNode else: self.add_token( "directive_open", "", 1, meta={ key: node[key] for key in ["name", "module", "conversion", "options_list"] }, markup=markup, ) def depart_DirectiveNode(self, node): self.add_token("directive_close", "", -1) def visit_ArgumentNode(self, node): # TODO might be a better construct to have this as children of inline self.add_token("directive_arg_open", "", 1) def depart_ArgumentNode(self, node): self.add_token("directive_arg_close", "", -1) def visit_ContentNode(self, node): self.add_token("directive_content_open", "", 1) def 
depart_ContentNode(self, node): self.add_token("directive_content_close", "", -1)
    # TODO: line block, option list