# Source code for rst_to_myst.parser

from functools import lru_cache
from io import StringIO
from typing import Any, Dict, Iterable, Optional, Tuple

import yaml
from docutils import nodes
from docutils.frontend import OptionParser
from docutils.parsers.rst import Parser
from docutils.transforms import Transform
from docutils.transforms.references import (
    AnonymousHyperlinks,
    Footnotes,
    PropagateTargets,
)
from docutils.utils import new_document, roman

try:
    from importlib.resources import files
except ImportError:
    from importlib_resources import files

from . import data as package_data
from .inliner import InlinerMyst
from .namespace import ApplicationNamespace, compile_namespace
from .nodes import FrontMatterNode
from .states import get_state_classes


class LosslessRSTParser(Parser):
    """Modified RST Parser, allowing for the retrieval of the original source text.

    Principally, roles and directives are not run.
    """

    def __init__(self):
        self.initial_state = "Body"
        self.state_classes = get_state_classes()
        # Discard any state machines cached by a previous parse, so each
        # parser instance starts from a clean slate.
        for cls in self.state_classes:
            cls.nested_sm_cache = []
        self.inliner = InlinerMyst()


class IndirectHyperlinks(Transform):
    """Resolve indirect hyperlinks."""

    def apply(self):
        # NOTE(review): ``resolve_indirect_target`` is not defined on this class
        # (nor on ``Transform``) — presumably intended to mirror
        # ``docutils.transforms.references.IndirectHyperlinks``; confirm before
        # enabling this transform (it is currently excluded from the pipeline).
        for target in self.document.indirect_targets:
            if not target.resolved:
                self.resolve_indirect_target(target)  # TODO implement this resolve?
            # Do not resolve the actual references, since this replaces the "refname"
            # self.resolve_indirect_references(target)


class StripFootnoteLabel(Transform):
    """Drop the leading ``label`` child from footnotes and citations.

    Footnotes and citations can start with a label node, which we do not need.
    """

    def apply(self):
        def _is_note_like(node):
            return isinstance(node, (nodes.footnote, nodes.citation))

        for node in self.document.traverse(_is_note_like):
            first = node.children[0] if node.children else None
            if isinstance(first, nodes.label):
                node.pop(0)


# Map docutils enumerated-list ``enumtype`` values to converters taking a
# 1-based counter and returning the corresponding label *string*.
# NOTE: not referenced by any active code in this module (only by the
# commented-out prefix logic in ``ResolveListItems``), kept for future use.
ENUM_CONVERTERS = {
    # was ``lambda i: i`` (returned int) — return str for consistency with
    # the other converters, which all produce strings
    "arabic": (lambda i: str(i)),
    "lowerroman": (lambda i: roman.toRoman(i).lower()),
    "upperroman": (lambda i: roman.toRoman(i).upper()),
    "loweralpha": (lambda i: chr(ord("a") + i - 1)),
    "upperalpha": (lambda i: chr(ord("A") + i - 1)),
}


class ResolveListItems(Transform):
    """For bullet/enumerated lists, propagate attributes to their child list items.

    Also decide if they are loose/tight::

        A list is loose if any of its list items are separated by blank lines,
        or if any of its list items directly contain two block-level elements
        with a blank line between them. Otherwise a list is tight.
    """

    def apply(self):
        # Bullet items: every item inherits the list's bullet character.
        for blist in self.document.traverse(nodes.bullet_list):
            marker = blist["bullet"] + " "
            for item in blist.children:
                if not isinstance(item, nodes.list_item):
                    continue
                item["style"] = "bullet"
                item["prefix"] = marker

        # Enumerated items: number sequentially, honouring an explicit start.
        for elist in self.document.traverse(nodes.enumerated_list):
            counter = elist["start"] if "start" in elist else 1
            # TODO markdown-it only supports numbers
            # prefix = elist["prefix"]
            # suffix = elist["suffix"]
            # convert = ENUM_CONVERTERS[elist["enumtype"]]
            for item in elist.children:
                if not isinstance(item, nodes.list_item):
                    continue
                item["style"] = "enumerated"
                item["prefix"] = f"{counter}. "
                counter += 1


class FrontMatter(Transform):
    """Extract an initial field list into a `FrontMatterNode`.

    Similar to ``docutils.transforms.frontmatter.DocInfo``.
    """

    def apply(self):
        if not self.document.settings.front_matter:
            return
        # Find the first non-prebibliographic node, looking one level into an
        # initial section if necessary.
        idx = self.document.first_child_not_matching_class(nodes.PreBibliographic)
        if idx is None:
            return
        node = self.document[idx]
        if isinstance(node, nodes.section):
            idx = node.first_child_not_matching_class(nodes.PreBibliographic)
            if idx is None:
                return
            node = node[idx]
        if not isinstance(node, nodes.field_list):
            return
        node.replace_self(FrontMatterNode("", *node.children))


@lru_cache()
def _load_directive_data() -> Dict[str, Any]:
    """Load (and cache) the directive conversion data bundled with the package."""
    raw = files(package_data).joinpath("directives.yml").read_text("utf8")
    return yaml.safe_load(raw)


def to_docutils_ast(
    text: str,
    uri: str = "source",
    report_level: int = 2,
    halt_level: int = 4,
    warning_stream: Optional[StringIO] = None,
    language_code: str = "en",
    use_sphinx: bool = True,
    extensions: Iterable[str] = (),
    default_domain: str = "py",
    conversions: Optional[dict] = None,
    front_matter: bool = True,
    namespace: Optional[ApplicationNamespace] = None,
) -> Tuple[nodes.document, StringIO]:
    """Convert a string of text to a docutils AST.

    :param text: The text to convert.
    :param uri: The URI of the document.
    :param report_level: The report level for docutils.
    :param halt_level: The halt level for docutils.
    :param warning_stream: A stream to write warnings to.
    :param language_code: The language code for docutils.
    :param use_sphinx: Whether to use Sphinx roles and directives.
    :param extensions: A list of Sphinx extensions to use.
    :param default_domain: The default Sphinx domain.
    :param conversions: A dictionary of conversion functions.
    :param front_matter: Whether to treat initial field list as front matter.
    :param namespace: A pre-computed docutils namespace to use.
    """
    settings = OptionParser(components=(LosslessRSTParser,)).get_default_values()
    warning_stream = StringIO() if warning_stream is None else warning_stream
    settings.warning_stream = warning_stream
    settings.report_level = report_level  # 2=warning
    # The level at or above which `SystemMessage` exceptions
    # will be raised, halting execution.
    settings.halt_level = halt_level  # 4=severe
    settings.language_code = language_code
    document = new_document(uri, settings=settings)

    # compile lookup for directives/roles
    namespace = (
        compile_namespace(
            language_code=language_code,
            use_sphinx=use_sphinx,
            extensions=extensions,
            default_domain=default_domain,
        )
        if namespace is None
        else namespace
    )
    document.settings.namespace = namespace

    # get conversion lookup for directives
    directive_data = _load_directive_data()
    if conversions:
        directive_data = {**directive_data, **conversions}
    document.settings.directive_data = directive_data

    # whether to treat initial field list as front matter
    document.settings.front_matter = front_matter

    parser = LosslessRSTParser()
    parser.parse(text, document)

    # these three transforms are required for converting targets correctly
    for transform_cls in [
        PropagateTargets,  # Propagate empty internal targets to the next element. (260)
        FrontMatter,  # convert initial field list (DocInfo=340)
        AnonymousHyperlinks,  # Link anonymous references to targets. (440)
        # IndirectHyperlinks,  # "refuri" migrated back to all indirect targets (460)
        Footnotes,  # Assign numbers to autonumbered footnotes (620)
        # bespoke transforms
        StripFootnoteLabel,
        ResolveListItems,
    ]:
        transform = transform_cls(document)
        transform.apply()

    return document, warning_stream