# Source code for rst_to_myst.parser

from collections.abc import Iterable
from functools import lru_cache
from io import StringIO
from typing import Any, Optional

from docutils import nodes
from docutils.frontend import OptionParser
from docutils.parsers.rst import Parser
from docutils.transforms import Transform
from docutils.transforms.references import (
    AnonymousHyperlinks,
    Footnotes,
    PropagateTargets,
)
from docutils.utils import new_document, roman
import yaml

try:
    from importlib.resources import files
except ImportError:
    from importlib_resources import files

from . import data as package_data
from .inliner import InlinerMyst
from .namespace import ApplicationNamespace, compile_namespace
from .nodes import FrontMatterNode
from .states import get_state_classes


class LosslessRSTParser(Parser):
    """RST parser variant that keeps the original source text recoverable.

    The main difference from the stock parser is that roles and directives
    are not executed during parsing.
    """

    def __init__(self):
        # The base ``Parser.__init__`` is not called here; the attributes
        # below are assigned directly instead.
        state_classes = get_state_classes()
        for cls in state_classes:
            # Drop any nested state machines cached by a previous parse;
            # every class receives its own fresh list.
            cls.nested_sm_cache = []
        self.state_classes = state_classes
        self.initial_state = "Body"
        self.inliner = InlinerMyst()


class IndirectHyperlinks(Transform):
    """Resolve indirect hyperlink targets, leaving the references untouched."""

    def apply(self):
        """Resolve every indirect target of the document not yet resolved."""
        for target in self.document.indirect_targets:
            if target.resolved:
                continue
            # TODO implement this resolve?
            # NOTE(review): ``resolve_indirect_target`` is not defined on this
            # class in the code shown here - confirm where it should come from.
            self.resolve_indirect_target(target)
            # The references pointing at the target are deliberately not
            # resolved, since doing so would replace their "refname":
            # self.resolve_indirect_references(target)


class StripFootnoteLabel(Transform):
    """Drop the leading label node of footnotes and citations.

    Footnotes and citations can start with a label node, which is not needed
    downstream.
    """

    def apply(self):
        """Remove the first child of each footnote/citation if it is a label."""

        def is_note(candidate):
            return isinstance(candidate, (nodes.footnote, nodes.citation))

        for note in self.document.traverse(is_note):
            children = note.children
            if not children:
                continue
            if not isinstance(children[0], nodes.label):
                continue
            note.pop(0)


def _enum_arabic(i):
    """Return the ordinal unchanged."""
    return i


def _enum_lower_roman(i):
    """Return the ordinal as a lower-case roman numeral."""
    return roman.toRoman(i).lower()


def _enum_upper_roman(i):
    """Return the ordinal as an upper-case roman numeral."""
    return roman.toRoman(i).upper()


def _enum_lower_alpha(i):
    """Return the character ``i - 1`` code points after ``"a"``."""
    return chr(ord("a") + i - 1)


def _enum_upper_alpha(i):
    """Return the upper-cased form of the ``loweralpha`` character."""
    return chr(ord("a") + i - 1).upper()


# Converters from a 1-based ordinal to its rendered form, keyed by the name of
# the enumeration style.
ENUM_CONVERTERS = dict(
    arabic=_enum_arabic,
    lowerroman=_enum_lower_roman,
    upperroman=_enum_upper_roman,
    loweralpha=_enum_lower_alpha,
    upperalpha=_enum_upper_alpha,
)


class ResolveListItems(Transform):
    """Copy list-level marker information onto each child list item.

    Items of a bullet list receive ``style="bullet"`` and a ``prefix`` made of
    the list's bullet character followed by a space.  Items of an enumerated
    list receive ``style="enumerated"`` and a ``prefix`` of the form
    ``"<number>. "``, counting up from the list's ``start`` attribute, or from
    1 when the list has none.

    Whether a list is loose or tight is not computed by this transform.  For
    reference, the rule is::

        A list is loose if any of its list items are separated by blank lines,
        or if any of its list items directly contain two block-level elements
        with a blank line between them. Otherwise a list is tight.
    """

    @staticmethod
    def _list_items(list_node):
        """Return the direct children of ``list_node`` that are list items."""
        return [
            child
            for child in list_node.children
            if isinstance(child, nodes.list_item)
        ]

    def apply(self):
        """Annotate the items of every bullet and enumerated list."""
        for bullet_list in self.document.traverse(nodes.bullet_list):
            marker = bullet_list["bullet"] + " "
            for item in self._list_items(bullet_list):
                item["style"] = "bullet"
                item["prefix"] = marker

        for enum_list in self.document.traverse(nodes.enumerated_list):
            number = enum_list["start"] if "start" in enum_list else 1
            # TODO markdown-it only supports numbers, so the list's own
            # "prefix", "suffix" and "enumtype" attributes are not used here.
            for item in self._list_items(enum_list):
                item["style"] = "enumerated"
                item["prefix"] = f"{number}. "
                number += 1


class FrontMatter(Transform):
    """Turn a leading field list into a `FrontMatterNode`.

    Similar to ``docutils.transforms.frontmatter.DocInfo``.
    """

    def apply(self):
        """Replace the initial field list, if any, with a `FrontMatterNode`.

        Nothing happens when the ``front_matter`` setting is falsy.  The
        candidate is the first child of the document that is not a
        ``PreBibliographic`` node; if that child is a section, the same lookup
        is repeated once inside the section.
        """
        if not self.document.settings.front_matter:
            return

        container = self.document
        position = container.first_child_not_matching_class(
            nodes.PreBibliographic
        )
        if position is None:
            return
        first = container[position]

        if isinstance(first, nodes.section):
            # Descend one level: look for the field list inside the section.
            container = first
            position = container.first_child_not_matching_class(
                nodes.PreBibliographic
            )
            if position is None:
                return
            first = container[position]

        if not isinstance(first, nodes.field_list):
            return
        first.replace_self(FrontMatterNode("", *first.children))


@lru_cache
def _load_directive_data() -> dict[str, Any]:
    """Load and parse the packaged ``directives.yml`` file.

    The result is memoised by ``lru_cache``, so every call returns the same
    object.
    """
    resource = files(package_data).joinpath("directives.yml")
    raw_yaml = resource.read_text("utf8")
    return yaml.safe_load(raw_yaml)


def to_docutils_ast(
    text: str,
    uri: str = "source",
    report_level: int = 2,
    halt_level: int = 4,
    warning_stream: Optional[StringIO] = None,
    language_code: str = "en",
    use_sphinx: bool = True,
    extensions: Iterable[str] = (),
    default_domain: str = "py",
    conversions: Optional[dict] = None,
    front_matter: bool = True,
    namespace: Optional[ApplicationNamespace] = None,
) -> tuple[nodes.document, StringIO]:
    """Convert a string of text to a docutils AST.

    :param text: The text to convert.
    :param uri: The URI of the document.
    :param report_level: The report level for docutils (2 = warning).
    :param halt_level: The halt level for docutils (4 = severe); the level at
        or above which `SystemMessage` exceptions will be raised, halting
        execution.
    :param warning_stream: A stream to write warnings to; a new `StringIO` is
        created when omitted.
    :param language_code: The language code for docutils.
    :param use_sphinx: Whether to use Sphinx roles and directives.
    :param extensions: A list of Sphinx extensions to use.
    :param default_domain: The default Sphinx domain.
    :param conversions: A dictionary of conversion functions; its entries
        override the packaged directive data with the same keys.
    :param front_matter: Whether to treat initial field list as front matter.
    :param namespace: A pre-computed docutils namespace to use; when omitted
        one is compiled from ``language_code``, ``use_sphinx``, ``extensions``
        and ``default_domain``.
    :returns: The parsed and transformed document, and the warning stream
        that was used.
    """
    settings = OptionParser(components=(LosslessRSTParser,)).get_default_values()
    if warning_stream is None:
        warning_stream = StringIO()
    settings.warning_stream = warning_stream
    settings.report_level = report_level
    settings.halt_level = halt_level
    settings.language_code = language_code

    document = new_document(uri, settings=settings)

    # compile lookup for directives/roles, unless the caller supplied one
    if namespace is None:
        namespace = compile_namespace(
            language_code=language_code,
            use_sphinx=use_sphinx,
            extensions=extensions,
            default_domain=default_domain,
        )
    document.settings.namespace = namespace

    # Conversion lookup for directives.  ``_load_directive_data`` is memoised,
    # so always build a fresh mapping here: storing the cached dict itself on
    # the settings would let any later mutation leak into subsequent calls.
    document.settings.directive_data = {
        **_load_directive_data(),
        **(conversions or {}),
    }

    # whether to treat initial field list as front matter
    document.settings.front_matter = front_matter

    parser = LosslessRSTParser()
    parser.parse(text, document)

    # Transforms are applied directly, in this order.  The numbers are the
    # priorities of the corresponding docutils transforms.
    transforms = (
        PropagateTargets,  # Propagate empty internal targets to the next element. (260)
        FrontMatter,  # convert initial field list (DocInfo=340)
        AnonymousHyperlinks,  # Link anonymous references to targets. (440)
        # IndirectHyperlinks,  # "refuri" migrated back to all indirect targets (460)
        Footnotes,  # Assign numbers to autonumbered footnotes (620)
        # bespoke transforms
        StripFootnoteLabel,
        ResolveListItems,
    )
    for transform_cls in transforms:
        transform_cls(document).apply()

    return document, warning_stream