From 86dc085476fb9fab56dbefcd7c229278b8149390 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Thu, 27 Jul 2023 03:35:00 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=91=8C=20Directive=20option=20parsing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + .pre-commit-config.yaml | 3 +- docs/conf.py | 1 + docs/syntax/roles-and-directives.md | 51 +- myst_parser/mdit_to_docutils/base.py | 8 +- myst_parser/mocking.py | 2 +- myst_parser/parsers/directives.py | 150 ++-- myst_parser/parsers/options.py | 658 ++++++++++++++++++ .../fixtures/directive_options.md | 2 +- .../fixtures/directive_parsing.txt | 34 +- .../fixtures/option_parsing.yaml | 169 +++++ .../fixtures/option_parsing_errors.yaml | 42 ++ .../fixtures/reporter_warnings.md | 2 +- tests/test_renderers/test_parse_directives.py | 37 +- 14 files changed, 1078 insertions(+), 83 deletions(-) create mode 100644 myst_parser/parsers/options.py create mode 100644 tests/test_renderers/fixtures/option_parsing.yaml create mode 100644 tests/test_renderers/fixtures/option_parsing_errors.yaml diff --git a/.gitignore b/.gitignore index dc4ce707..48cf54de 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,5 @@ _archive/ .vscode/ .DS_Store + +docs/apidocs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d192d459..393dcf3c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,8 @@ exclude: > \.vscode/settings\.json| tests/test_commonmark/commonmark\.json| .*\.xml| - tests/.*/.*\.md + tests/.*/.*\.md| + tests/.*/.*\.yaml )$ repos: diff --git a/docs/conf.py b/docs/conf.py index 43f9c970..58cceb0e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -85,6 +85,7 @@ nitpick_ignore_regex = [ (r"py:.*", r"docutils\..*"), (r"py:.*", r"pygments\..*"), + (r"py:.*", r"typing\.Literal\[.*"), ] nitpick_ignore = [ ("py:obj", "myst_parser._docs._ConfigBase"), diff --git a/docs/syntax/roles-and-directives.md b/docs/syntax/roles-and-directives.md index 7c0fea86..8e05b590 100644 --- a/docs/syntax/roles-and-directives.md +++ b/docs/syntax/roles-and-directives.md @@ -23,9 +23,8 @@ It is effectively a Markdown code fence with curly brackets around the language, Here is the basic structure: `````{list-table} ---- -header-rows: 1 ---- +:header-rows: 1 + * - MyST - reStructuredText * - ````md @@ -55,16 +54,13 @@ This is my note ``` ::: -#### Parameterizing directives +#### Parameterizing directives (options) -For directives that take parameters as input, there are two ways to parameterize them. -In each case, the options themselves are given as `key: value` pairs. An example of -each is shown below: +Many directives can take key/value pairs, in an optional *option block* at the start of the directive. -**Short-hand options with `:` characters**. If you only need one or two options for your -directive and wish to save lines, you may also specify directive options as a collection -of lines just after the first line of the directive, each preceding with `:`. Then the -leading `:` is removed from each line, and the rest is parsed as YAML. +The option block starts on the first line of the directive body and is defined by a set of lines prefixed with `:`. + +The block then follows a YAML-like mapping syntax, where the key (string) and value (string) are separated by a colon (`:`): :::{myst-example} ```{code-block} python @@ -77,10 +73,28 @@ print(f'my {a}nd line') ``` ::: -**Using YAML frontmatter**. A block of YAML front-matter just after the -first line of the directive will be parsed as options for the directive. This needs to be -surrounded by `---` lines. Everything in between will be parsed by YAML and -passed as keyword arguments to your directive. For example: +Comments, starting `#`, are also allowed in between options or at the end of values, and are ignored. +The values can be enclosed in quotes (`"` or `'`) and span multiple lines. +Newline behaviour can be controlled by starting the value with `|` (preserve newlines) or `>` (collapse newlines): + +:::{myst-example} +```{code-block} python +:lineno-start: 10 # this is a comment +: # this is also a comment +:emphasize-lines: "1, 3" +:caption: | +: This is my +: multi-line caption. It is *pretty nifty* ;-) + +a = 2 +print('my 1st line') +print(f'my {a}nd line') +``` +::: + +::::{dropdown} Old-style options block + +Option blocks can also be enclosed by `---`, with no `:` prefix, for example: :::{myst-example} ```{code-block} python @@ -97,6 +111,8 @@ print(f'my {a}nd line') ``` ::: +:::: + (syntax/directives/parsing)= #### How directives parse content @@ -209,9 +225,8 @@ Roles are similar to directives - they allow you to define arbitrary new functio To define an in-line role, use the following form: ````{list-table} ---- -header-rows: 1 ---- +:header-rows: 1 + * - MyST - reStructuredText * - ````md diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py index faf35e44..93714f7d 100644 --- a/myst_parser/mdit_to_docutils/base.py +++ b/myst_parser/mdit_to_docutils/base.py @@ -1741,6 +1741,7 @@ def run_directive( directive_class, first_line, content, + line=position, additional_options=additional_options, ) except MarkupError as error: @@ -1750,12 +1751,11 @@ def run_directive( ) return [error] - if parsed.warnings: - _errors = ",\n".join(parsed.warnings) + for warning_msg, warning_line in parsed.warnings: self.create_warning( - f"{name!r}: {_errors}", + f"{name!r}: {warning_msg}", MystWarnings.DIRECTIVE_PARSING, - line=position, + line=warning_line if warning_line is not None else position, append_to=self.current_node, ) diff --git a/myst_parser/mocking.py b/myst_parser/mocking.py index 0bafa424..a6f64319 100644 --- a/myst_parser/mocking.py +++ b/myst_parser/mocking.py @@ -140,7 +140,7 @@ def parse_directive_block( # TODO should argument_str always be ""? parsed = parse_directive_text(directive, "", "\n".join(content)) if parsed.warnings: - raise MarkupError(",".join(parsed.warnings)) + raise MarkupError(",".join(w for w, _ in parsed.warnings)) return ( parsed.arguments, parsed.options, diff --git a/myst_parser/parsers/directives.py b/myst_parser/parsers/directives.py index 1b031044..e49ed429 100644 --- a/myst_parser/parsers/directives.py +++ b/myst_parser/parsers/directives.py @@ -35,7 +35,6 @@ """ from __future__ import annotations -import datetime import re from dataclasses import dataclass from textwrap import dedent @@ -43,9 +42,13 @@ import yaml from docutils.parsers.rst import Directive +from docutils.parsers.rst.directives import flag from docutils.parsers.rst.directives.misc import TestDirective from docutils.parsers.rst.states import MarkupError +from .options import TokenizeError +from .options import to_items as options_to_items + @dataclass class DirectiveParsingResult: @@ -57,8 +60,10 @@ class DirectiveParsingResult: """The lines of body content""" body_offset: int """The number of lines to the start of the body content.""" - warnings: list[str] - """List of non-fatal errors encountered during parsing.""" + warnings: list[tuple[str, int | None]] + """List of non-fatal errors encountered during parsing. + (message, line_number) + """ def parse_directive_text( @@ -66,6 +71,7 @@ def parse_directive_text( first_line: str, content: str, *, + line: int | None = None, validate_options: bool = True, additional_options: dict[str, str] | None = None, ) -> DirectiveParsingResult: @@ -75,32 +81,54 @@ def parse_directive_text( May be an argument or body text, dependent on the directive :param content: All text after the first line. Can include options. :param validate_options: Whether to validate the values of options + This is actually only here to be used by myst-nb cells, + which converts options directly to JSON metadata, using the full YAML spec. :param additional_options: Additional options to add to the directive, above those parsed from the content (content options take priority). :raises MarkupError: if there is a fatal parsing/validation error """ - parse_errors: list[str] = [] + parse_errors: list[tuple[str, int | None]] + options: dict[str, Any] + body_lines: list[str] + content_offset: int + has_options_block: bool + if directive_class.option_spec: - body, options, option_errors = parse_directive_options( + # only look for an option block if there are possible options + # body, options, option_errors = _parse_directive_options( + result = _parse_directive_options( content, directive_class, - validate=validate_options, + line=line, + as_yaml=not validate_options, additional_options=additional_options, ) - parse_errors.extend(option_errors) - body_lines = body.splitlines() + parse_errors = result.errors + has_options_block = result.has_options + options = result.options + body_lines = result.content.splitlines() content_offset = len(content.splitlines()) - len(body_lines) else: - # If there are no possible options, we do not look for a YAML block + parse_errors = [] + has_options_block = False options = {} body_lines = content.splitlines() content_offset = 0 if not (directive_class.required_arguments or directive_class.optional_arguments): - # If there are no possible arguments, then the body starts on the argument line - if first_line: + # If there are no possible arguments, then the body can start on the argument line + if first_line.strip(): + if has_options_block and any(body_lines): + parse_errors.append( + ( + "Cannot split content across first line and body, " + "when options block is present (move first line to body)", + None, + ) + ) body_lines.insert(0, first_line) + content_offset = 0 arguments = [] else: arguments = parse_directive_arguments(directive_class, first_line) @@ -113,26 +141,35 @@ def parse_directive_text( # check for body content if body_lines and not directive_class.has_content: - parse_errors.append("Has content, but none permitted") + parse_errors.append(("Has content, but none permitted", None)) return DirectiveParsingResult( arguments, options, body_lines, content_offset, parse_errors ) -def parse_directive_options( +@dataclass +class _DirectiveOptions: + content: str + options: dict[str, Any] + errors: list[tuple[str, int | None]] + has_options: bool + + +def _parse_directive_options( content: str, directive_class: type[Directive], - validate: bool = True, + as_yaml: bool, + line: int | None, additional_options: dict[str, str] | None = None, -) -> tuple[str, dict, list[str]]: +) -> _DirectiveOptions: """Parse (and validate) the directive option section. :returns: (content, options, validation_errors) """ - options: dict[str, Any] = {} - validation_errors: list[str] = [] + yaml_block: None | str = None if content.startswith("---"): + line = None if line is None else line + 1 content = "\n".join(content.splitlines()[1:]) match = re.search(r"^-{3,}", content, re.MULTILINE) if match: @@ -142,12 +179,11 @@ def parse_directive_options( yaml_block = content content = "" yaml_block = dedent(yaml_block) - try: - options = yaml.safe_load(yaml_block) or {} - except (yaml.parser.ParserError, yaml.scanner.ScannerError): - validation_errors.append("Invalid options format (bad YAML)") elif content.lstrip().startswith(":"): - content_lines = content.splitlines() # type: list + # TODO deprecate allowing initial whitespace (by lstripping) + # or at least make it that all have the same indent + # also look at mystjs implementation + content_lines = content.splitlines() yaml_lines = [] while content_lines: if not content_lines[0].lstrip().startswith(":"): @@ -155,64 +191,80 @@ def parse_directive_options( yaml_lines.append(content_lines.pop(0).lstrip()[1:]) yaml_block = "\n".join(yaml_lines) content = "\n".join(content_lines) - try: - options = yaml.safe_load(yaml_block) or {} - except (yaml.parser.ParserError, yaml.scanner.ScannerError): - validation_errors.append("Invalid options format (bad YAML)") - if not isinstance(options, dict): - options = {} - validation_errors.append("Invalid options format (not a dict)") + has_options_block = yaml_block is not None - if validation_errors: - return content, options, validation_errors + if as_yaml: + yaml_errors: list[tuple[str, int | None]] = [] + try: + yaml_options = yaml.safe_load(yaml_block or "") or {} + except (yaml.parser.ParserError, yaml.scanner.ScannerError): + yaml_options = {} + yaml_errors.append(("Invalid options format (bad YAML)", line)) + if not isinstance(yaml_options, dict): + yaml_options = {} + yaml_errors.append(("Invalid options format (not a dict)", line)) + return _DirectiveOptions(content, yaml_options, yaml_errors, has_options_block) + + options: dict[str, str] = {} + if yaml_block is not None: + try: + options = dict(options_to_items(yaml_block)) + except TokenizeError as err: + return _DirectiveOptions( + content, + options, + [(f"Invalid options format: {err.problem}", line)], + has_options_block, + ) - if (not validate) or issubclass(directive_class, TestDirective): + if issubclass(directive_class, TestDirective): # technically this directive spec only accepts one option ('option') # but since its for testing only we accept all options - return content, options, validation_errors + return _DirectiveOptions(content, options, [], has_options_block) if additional_options: - # The YAML block takes priority over additional options + # The options block takes priority over additional options options = {**additional_options, **options} # check options against spec options_spec: dict[str, Callable] = directive_class.option_spec unknown_options: list[str] = [] new_options: dict[str, Any] = {} + validation_errors: list[tuple[str, int | None]] = [] + value: str | None for name, value in options.items(): try: convertor = options_spec[name] except KeyError: unknown_options.append(name) continue - if not isinstance(value, str): - if value is True or value is None: - value = None # flag converter requires no argument - elif isinstance(value, (int, float, datetime.date, datetime.datetime)): - # convertor always requires string input - value = str(value) - else: - validation_errors.append( - f'option "{name}" value not string (enclose with ""): {value}' - ) - continue + if not value: + # restructured text parses empty option values as None + value = None + if convertor is flag: + # flag will error if value is not empty, + # but to be more permissive we allow any value + value = None try: converted_value = convertor(value) except (ValueError, TypeError) as error: validation_errors.append( - f"Invalid option value for {name!r}: {value}: {error}" + (f"Invalid option value for {name!r}: {value}: {error}", line) ) else: new_options[name] = converted_value if unknown_options: validation_errors.append( - f"Unknown option keys: {sorted(unknown_options)} " - f"(allowed: {sorted(options_spec)})" + ( + f"Unknown option keys: {sorted(unknown_options)} " + f"(allowed: {sorted(options_spec)})", + line, + ) ) - return content, new_options, validation_errors + return _DirectiveOptions(content, new_options, validation_errors, has_options_block) def parse_directive_arguments( diff --git a/myst_parser/parsers/options.py b/myst_parser/parsers/options.py new file mode 100644 index 00000000..15da7327 --- /dev/null +++ b/myst_parser/parsers/options.py @@ -0,0 +1,658 @@ +"""Parser for directive options. + +This is a highly restricted parser for YAML, +which only allows a subset of YAML to be used for directive options: + +- Only block mappings are allowed at the top level +- Mapping keys are parsed as strings (plain or quoted) +- Mapping values are parsed as strings (plain, quoted, literal `|`, folded `>`) +- `#` Comments are allowed and blank lines + +Adapted from: +https://github.com/yaml/pyyaml/commit/957ae4d495cf8fcb5475c6c2f1bce801096b68a5 + +For a good description of multi-line YAML strings, see: +https://stackoverflow.com/a/21699210/5033292 +""" +from __future__ import annotations + +from dataclasses import dataclass, replace +from typing import ClassVar, Final, Iterable, Literal, cast + + +@dataclass +class Position: + """Position of a character in a stream.""" + + index: int + line: int + column: int + + +class StreamBuffer: + """A buffer for a stream of characters.""" + + def __init__(self, stream: str): + self._buffer = stream + _CHARS_END + self._index = 0 + self._line = 0 + self._column = 0 + + @property + def index(self) -> int: + return self._index + + @property + def line(self) -> int: + return self._line + + @property + def column(self) -> int: + return self._column + + def peek(self, index: int = 0) -> str: + return self._buffer[self._index + index] + + def prefix(self, length: int = 1) -> str: + return self._buffer[self._index : self._index + length] + + def forward(self, length: int = 1) -> None: + while length: + ch = self._buffer[self._index] + self._index += 1 + if ch in "\n\x85\u2028\u2029" or ( + ch == "\r" and self._buffer[self._index] != "\n" + ): + self._line += 1 + self._column = 0 + elif ch != "\uFEFF": + self._column += 1 + length -= 1 + + def get_position(self) -> Position: + return Position(self._index, self._line, self._column) + + +@dataclass +class Token: + """A parsed token from a directive option stream.""" + + id: ClassVar[str] = "" + start: Position + end: Position + + +@dataclass +class KeyToken(Token): + id: ClassVar[str] = "" + value: str + style: Literal[None, "'", '"'] = None + """The original style of the string.""" + + +@dataclass +class ValueToken(Token): + id: ClassVar[str] = "" + value: str + style: Literal[None, "'", '"', "|", ">"] = None + """The original style of the string.""" + + +@dataclass +class ColonToken(Token): + id: ClassVar[str] = "" + + +class TokenizeError(Exception): + def __init__( + self, + problem: str, + problem_mark: Position, + context: str | None = None, + context_mark: Position | None = None, + ): + """A YAML error with optional context. + + :param problem: The problem encountered + :param problem_mark: The position of the problem + :param context: The context of the error, e.g. the parent being scanned + :param context_mark: The position of the context + """ + self.context = context + self.context_mark = context_mark + self.problem = problem + self.problem_mark = problem_mark + + def clone(self, line_offset: int, column_offset: int) -> TokenizeError: + """Clone the error with the given line and column offsets.""" + return TokenizeError( + self.problem, + replace( + self.problem_mark, + line=self.problem_mark.line + line_offset, + column=self.problem_mark.column + column_offset, + ), + self.context, + None + if self.context_mark is None + else replace( + self.context_mark, + line=self.context_mark.line + line_offset, + column=self.context_mark.column + column_offset, + ), + ) + + def __str__(self) -> str: + lines = [] + if self.context is not None: + lines.append(self.context) + if self.context_mark is not None and ( + self.context_mark.line != self.problem_mark.line + or self.context_mark.column != self.problem_mark.column + ): + lines.append( + f"at line {self.context_mark.line}, column {self.context_mark.column}" + ) + if self.problem is not None: + lines.append(self.problem) + if self.problem_mark is not None: + lines.append( + f"at line {self.problem_mark.line}, column {self.problem_mark.column}" + ) + return "\n".join(lines) + + +def to_items( + text: str, line_offset: int = 0, column_offset: int = 0 +) -> Iterable[tuple[str, str]]: + """Parse a directive option block into (key, value) tuples. + + :param text: The directive option text. + :param line_offset: The line offset to apply to the error positions. + :param column_offset: The column offset to apply to the error positions. + + :raises: `TokenizeError` + """ + for key_token, value_token in to_tokens(text, line_offset, column_offset): + yield key_token.value, value_token.value if value_token is not None else "" + + +def to_tokens( + text: str, line_offset: int = 0, column_offset: int = 0 +) -> Iterable[tuple[KeyToken, ValueToken | None]]: + """Parse a directive option, and yield key/value token pairs. + + :param text: The directive option text. + :param line_offset: The line offset to apply to the error positions. + :param column_offset: The column offset to apply to the error positions. + + :raises: `TokenizeError` + """ + key_token: KeyToken | None = None + try: + for token in tokenize(text): + if isinstance(token, KeyToken): + if key_token is not None: + yield key_token, None + key_token = token + elif isinstance(token, ValueToken): + if key_token is None: + raise TokenizeError("expected key before value", token.start) + yield key_token, token + key_token = None + except TokenizeError as exc: + if line_offset or column_offset: + raise exc.clone(line_offset, column_offset) from exc + raise + + +def tokenize(text: str) -> Iterable[Token]: + """Yield tokens from a directive option stream.""" + stream = StreamBuffer(text) + + while True: + _scan_to_next_token(stream) + + if stream.peek() == _CHARS_END: + break + + if not stream.column == 0: + raise TokenizeError( + "expected key to start at column 0", stream.get_position() + ) + + # find key + ch = stream.peek() + if ch in ("'", '"'): + yield _scan_flow_scalar(stream, cast(Literal['"', "'"], ch), is_key=True) + else: + yield _scan_plain_scalar(stream, is_key=True) + + _scan_to_next_token(stream) + + # check next char is colon + space + if stream.peek() != ":": + raise TokenizeError("expected ':' after key", stream.get_position()) + + start_mark = stream.get_position() + stream.forward() + end_mark = stream.get_position() + yield ColonToken(start_mark, end_mark) + + _scan_to_next_token(stream) + + # now find value + ch = stream.peek() + if stream.column == 0: + pass + elif ch in ("|", ">"): + yield _scan_block_scalar(stream, cast(Literal["|", ">"], ch)) + elif ch in ("'", '"'): + yield _scan_flow_scalar(stream, cast(Literal['"', "'"], ch), is_key=False) + else: + yield _scan_plain_scalar(stream, is_key=False) + + +def _scan_to_next_token(stream: StreamBuffer) -> None: + """Skip spaces, line breaks and comments. + + The byte order mark is also stripped, + if it's the first character in the stream. + """ + if stream.index == 0 and stream.peek() == "\uFEFF": + stream.forward() + found = False + while not found: + while stream.peek() == " ": + stream.forward() + if stream.peek() == "#": + while stream.peek() not in _CHARS_END_NEWLINE: + stream.forward() + if not _scan_line_break(stream): + found = True + + +def _scan_plain_scalar( + stream: StreamBuffer, is_key: bool = False +) -> KeyToken | ValueToken: + chunks = [] + start_mark = stream.get_position() + end_mark = start_mark + indent = 0 if is_key else 1 + spaces: list[str] = [] + while True: + length = 0 + if stream.peek() == "#": + break + while True: + ch = stream.peek(length) + if ch in _CHARS_END_SPACE_TAB_NEWLINE or ( + is_key + and ch == ":" + and stream.peek(length + 1) in _CHARS_END_SPACE_TAB_NEWLINE + ): + break + length += 1 + if length == 0: + break + chunks.extend(spaces) + chunks.append(stream.prefix(length)) + stream.forward(length) + end_mark = stream.get_position() + spaces = _scan_plain_spaces(stream, allow_newline=(not is_key)) + if not spaces or stream.peek() == "#" or (stream.column < indent): + break + + return ( + KeyToken(start_mark, end_mark, "".join(chunks)) + if is_key + else ValueToken(start_mark, end_mark, "".join(chunks)) + ) + + +def _scan_plain_spaces(stream: StreamBuffer, allow_newline: bool = True) -> list[str]: + chunks = [] + length = 0 + while stream.peek(length) == " ": + length += 1 + whitespaces = stream.prefix(length) + stream.forward(length) + ch = stream.peek() + if allow_newline and ch in _CHARS_NEWLINE: + line_break = _scan_line_break(stream) + breaks = [] + while stream.peek() in _CHARS_SPACE_NEWLINE: + if stream.peek() == " ": + stream.forward() + else: + breaks.append(_scan_line_break(stream)) + if line_break != "\n": + chunks.append(line_break) + elif not breaks: + chunks.append(" ") + chunks.extend(breaks) + elif whitespaces: + chunks.append(whitespaces) + return chunks + + +def _scan_line_break(stream: StreamBuffer) -> str: + # Transforms: + # '\r\n' : '\n' + # '\r' : '\n' + # '\n' : '\n' + # '\x85' : '\n' + # '\u2028' : '\u2028' + # '\u2029 : '\u2029' + # default : '' + ch = stream.peek() + if ch in "\r\n\x85": + if stream.prefix(2) == "\r\n": + stream.forward(2) + else: + stream.forward() + return "\n" + elif ch in "\u2028\u2029": + stream.forward() + return ch + return "" + + +def _scan_flow_scalar( + stream: StreamBuffer, style: Literal["'", '"'], is_key: bool = False +) -> KeyToken | ValueToken: + double = style == '"' + chunks = [] + start_mark = stream.get_position() + quote = stream.peek() + stream.forward() + chunks.extend(_scan_flow_scalar_non_spaces(stream, double, start_mark)) + while stream.peek() != quote: + chunks.extend(_scan_flow_scalar_spaces(stream, start_mark)) + chunks.extend(_scan_flow_scalar_non_spaces(stream, double, start_mark)) + stream.forward() + end_mark = stream.get_position() + return ( + KeyToken(start_mark, end_mark, "".join(chunks), style) + if is_key + else ValueToken(start_mark, end_mark, "".join(chunks), style) + ) + + +def _scan_flow_scalar_non_spaces( + stream: StreamBuffer, double: bool, start_mark: Position +) -> list[str]: + chunks = [] + while True: + length = 0 + while stream.peek(length) not in "'\"\\" + _CHARS_END_SPACE_TAB_NEWLINE: + length += 1 + if length: + chunks.append(stream.prefix(length)) + stream.forward(length) + ch = stream.peek() + if not double and ch == "'" and stream.peek(1) == "'": + chunks.append("'") + stream.forward(2) + elif (double and ch == "'") or (not double and ch in '"\\'): + chunks.append(ch) + stream.forward() + elif double and ch == "\\": + stream.forward() + ch = stream.peek() + if ch in _ESCAPE_REPLACEMENTS: + chunks.append(_ESCAPE_REPLACEMENTS[ch]) + stream.forward() + elif ch in _ESCAPE_CODES: + length = _ESCAPE_CODES[ch] + stream.forward() + for k in range(length): + if stream.peek(k) not in "0123456789ABCDEFabcdef": + raise TokenizeError( + "expected escape sequence of %d hexadecimal numbers, but found %r" + % (length, stream.peek(k)), + stream.get_position(), + "while scanning a double-quoted scalar", + start_mark, + ) + code = int(stream.prefix(length), 16) + chunks.append(chr(code)) + stream.forward(length) + elif ch in _CHARS_NEWLINE: + _scan_line_break(stream) + chunks.extend(_scan_flow_scalar_breaks(stream)) + else: + raise TokenizeError( + "found unknown escape character %r" % ch, + stream.get_position(), + "while scanning a double-quoted scalar", + start_mark, + ) + else: + return chunks + + +def _scan_flow_scalar_spaces(stream: StreamBuffer, start_mark: Position) -> list[str]: + chunks = [] + length = 0 + while stream.peek(length) in " \t": + length += 1 + whitespaces = stream.prefix(length) + stream.forward(length) + ch = stream.peek() + if ch == _CHARS_END: + raise TokenizeError( + "found unexpected end of stream", + stream.get_position(), + "while scanning a quoted scalar", + start_mark, + ) + elif ch in _CHARS_NEWLINE: + line_break = _scan_line_break(stream) + breaks = _scan_flow_scalar_breaks(stream) + if line_break != "\n": + chunks.append(line_break) + elif not breaks: + chunks.append(" ") + chunks.extend(breaks) + else: + chunks.append(whitespaces) + return chunks + + +def _scan_flow_scalar_breaks(stream: StreamBuffer) -> list[str]: + chunks = [] + while True: + while stream.peek() in " \t": + stream.forward() + if stream.peek() in _CHARS_NEWLINE: + chunks.append(_scan_line_break(stream)) + else: + return chunks + + +def _scan_block_scalar(stream: StreamBuffer, style: Literal["|", ">"]) -> ValueToken: + indent = 0 + folded = style == ">" + chunks = [] + start_mark = stream.get_position() + + # Scan the header. + stream.forward() + chomping, increment = _scan_block_scalar_indicators(stream, start_mark) + _scan_block_scalar_ignored_line(stream, start_mark) + + # Determine the indentation level and go to the first non-empty line. + min_indent = indent + 1 + if min_indent < 1: + min_indent = 1 + if increment is None: + breaks, max_indent, end_mark = _scan_block_scalar_indentation(stream) + indent = max(min_indent, max_indent) + else: + indent = min_indent + increment - 1 + breaks, end_mark = _scan_block_scalar_breaks(stream, indent) + line_break = "" + + # Scan the inner part of the block scalar. + while stream.column == indent and stream.peek() != _CHARS_END: + chunks.extend(breaks) + leading_non_space = stream.peek() not in " \t" + length = 0 + while stream.peek(length) not in _CHARS_END_NEWLINE: + length += 1 + chunks.append(stream.prefix(length)) + stream.forward(length) + line_break = _scan_line_break(stream) + breaks, end_mark = _scan_block_scalar_breaks(stream, indent) + if stream.column == indent and stream.peek() != _CHARS_END: + if ( + folded + and line_break == "\n" + and leading_non_space + and stream.peek() not in " \t" + ): + if not breaks: + chunks.append(" ") + else: + chunks.append(line_break) + else: + break + + # Chomp the tail. + if chomping is not False: + chunks.append(line_break) + if chomping is True: + chunks.extend(breaks) + + # We are done. + return ValueToken(start_mark, end_mark, "".join(chunks), style) + + +def _scan_block_scalar_indicators( + stream: StreamBuffer, start_mark: Position +) -> tuple[bool | None, int | None]: + chomping = None + increment = None + ch = stream.peek() + if ch in "+-": + chomping = ch == "+" + stream.forward() + ch = stream.peek() + if ch in "0123456789": + increment = int(ch) + if increment == 0: + raise TokenizeError( + "expected indentation indicator in the range 1-9, but found 0", + stream.get_position(), + "while scanning a block scalar", + start_mark, + ) + stream.forward() + elif ch in "0123456789": + increment = int(ch) + if increment == 0: + raise TokenizeError( + "expected indentation indicator in the range 1-9, but found 0", + stream.get_position(), + "while scanning a block scalar", + start_mark, + ) + stream.forward() + ch = stream.peek() + if ch in "+-": + chomping = ch == "+" + stream.forward() + ch = stream.peek() + if ch not in _CHARS_END_SPACE_NEWLINE: + raise TokenizeError( + "expected chomping or indentation indicators, but found %r" % ch, + stream.get_position(), + "while scanning a block scalar", + start_mark, + ) + return chomping, increment + + +def _scan_block_scalar_ignored_line(stream: StreamBuffer, start_mark: Position) -> None: + while stream.peek() == " ": + stream.forward() + if stream.peek() == "#": + while stream.peek() not in _CHARS_END_NEWLINE: + stream.forward() + ch = stream.peek() + if ch not in _CHARS_END_NEWLINE: + raise TokenizeError( + "expected a comment or a line break, but found %r" % ch, + stream.get_position(), + "while scanning a block scalar", + start_mark, + ) + _scan_line_break(stream) + + +def _scan_block_scalar_indentation( + stream: StreamBuffer, +) -> tuple[list[str], int, Position]: + chunks = [] + max_indent = 0 + end_mark = stream.get_position() + while stream.peek() in _CHARS_SPACE_NEWLINE: + if stream.peek() != " ": + chunks.append(_scan_line_break(stream)) + end_mark = stream.get_position() + else: + stream.forward() + if stream.column > max_indent: + max_indent = stream.column + return chunks, max_indent, end_mark + + +def _scan_block_scalar_breaks( + stream: StreamBuffer, indent: int +) -> tuple[list[str], Position]: + chunks = [] + end_mark = stream.get_position() + while stream.column < indent and stream.peek() == " ": + stream.forward() + while stream.peek() in _CHARS_NEWLINE: + chunks.append(_scan_line_break(stream)) + end_mark = stream.get_position() + while stream.column < indent and stream.peek() == " ": + stream.forward() + return chunks, end_mark + + +_CHARS_END: Final[str] = "\0" +_CHARS_NEWLINE: Final[str] = "\r\n\x85\u2028\u2029" +_CHARS_END_NEWLINE: Final[str] = "\0\r\n\x85\u2028\u2029" +_CHARS_SPACE_NEWLINE: Final[str] = " \r\n\x85\u2028\u2029" +_CHARS_END_SPACE_NEWLINE: Final[str] = "\0 \r\n\x85\u2028\u2029" +_CHARS_END_SPACE_TAB_NEWLINE: Final[str] = "\0 \t\r\n\x85\u2028\u2029" + +_ESCAPE_REPLACEMENTS: Final[dict[str, str]] = { + "0": "\0", + "a": "\x07", + "b": "\x08", + "t": "\x09", + "\t": "\x09", + "n": "\x0A", + "v": "\x0B", + "f": "\x0C", + "r": "\x0D", + "e": "\x1B", + " ": "\x20", + '"': '"', + "\\": "\\", + "/": "/", + "N": "\x85", + "_": "\xA0", + "L": "\u2028", + "P": "\u2029", +} + +_ESCAPE_CODES: Final[dict[str, int]] = { + "x": 2, + "u": 4, + "U": 8, +} diff --git a/tests/test_renderers/fixtures/directive_options.md b/tests/test_renderers/fixtures/directive_options.md index 9b472587..779a7942 100644 --- a/tests/test_renderers/fixtures/directive_options.md +++ b/tests/test_renderers/fixtures/directive_options.md @@ -133,7 +133,7 @@ foo - 'restructuredtext-test-directive': Invalid options format (bad YAML) [myst.directive_parse] + 'restructuredtext-test-directive': Invalid options format: expected ':' after key [myst.directive_parse] Directive processed. Type="restructuredtext-test-directive", arguments=[], options={}, content: diff --git a/tests/test_renderers/fixtures/directive_parsing.txt b/tests/test_renderers/fixtures/directive_parsing.txt index e1c76c0d..25b9b769 100644 --- a/tests/test_renderers/fixtures/directive_parsing.txt +++ b/tests/test_renderers/fixtures/directive_parsing.txt @@ -25,6 +25,21 @@ options: {} warnings: [] . +note: content in first line and body +. +```{note} a +b +``` +. +arguments: [] +body: +- a +- b +content_offset: 0 +options: {} +warnings: [] +. + note: content after option . ```{note} @@ -92,11 +107,14 @@ body: - first line - '' - body line -content_offset: 1 +content_offset: 0 options: class: - tip -warnings: [] +warnings: +- - Cannot split content across first line and body, when options block is present + (move first line to body) + - null . admonition: no options, no new line @@ -162,7 +180,8 @@ body: [] content_offset: 3 options: {} warnings: -- Invalid options format (bad YAML) +- - 'Unknown option keys: [''a''] (allowed: [''class'', ''name''])' + - 1 . warning: yaml not a dict @@ -178,7 +197,8 @@ body: [] content_offset: 3 options: {} warnings: -- Invalid options format (not a dict) +- - 'Invalid options format: expected '':'' after key' + - 1 . warning: unknown option name @@ -192,7 +212,8 @@ body: [] content_offset: 1 options: {} warnings: -- 'Unknown option keys: [''unknown''] (allowed: [''class'', ''name''])' +- - 'Unknown option keys: [''unknown''] (allowed: [''class'', ''name''])' + - 0 . warning: invalid option value @@ -206,7 +227,8 @@ body: [] content_offset: 1 options: {} warnings: -- 'Invalid option value for ''class'': 1: cannot make "1" into a class name' +- - 'Invalid option value for ''class'': 1: cannot make "1" into a class name' + - 0 . error: missing argument diff --git a/tests/test_renderers/fixtures/option_parsing.yaml b/tests/test_renderers/fixtures/option_parsing.yaml new file mode 100644 index 00000000..f48b60ef --- /dev/null +++ b/tests/test_renderers/fixtures/option_parsing.yaml @@ -0,0 +1,169 @@ +plain key/values: + content: |- + key1: + key2: val2 + key3: + val3 + key4: val4.1 + val4.2 + expected: |- + [ + [ + "key1", + "" + ], + [ + "key2", + "val2" + ], + [ + "key3", + "val3" + ], + [ + "key4", + "val4.1 val4.2" + ] + ] + +plain key/values with comments: + content: |- + key1: # comment + key2: val2 # comment + # comment + key3: + val3 # comment + + key4: val4.1 + val4.2 # comment + expected: |- + [ + [ + "key1", + "" + ], + [ + "key2", + "val2" + ], + [ + "key3", + "val3" + ], + [ + "key4", + "val4.1 val4.2" + ] + ] + +quoted key/values: + content: |- + "key1": "val1" + 'key2': 'val2' + "key + 3": "val + 3" + escapes: "\"\e\x07" + expected: |- + [ + [ + "key1", + "val1" + ], + [ + "key2", + "val2" + ], + [ + "key 3", + "val 3" + ], + [ + "escapes", + "\"\u001b\u0007" + ] + ] + +literal values: + content: | + key1: | + val1.1 + val1.2 + + val1.3 + key2: |2 + val2.1 + val2.2 + + val2.3 + key3: |- + val3.1 + val3.2 + + val3.3 + key4: |+ + val4.1 + val4.2 + + val4.3 + expected: |- + [ + [ + "key1", + "val1.1\nval1.2\n\nval1.3\n" + ], + [ + "key2", + " val2.1\nval2.2\n\n val2.3\n" + ], + [ + "key3", + "val3.1\nval3.2\n\nval3.3" + ], + [ + "key4", + "val4.1\nval4.2\n\nval4.3\n" + ] + ] + +folded values: + content: | + key1: > + val1.1 + val1.2 + + val1.3 + key2: >2 + val2.1 + val2.2 + + val2.3 + key3: >- + val3.1 + val3.2 + + val3.3 + key4: >+ + val4.1 + val4.2 + + val4.3 + expected: |- + [ + [ + "key1", + "val1.1 val1.2\nval1.3\n" + ], + [ + "key2", + " val2.1\nval2.2\n\n val2.3\n" + ], + [ + "key3", + "val3.1 val3.2\nval3.3" + ], + [ + "key4", + "val4.1 val4.2\nval4.3\n" + ] + ] diff --git a/tests/test_renderers/fixtures/option_parsing_errors.yaml b/tests/test_renderers/fixtures/option_parsing_errors.yaml new file mode 100644 index 00000000..b077bb94 --- /dev/null +++ b/tests/test_renderers/fixtures/option_parsing_errors.yaml @@ -0,0 +1,42 @@ +no `:`: + content: | + key1 + expected: |- + expected ':' after key + at line 1, column 0 + +Indented key: + content: |2 + key1: value1 + expected: |- + expected key to start at column 0 + at line 0, column 1 + +Quote not closed: + content: | + key1: "value1 + expected: |- + while scanning a quoted scalar + at line 0, column 6 + found unexpected end of stream + at line 1, column 0 + +Content after literal: + content: | + key1: | value1 + extra + expected: |- + while scanning a block scalar + at line 0, column 6 + expected a comment or a line break, but found 'v' + at line 0, column 8 + +Content after folded: + content: | + key1: > value1 + extra + expected: |- + while scanning a block scalar + at line 0, column 6 + expected a comment or a line break, but found 'v' + at line 0, column 8 diff --git a/tests/test_renderers/fixtures/reporter_warnings.md b/tests/test_renderers/fixtures/reporter_warnings.md index 776d0e8c..dd8d70b9 100644 --- a/tests/test_renderers/fixtures/reporter_warnings.md +++ b/tests/test_renderers/fixtures/reporter_warnings.md @@ -149,7 +149,7 @@ bad-option-value :class: [1] ``` . -:1: (WARNING/2) 'note': option "class" value not string (enclose with ""): [1] [myst.directive_parse] +:1: (WARNING/2) 'note': Invalid option value for 'class': [1]: cannot make "[1]" into a class name [myst.directive_parse] :1: (ERROR/3) Content block expected for the "note" directive; none found. . diff --git a/tests/test_renderers/test_parse_directives.py b/tests/test_renderers/test_parse_directives.py index 4a25ed11..a8c8fa74 100644 --- a/tests/test_renderers/test_parse_directives.py +++ b/tests/test_renderers/test_parse_directives.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -7,10 +8,33 @@ from markdown_it import MarkdownIt from myst_parser.parsers.directives import MarkupError, parse_directive_text +from myst_parser.parsers.options import TokenizeError +from myst_parser.parsers.options import to_items as options_to_items FIXTURE_PATH = Path(__file__).parent.joinpath("fixtures") +@pytest.mark.param_file(FIXTURE_PATH / "option_parsing.yaml", "yaml") +def test_option_parsing(file_params): + """Test parsing of directive options.""" + result = list(options_to_items(file_params.content)) + file_params.assert_expected( + json.dumps(result, ensure_ascii=False, indent=2), rstrip_lines=True + ) + + +@pytest.mark.param_file(FIXTURE_PATH / "option_parsing_errors.yaml", "yaml") +def test_option_parsing_errors(file_params): + """Test parsing of directive options.""" + try: + list(options_to_items(file_params.content)) + except TokenizeError as err: + result = str(err) + else: + result = "No error" + file_params.assert_expected(result, rstrip_lines=True) + + @pytest.mark.param_file(FIXTURE_PATH / "directive_parsing.txt") def test_parsing(file_params): """Test parsing of directive text.""" @@ -25,7 +49,7 @@ def test_parsing(file_params): raise AssertionError(f"Unknown directive: {name}") try: result = parse_directive_text( - klass, first_line[0] if first_line else "", tokens[0].content + klass, first_line[0] if first_line else "", tokens[0].content, line=0 ) except MarkupError as err: outcome = f"error: {err}" @@ -51,6 +75,15 @@ def test_parsing_errors(descript, klass, arguments, content): parse_directive_text(klass, arguments, content) +def test_parsing_full_yaml(): + result = parse_directive_text( + Note, "", "---\na: [1]\n---\ncontent", validate_options=False + ) + assert not result.warnings + assert result.options == {"a": [1]} + assert result.body == ["content"] + + def test_additional_options(): """Allow additional options to be passed to a directive.""" # this should be fine @@ -79,4 +112,4 @@ def test_additional_options(): Note, "", "content", additional_options={"foo": "bar"} ) assert len(result.warnings) == 1 - assert "Unknown option" in result.warnings[0] + assert "Unknown option" in result.warnings[0][0]