# Copyright (c) 2017-2025 Renata Hodovan, Akos Kiss.
#
# Licensed under the BSD 3-Clause License
# <LICENSE.rst or https://opensource.org/licenses/BSD-3-Clause>.
# This file may not be copied, modified, or distributed except
# according to those terms.
from __future__ import annotations
from copy import deepcopy
from itertools import zip_longest
from math import inf
from textwrap import indent
from typing import Any, ClassVar, Iterator
[docs]
class RuleSize:
"""
Size information about a grammar rule or a (sub)tree (generated based on a
grammar rule), expressed using different metrics (both vertical and
horizontal).
This class can be used to describe various size-related concepts, e.g.:
- The minimum depth of derivation of a grammar rule and the minimum number
of tokens generated by a grammar rule.
- The actual depth of derivation of a grammar rule and the actual number
of tokens generated by a grammar rule.
- The maximum allowed depth of derivation in a tree and the maximum number
of tokens allowed in a tree.
- The depth of derivation reaching a node from the root of the tree and
the number of tokens in the tree outside the subtree of the node.
The class supports additive arithmetics (``+``, ``+=``, ``-``, ``-=``) and
comparisons (``==``, ``<=``, ``<``). Note, however, that the type is not
totally but partially ordered, i.e., a size object is less than another size
object iff all its metrics compare less than the corresponding metrics of
the other size object.
"""
__slots__ = ('depth', 'tokens')
max: ClassVar[RuleSize] # pylint: disable=declare-non-slot
"""Maxium possible size (``(inf, inf)``)."""
def __init__(self, depth: int | float = 0, tokens: int | float = 0) -> None:
"""
:param depth: Derivation length (default: 0).
:param tokens: Token count (default: 0).
"""
self.depth: int | float = depth #: Derivation length.
self.tokens: int | float = tokens #: Token count.
def __add__(self, other: RuleSize) -> RuleSize:
return RuleSize(depth=self.depth + other.depth, tokens=self.tokens + other.tokens)
def __iadd__(self, other: RuleSize) -> RuleSize:
self.depth += other.depth
self.tokens += other.tokens
return self
def __sub__(self, other: RuleSize) -> RuleSize:
return RuleSize(depth=self.depth - other.depth, tokens=self.tokens - other.tokens)
def __isub__(self, other: RuleSize) -> RuleSize:
self.depth -= other.depth
self.tokens -= other.tokens
return self
def __eq__(self, other: object) -> bool:
if not isinstance(other, RuleSize):
return NotImplemented
return self.depth == other.depth and self.tokens == other.tokens
def __le__(self, other: RuleSize) -> bool:
# This defines a partial order (i.e., reflexive, antisymmetric, and transitive).
# Not every pair of objects are comparable.
return self.depth <= other.depth and self.tokens <= other.tokens
def __lt__(self, other: RuleSize) -> bool:
# This defines a strict partial order (i.e., irreflexive, asymmetric, and transitive).
# Not every pair of objects are comparable.
return self.depth < other.depth and self.tokens < other.tokens
def __repr__(self) -> str:
return f'{self.__class__.__name__}(depth={self.depth!r}, tokens={self.tokens!r})'
RuleSize.max = RuleSize(inf, inf)
[docs]
class Rule:
"""
Abstract base class of tree nodes.
Tree nodes support deep copying via :func:`copy.deepcopy` (but no shallow
copying via :func:`copy.copy`). A deep copy of a node is the copy of the
whole subtree rooted at that node. However, the parent of a copy node is
always None, even if the original node had a parent.
Tree nodes support various string representations:
- The "informal" representation of a node consists of the concatenation of
the string contents of all tokens found in the tree rooted at the node in
depth-first order. No extra whitespace is inserted for separation.
- The "official" representation of a node is an (almost) valid Python
expression to recreate the tree rooted at the node. If the node has any
children, the representation consists of multiple lines.
- The "debug" representation of a node is a multi-line string with the most
important attributes of the tree rooted at the node, using vertical bars
for visual guidance.
The builtins :class:`str` and :func:`repr` can be used to compute the
"informal" and "official" representations, respectively, while
:func:`format` can produce all of the above. When string formatting is used,
the following format specifiers are recognized:
================= ==========================
Specifier Meaning
================= ==========================
``''`` or ``'s'`` "Informal" representation.
``'r'`` "Official" representation.
``'|'`` "Debug" representation.
================= ==========================
Thus, if ``n`` is a node, the following expressions give equivalent results:
- ``str(n)``, ``f'{n}'``, ``f'{n!s}'``, ``f'{n:s}'``,
``'{}'.format(n)``, ``'{!s}'.format(n)``, and ``'{:s}'.format(n)``
- ``repr(n)``, ``f'{n!r}'``, ``f'{n:r}'``, ``'{!r}'.format(n)``, and
``'{:r}'.format(n)``
- ``f'{n:|}'`` and ``'{:|}'.format(n)``
"""
__slots__ = ('name', 'parent')
def __init__(self, *, name: str) -> None:
"""
:param name: Name of the node, i.e., name of the corresponding parser or
lexer rule in the grammar.
"""
self.name: str = name #: Name of the node, i.e., name of the corresponding parser or lexer rule in the grammar.
self.parent: ParentRule | None = None #: Parent node object.
@property
def left_sibling(self) -> Rule | None:
"""
Get the left sibling of the node if any. Return ``None`` if the node has
no parent or is the leftmost child of its parent.
:return: The left sibling of the current node or ``None``.
"""
if not self.parent:
return None
self_idx = self.parent.children.index(self)
if self_idx == 0:
return None
return self.parent.children[self_idx - 1]
@property
def right_sibling(self) -> Rule | None:
"""
Get the right sibling of the node if any. Return ``None`` if the node
has no parent or is the rightmost child of its parent.
:return: The right sibling of the current node or ``None``.
"""
if not self.parent:
return None
self_idx = self.parent.children.index(self)
if self_idx == len(self.parent.children) - 1:
return None
return self.parent.children[self_idx + 1]
@property
def root(self) -> Rule:
"""
Get the root of the node, i.e., the node at the top of the parent chain.
:return: The root of the current node.
"""
node = self
while node.parent:
node = node.parent
return node
[docs]
def replace(self, node: Rule) -> Rule:
"""
Replace the current node with ``node`` among its siblings.
:param node: The replacement node.
:return: ``node``
"""
node.remove()
if self.parent and node is not self:
self.parent.children[self.parent.children.index(self)] = node
node.parent = self.parent
self.parent = None
return node
[docs]
def remove(self) -> None:
"""
Remove the current node from its parent.
"""
if self.parent:
self.parent.children.remove(self)
self.parent = None
[docs]
def equals(self, other: Any) -> bool:
"""
Compare two nodes (potentially including any children) for equality. The
comparison is not implemented within ``__eq__`` to ensure that nodes can
be added to collections based on identity.
:param other: The node to compare the current node to.
:return: Whether the two nodes are equal.
"""
return self.__class__ == other.__class__ and self.name == other.name
[docs]
def equalTokens(self, other: Rule) -> bool:
"""
Compare the tokens in the sub-trees of two nodes.
:param other: The node to compare the current node to.
:return: Whether the two nodes are equal.
"""
return all(self_token == other_token for self_token, other_token in zip_longest(self.tokens(), other.tokens()))
[docs]
def tokens(self) -> Iterator[str]:
"""
Generator method to iterate over the (non-empty) tokens (i.e., strings)
of the sub-tree of the node.
:return: Iterator over token string contents.
"""
raise NotImplementedError()
def _dbg_(self) -> str:
"""
Called by :meth:`__format__` to compute the "debug" string
representation of a node.
"""
raise NotImplementedError()
def __format__(self, format_spec: str) -> str:
if format_spec in ['', 's']:
return str(self)
if format_spec == 'r':
return repr(self)
if format_spec == '|':
return self._dbg_()
raise TypeError
def __copy__(self) -> Rule:
raise TypeError('shallow copy not supported')
def __deepcopy__(self, memo: dict[int, Any]) -> Rule:
raise NotImplementedError()
[docs]
class ParentRule(Rule):
"""
Abstract base class of tree nodes that can have children.
"""
__slots__ = ('children',)
def __init__(self, *, name: str, children: list[Rule] | None = None) -> None:
"""
:param name: Name of the corresponding parser rule in the grammar.
:param children: Children of the rule (default: no children).
"""
super().__init__(name=name)
self.children: list[Rule] = [] #: Children of the rule.
if children:
self.add_children(children)
@property
def last_child(self) -> Rule | None:
"""
Get the last child of the current node if any. Return ``None`` if the
node has no children.
"""
return self.children[-1] if self.children else None
[docs]
def insert_child(self, idx: int, node: Rule) -> None:
"""
Insert node as child at position.
:param idx: Index of position to insert ``node`` at.
:param node: Node to be inserted.
"""
if not node:
return
node.remove()
self.children.insert(idx, node)
node.parent = self
[docs]
def add_child(self, node: Rule) -> None:
"""
Add node to the end of the list of the children.
:param node: Node to be added to children.
"""
if node is None:
return
node.remove()
self.children.append(node)
node.parent = self
[docs]
def add_children(self, nodes: list[Rule]) -> None:
"""
Add mulitple nodes to the end of the list of the children.
:param nodes: List of nodes to be added to children.
"""
for node in nodes:
self.add_child(node)
[docs]
def __iadd__(self, item: Rule | list[Rule]) -> ParentRule:
"""
Support for ``+=`` operation to add one or more children to the current
node. An alias to :meth:`add_child` or :meth:`add_children` depending on
the type of ``child``.
:param item: The node(s) to be added as child.
:return: The current node with extended children.
"""
if isinstance(item, list):
self.add_children(item)
else:
self.add_child(item)
return self
[docs]
def equals(self, other: Any) -> bool:
return super().equals(other) and len(self.children) == len(other.children) and all(child.equals(other.children[i]) for i, child in enumerate(self.children))
[docs]
def tokens(self) -> Iterator[str]:
for child in self.children:
yield from child.tokens()
def __str__(self) -> str:
return ''.join(str(child) for child in self.children)
def _repr_children(self) -> str:
return 'children=[\n{children}]'.format(children=indent(',\n'.join(repr(child) for child in self.children), ' '))
def _dbg_children(self) -> str:
return '\n' + indent('\n'.join(child._dbg_() for child in self.children), '| ') if self.children else ""
[docs]
class UnparserRule(ParentRule):
"""
Tree node representing a parser rule. It can have zero or more
:class:`UnparserRule`, :class:`UnlexerRule`, :class:`UnparserRuleQuantifier`
or :class:`UnparserRuleAlternative` children.
"""
def __getattr__(self, item: str) -> Rule | list[Rule]:
# This check is needed to avoid infinite recursions when loading a tree
# with pickle. In such cases, the loaded instance is prepared by
# creating an empty object with the expected ``__class__`` and by
# restoring the saved attributes (without calling ``__init__``).
# During this operation, the ``__set_state__`` method of the target
# class is tried to be called, if it exists. Otherwise, ``__getattr__``
# throws an ``AttributeError``. However, if the instantiation of this
# error object tries to access any field that is not yet added by
# pickle, then it throws another ``AttributeError``, causing an
# infinite recursion. Filtering for the field names, that are used
# later in this method, eliminates the issue.
if item in ['name', 'children']:
raise AttributeError()
result = []
worklist = list(reversed(self.children))
while worklist:
child = worklist.pop()
if isinstance(child, ParentRule) and not isinstance(child, UnparserRule):
worklist.extend(reversed(child.children))
elif child.name == item:
result.append(child)
if not result:
raise AttributeError(f'[{self.name}] No child with name {item!r} {[child.name for child in self.children]}.')
return result[0] if len(result) == 1 else result
def __repr__(self) -> str:
parts = [
f'name={self.name!r}',
]
if self.children:
parts.append(self._repr_children())
return f'{self.__class__.__name__}({", ".join(parts)})'
def _dbg_(self) -> str:
return f'{self.name}{self._dbg_children()}'
def __deepcopy__(self, memo: dict[int, Any]) -> UnparserRule:
return UnparserRule(name=deepcopy(self.name, memo), children=[deepcopy(child, memo) for child in self.children])
[docs]
class UnlexerRule(Rule):
"""
Tree node representing a lexer rule or token. It has a string constant set
in its ``src`` field.
"""
__slots__ = ('src', 'size', 'immutable')
def __init__(self, *, name: str, src: str = '', size: RuleSize | None = None, immutable: bool = False) -> None:
"""
:param name: Name of the corresponding lexer rule in the grammar.
:param src: String content of the lexer rule (default: "").
:param size: Size of the lexer rule (default: (1,1) if ``src`` is not
empty, (0,0) otherwise).
:param immutable: Boolean to mark literal Unlexer nodes as immutable.
"""
super().__init__(name=name)
self.src: str = src #: String content of the lexer rule.
self.size: RuleSize = size or (RuleSize(depth=1, tokens=1) if src else RuleSize(depth=0, tokens=0)) #: Size of the lexer rule, aggregated from the token fragments it is composed of.
self.immutable: bool = immutable #: Whether the lexer rule is immutable, i.e., its source is defined as a literal and cannot be changed.
[docs]
def equals(self, other: Any) -> bool:
return super().equals(other) and self.src == other.src and self.immutable == other.immutable
[docs]
def tokens(self) -> Iterator[str]:
if self.src:
yield self.src
def __str__(self) -> str:
return self.src
def __repr__(self) -> str:
parts = []
if self.name:
parts.append(f'name={self.name!r}')
if self.src:
parts.append(f'src={self.src!r}')
if (self.src and self.size != RuleSize(1, 1)) or (not self.src and self.size != RuleSize(0, 0)):
parts.append(f'size={self.size!r}')
if self.immutable:
parts.append(f'immutable={self.immutable}')
return f'{self.__class__.__name__}({", ".join(parts)})'
def _dbg_(self) -> str:
return f'{self.name or ""}{":" if self.name else ""}{self.src!r}{" (immutable)" if self.immutable else ""}'
def __deepcopy__(self, memo: dict[int, Any]) -> UnlexerRule:
return UnlexerRule(name=deepcopy(self.name, memo), src=deepcopy(self.src, memo), size=deepcopy(self.size, memo), immutable=deepcopy(self.immutable, memo))
[docs]
class UnparserRuleQuantifier(ParentRule):
"""
Tree node representing the root of a quantified sub-tree. It can have one or
more :class:`UnparserRuleQuantified` children.
"""
__slots__ = ('idx', 'start', 'stop')
def __init__(self, *, idx: int, start: int, stop: int | float, children: list[Rule] | None = None) -> None:
"""
:param idx: Index of the quantifier in the parent rule.
:param start: Minimum number of expected items in the sub-tree.
:param stop: Maximum number of expected items in the sub-tree.
:param children: Children of the quantifier (default: no children).
"""
super().__init__(name='', children=children)
self.idx: int = idx #: Index of the quantifier in the parent rule.
self.start: int = start #: Minimum number of expected items in the sub-tree.
self.stop: int | float = stop #: Maximum number of expected items in the sub-tree.
[docs]
def equals(self, other: Any) -> bool:
return super().equals(other) and self.idx == other.idx and self.start == other.start and self.stop == other.stop
def __repr__(self) -> str:
parts = [
f'idx={self.idx}',
f'start={self.start}',
f'stop={self.stop}',
]
if self.children:
parts.append(self._repr_children())
return f'{self.__class__.__name__}({", ".join(parts)})'
def _dbg_(self) -> str:
return f'{self.__class__.__name__}:[{self.idx}]{self._dbg_children()}'
def __deepcopy__(self, memo: dict[int, Any]) -> UnparserRuleQuantifier:
return UnparserRuleQuantifier(idx=deepcopy(self.idx, memo), start=deepcopy(self.start, memo), stop=deepcopy(self.stop, memo), children=[deepcopy(child, memo) for child in self.children])
[docs]
class UnparserRuleQuantified(ParentRule):
"""
Tree node representing a single instance of quantified sub-tree. It can have
one or more :class:`UnparserRule`, :class:`UnlexerRule`,
:class:`UnparserRuleQuantifier` or :class:`UnparserRuleAlternative`
children.
"""
def __init__(self, *, children: list[Rule] | None = None) -> None:
"""
:param children: Children of the quantified rule (default: no children).
"""
super().__init__(name='', children=children)
def __repr__(self) -> str:
return f'{self.__class__.__name__}({self._repr_children() if self.children else ""})'
def _dbg_(self) -> str:
return f'{self.__class__.__name__}{self._dbg_children()}'
def __deepcopy__(self, memo: dict[int, Any]) -> UnparserRuleQuantified:
return UnparserRuleQuantified(children=[deepcopy(child, memo) for child in self.children])
[docs]
class UnparserRuleAlternative(ParentRule):
"""
Tree node representing a sub-tree of an alternative. It can have zero or
more :class:`UnparserRule`, :class:`UnlexerRule`,
:class:`UnparserRuleQuantifier` or :class:`UnparserRuleAlternative`
children.
"""
__slots__ = ('alt_idx', 'idx')
def __init__(self, *, alt_idx: int, idx: int, children: list[Rule] | None = None) -> None:
"""
:param alt_idx: Index of the alternation in the parent rule.
:param idx: Index of the alternative in the parent alternation.
:param children: Children of the alternative (default: no children).
"""
super().__init__(name='', children=children)
self.alt_idx: int = alt_idx #: Index of the alternation in the parent rule.
self.idx: int = idx #: Index of the alternative in the parent alternation.
[docs]
def equals(self, other: Any) -> bool:
return super().equals(other) and self.alt_idx == other.alt_idx and self.idx == other.idx
def __repr__(self) -> str:
parts = [
f'alt_idx={self.alt_idx}',
f'idx={self.idx}',
]
if self.children:
parts.append(self._repr_children())
return f'{self.__class__.__name__}({", ".join(parts)})'
def _dbg_(self) -> str:
return f'{self.__class__.__name__}:[{self.alt_idx}/{self.idx}]{self._dbg_children()}'
def __deepcopy__(self, memo: dict[int, Any]) -> UnparserRuleAlternative:
return UnparserRuleAlternative(alt_idx=deepcopy(self.alt_idx, memo),
idx=deepcopy(self.idx, memo),
children=[deepcopy(child, memo) for child in self.children])