Add cut operator (^) to grammar

traviscross · traviscross · commit 8b74468863f9 · 2025-12-14T16:51:40.000Z
The cut operator (`^`) is a backtracking fence. Once the expression to its left succeeds, we become committed to the alternative; the remainder of the expression must parse successfully or parsing will fail. See *Packrat Parsers Can Handle Practical Grammars in Mostly Constant Space*, Mizushima et al., <https://kmizu.github.io/papers/paste513-mizushima.pdf>. This operator solves a problem for us with C string literals. These literals cannot contain a null escape. But if we simply fail to lex the literal (e.g. `c"\0"`), we may instead lex it successfully as two separate tokens (`c "\0"`), and that would be incorrect. As long as we only use cut to express constraints that can be expressed in a regular language and we keep our alternations disjoint, the grammar can still be mechanically converted to a CFG. Let's add the cut operator to our grammar and use it for C string literals and some similar constructs. In the railroad diagrams, we'll render the cut as a "no backtracking" box around the expression or sequence of expressions after the cut. The idea is that once you enter the box the only way out is forward.
diff --git a/mdbook-spec/src/grammar.rs b/mdbook-spec/src/grammar.rs
@@ -78,6 +78,8 @@ enum ExpressionKind {
     Charset(Vec<Characters>),
     /// ``~[` ` LF]``
     NegExpression(Box<Expression>),
+    /// `A ^ B`
+    Cut(Box<Expression>, Box<Expression>),
     /// `U+0060`
     Unicode(String),
 }
@@ -128,6 +130,10 @@ impl Expression {
             | ExpressionKind::NegExpression(e) => {
                 e.visit_nt(callback);
             }
+            ExpressionKind::Cut(e1, e2) => {
+                e1.visit_nt(callback);
+                e2.visit_nt(callback);
+            }
             ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => {
                 for e in es {
                     e.visit_nt(callback);
diff --git a/mdbook-spec/src/grammar/parser.rs b/mdbook-spec/src/grammar/parser.rs
@@ -173,18 +173,22 @@ impl Parser<'_> {
         match es.len() {
             0 => Ok(None),
             1 => Ok(Some(es.pop().unwrap())),
-            _ => Ok(Some(Expression {
-                kind: ExpressionKind::Alt(es),
-                suffix: None,
-                footnote: None,
-            })),
+            _ => {
+                if let ExpressionKind::Cut(_, _) = es.last().unwrap().kind {
+                    bail!(self, "expected final arm to not contain cut operator");
+                }
+                Ok(Some(Expression::new_kind(ExpressionKind::Alt(es))))
+            }
         }
     }
 
     fn parse_seq(&mut self) -> Result<Option<Expression>> {
         let mut es = Vec::new();
         loop {
             self.space0();
+            if self.peek() == Some(b'^') {
+                return Ok(Some(self.parse_cut(es)?));
+            }
             let Some(e) = self.parse_expr1()? else {
                 break;
             };
@@ -201,6 +205,34 @@ impl Parser<'_> {
         }
     }
 
+    /// Parse cut (`^`) operator.
+    fn parse_cut(&mut self, mut es: Vec<Expression>) -> Result<Expression> {
+        self.expect("^", "expected `^`")?;
+        let Some(last_expr) = es.last() else {
+            bail!(self, "expected expression before cut operator");
+        };
+        match last_expr.kind {
+            ExpressionKind::Optional(_)
+            | ExpressionKind::Repeat(_)
+            | ExpressionKind::RepeatNonGreedy(_)
+            | ExpressionKind::RepeatRange(_, None | Some(0), _) => {
+                bail!(self, "expected non-optional expression before cut operator");
+            }
+            _ => {}
+        }
+        let Some(rhs) = self.parse_seq()? else {
+            bail!(self, "expected expression after cut operator");
+        };
+        let lhs = match es.len() {
+            1 => es.pop().unwrap(),
+            _ => Expression::new_kind(ExpressionKind::Sequence(es)),
+        };
+        Ok(Expression::new_kind(ExpressionKind::Cut(
+            Box::new(lhs),
+            Box::new(rhs),
+        )))
+    }
+
     fn parse_expr1(&mut self) -> Result<Option<Expression>> {
         let Some(next) = self.peek() else {
             return Ok(None);
@@ -506,13 +538,62 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) {
     ("", line_number + 1, 0)
 }
 
-#[test]
-fn translate_tests() {
-    assert_eq!(translate_position("", 0), ("", 0, 0));
-    assert_eq!(translate_position("test", 0), ("test", 1, 1));
-    assert_eq!(translate_position("test", 3), ("test", 1, 4));
-    assert_eq!(translate_position("test", 4), ("test", 1, 5));
-    assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
-    assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
-    assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
+#[cfg(test)]
+mod tests {
+    use crate::grammar::Grammar;
+    use crate::grammar::parser::{parse_grammar, translate_position};
+    use std::path::Path;
+
+    #[test]
+    fn test_translate() {
+        assert_eq!(translate_position("", 0), ("", 0, 0));
+        assert_eq!(translate_position("test", 0), ("test", 1, 1));
+        assert_eq!(translate_position("test", 3), ("test", 1, 4));
+        assert_eq!(translate_position("test", 4), ("test", 1, 5));
+        assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
+        assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
+        assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
+    }
+
+    fn parse(input: &str) -> Result<Grammar, String> {
+        let mut grammar = Grammar::default();
+        parse_grammar(input, &mut grammar, "test", Path::new("test.md"))
+            .map_err(|e| e.to_string())?;
+        Ok(grammar)
+    }
+
+    #[test]
+    fn test_cut() {
+        let input = "Rule -> A ^ B | C";
+        let grammar = parse(input).unwrap();
+        grammar.productions.get("Rule").unwrap();
+    }
+
+    #[test]
+    fn test_cut_fail_final_arm() {
+        let input = "Rule -> A | B ^ C";
+        let err = parse(input).unwrap_err();
+        assert!(err.contains("expected final arm to not contain cut operator"));
+    }
+
+    #[test]
+    fn test_cut_fail_optional_lhs() {
+        let input = "Rule -> A* ^ B";
+        let err = parse(input).unwrap_err();
+        assert!(err.contains("expected non-optional expression before cut operator"));
+    }
+
+    #[test]
+    fn test_cut_fail_optional_lhs_group() {
+        let input = "Rule -> (A B)* ^ C";
+        let err = parse(input).unwrap_err();
+        assert!(err.contains("expected non-optional expression before cut operator"));
+    }
+
+    #[test]
+    fn test_cut_fail_trailing() {
+        let input = "Rule -> A ^";
+        let err = parse(input).unwrap_err();
+        assert!(err.contains("expected expression after cut operator"));
+    }
 }
diff --git a/mdbook-spec/src/grammar/render_markdown.rs b/mdbook-spec/src/grammar/render_markdown.rs
@@ -83,6 +83,7 @@ impl Expression {
             | ExpressionKind::Comment(_)
             | ExpressionKind::Charset(_)
             | ExpressionKind::NegExpression(_)
+            | ExpressionKind::Cut(_, _)
             | ExpressionKind::Unicode(_) => &self.kind,
         }
     }
@@ -175,6 +176,11 @@ impl Expression {
                 output.push('~');
                 e.render_markdown(cx, output);
             }
+            ExpressionKind::Cut(e1, e2) => {
+                e1.render_markdown(cx, output);
+                output.push_str(" ^ ");
+                e2.render_markdown(cx, output);
+            }
             ExpressionKind::Unicode(s) => {
                 output.push_str("U+");
                 output.push_str(s);
diff --git a/mdbook-spec/src/grammar/render_railroad.rs b/mdbook-spec/src/grammar/render_railroad.rs
@@ -220,6 +220,12 @@ impl Expression {
                         let ch = node_for_nt(cx, "CHAR");
                         Box::new(Except::new(Box::new(ch), n))
                     }
+                    ExpressionKind::Cut(e1, e2) => {
+                        let n1 = e1.render_railroad(cx, stack)?;
+                        let n2 = e2.render_railroad(cx, stack)?;
+                        let lbox = LabeledBox::new(n2, Comment::new("no backtracking".to_string()));
+                        Box::new(Sequence::new(vec![n1, Box::new(lbox)]))
+                    }
                     ExpressionKind::Unicode(s) => Box::new(Terminal::new(format!("U+{}", s))),
                 };
             }
diff --git a/reference-dev-guide/src/grammar.md b/reference-dev-guide/src/grammar.md
@@ -63,6 +63,7 @@ Expr1 ->
     | Prose
     | Group
     | NegativeExpression
+    | Cut
 
 Unicode -> `U+` [`A`-`Z` `0`-`9`]4..4
 
@@ -92,6 +93,8 @@ Prose -> `<` ~[`>` LF]+ `>`
 Group -> `(` ` `* Expression ` `* `)`
 
 NegativeExpression -> `~` ( Charset | Terminal | NonTerminal )
+
+Cut -> `^`
 ```
 
 The general format is a series of productions separated by blank lines. The expressions are as follows:
@@ -110,6 +113,7 @@ The general format is a series of productions separated by blank lines. The expr
 | Prose | \<any ASCII character except CR\> | An English description of what should be matched, surrounded in angle brackets. |
 | Group | (\`,\` Parameter)+ | Groups an expression for the purpose of precedence, such as applying a repetition operator to a sequence of other expressions.
 | NegativeExpression | ~\[\` \` LF\] | Matches anything except the given Charset, Terminal, or Nonterminal. |
+| Cut | ^ | The cut operator. Commits to the current alternative if the preceding expression matches. |
 | Sequence | \`fn\` Name Parameters | A sequence of expressions that must match in order. |
 | Alternation | Expr1 \| Expr2 | Matches only one of the given expressions, separated by the vertical pipe character. |
 | Suffix | \_except \[LazyBooleanExpression\]\_  | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited markdown, but try to avoid anything except basics like links. |
diff --git a/src/notation.md b/src/notation.md
@@ -24,6 +24,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets:
 | ~\[ ]              | ~\[`b` `B`]                    | Any characters, except those listed       |
 | ~`string`         | ~`\n`, ~`*/`                  | Any characters, except this sequence      |
 | ( )               | (`,` _Parameter_)<sup>?</sup> | Groups items                              |
+| ^                 | `c"` ^ _CStringRest_          | Commit to an alternative ([cut operator]) |
 | U+xxxx            | U+0060                        | A single unicode character                |
 | \<text\>          | \<any ASCII char except CR\>  | An English description of what should be matched |
 | Rule <sub>suffix</sub> | IDENTIFIER_OR_KEYWORD <sub>_except `crate`_</sub> | A modification to the previous rule |
@@ -52,6 +53,7 @@ r[notation.grammar.visualizations]
 Below each grammar block is a button to toggle the display of a [syntax diagram]. A square element is a non-terminal rule, and a rounded rectangle is a terminal.
 
 [binary operators]: expressions/operator-expr.md#arithmetic-and-logical-binary-operators
+[cut operator]: https://kmizu.github.io/papers/paste513-mizushima.pdf
 [keywords]: keywords.md
 [syntax diagram]: https://en.wikipedia.org/wiki/Syntax_diagram
 [tokens]: tokens.md
diff --git a/src/tokens.md b/src/tokens.md
@@ -241,7 +241,7 @@ r[lex.token.literal.str-raw.syntax]
 RAW_STRING_LITERAL -> `r` RAW_STRING_CONTENT SUFFIX?
 
 RAW_STRING_CONTENT ->
-      `"` ( ~CR )*? `"`
+      `"` ^ ( ~CR )*? `"`
     | `#` RAW_STRING_CONTENT `#`
 ```
 
@@ -281,7 +281,7 @@ r[lex.token.byte]
 r[lex.token.byte.syntax]
 ```grammar,lexer
 BYTE_LITERAL ->
-    `b'` ( ASCII_FOR_CHAR | BYTE_ESCAPE )  `'` SUFFIX?
+    `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE )  `'` SUFFIX?
 
 ASCII_FOR_CHAR ->
     <any ASCII (i.e. 0x00 to 0x7F) except `'`, `\`, LF, CR, or TAB>
@@ -305,7 +305,7 @@ r[lex.token.str-byte]
 r[lex.token.str-byte.syntax]
 ```grammar,lexer
 BYTE_STRING_LITERAL ->
-    `b"` ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX?
+    `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX?
 
 ASCII_FOR_STRING ->
     <any ASCII (i.e 0x00 to 0x7F) except `"`, `\`, or CR>
@@ -357,7 +357,7 @@ RAW_BYTE_STRING_LITERAL ->
     `br` RAW_BYTE_STRING_CONTENT SUFFIX?
 
 RAW_BYTE_STRING_CONTENT ->
-      `"` ASCII_FOR_RAW*? `"`
+      `"` ^ ASCII_FOR_RAW*? `"`
     | `#` RAW_BYTE_STRING_CONTENT `#`
 
 ASCII_FOR_RAW ->
@@ -401,7 +401,7 @@ r[lex.token.str-c]
 r[lex.token.str-c.syntax]
 ```grammar,lexer
 C_STRING_LITERAL ->
-    `c"` (
+    `c"` ^ (
         ~[`"` `\` CR NUL]
       | BYTE_ESCAPE _except `\0` or `\x00`_
       | UNICODE_ESCAPE _except `\u{0}`, `\u{00}`, …, `\u{000000}`_
@@ -480,7 +480,7 @@ RAW_C_STRING_LITERAL ->
     `cr` RAW_C_STRING_CONTENT SUFFIX?
 
 RAW_C_STRING_CONTENT ->
-      `"` ( ~[CR NUL] )*? `"`
+      `"` ^ ( ~[CR NUL] )*? `"`
     | `#` RAW_C_STRING_CONTENT `#`
 ```
 

Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,7 @@ impl Expression {`
`83`	`83`	`\| ExpressionKind::Comment(_)`
`84`	`84`	`\| ExpressionKind::Charset(_)`
`85`	`85`	`\| ExpressionKind::NegExpression(_)`
	`86`	`+ \| ExpressionKind::Cut(_, _)`
`86`	`87`	`\| ExpressionKind::Unicode(_) => &self.kind,`
`87`	`88`	`}`
`88`	`89`	`}`
`@@ -175,6 +176,11 @@ impl Expression {`
`175`	`176`	`output.push('~');`
`176`	`177`	`e.render_markdown(cx, output);`
`177`	`178`	`}`
	`179`	`+ ExpressionKind::Cut(e1, e2) => {`
	`180`	`+ e1.render_markdown(cx, output);`
	`181`	`+ output.push_str(" ^ ");`
	`182`	`+ e2.render_markdown(cx, output);`
	`183`	`+ }`
`178`	`184`	`ExpressionKind::Unicode(s) => {`
`179`	`185`	`output.push_str("U+");`
`180`	`186`	`output.push_str(s);`
Original file line number	Diff line number	Diff line change
`@@ -220,6 +220,12 @@ impl Expression {`
`220`	`220`	`let ch = node_for_nt(cx, "CHAR");`
`221`	`221`	`Box::new(Except::new(Box::new(ch), n))`
`222`	`222`	`}`
	`223`	`+ ExpressionKind::Cut(e1, e2) => {`
	`224`	`+ let n1 = e1.render_railroad(cx, stack)?;`
	`225`	`+ let n2 = e2.render_railroad(cx, stack)?;`
	`226`	`+ let lbox = LabeledBox::new(n2, Comment::new("no backtracking".to_string()));`
	`227`	`+ Box::new(Sequence::new(vec![n1, Box::new(lbox)]))`
	`228`	`+ }`
`223`	`229`	`ExpressionKind::Unicode(s) => Box::new(Terminal::new(format!("U+{}", s))),`
`224`	`230`	`};`
`225`	`231`	`}`