Skip to content

Commit 8b74468

Browse files
committed
Add cut operator (^) to grammar
The cut operator (`^`) is a backtracking fence. Once the expression to its left succeeds, we become committed to the alternative; the remainder of the expression must parse successfully or parsing will fail. See *Packrat Parsers Can Handle Practical Grammars in Mostly Constant Space*, Mizushima et al., <https://kmizu.github.io/papers/paste513-mizushima.pdf>. This operator solves a problem for us with C string literals. These literals cannot contain a null escape. But if we simply fail to lex the literal (e.g. `c"\0"`), we may instead lex it successfully as two separate tokens (`c "\0"`), and that would be incorrect. As long as we only use cut to express constraints that can be expressed in a regular language and we keep our alternations disjoint, the grammar can still be mechanically converted to a CFG. Let's add the cut operator to our grammar and use it for C string literals and some similar constructs. In the railroad diagrams, we'll render the cut as a "no backtracking" box around the expression or sequence of expressions after the cut. The idea is that once you enter the box the only way out is forward.
1 parent 12f9af1 commit 8b74468

File tree

7 files changed

+125
-20
lines changed

7 files changed

+125
-20
lines changed

mdbook-spec/src/grammar.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ enum ExpressionKind {
7878
Charset(Vec<Characters>),
7979
/// ``~[` ` LF]``
8080
NegExpression(Box<Expression>),
81+
/// `A ^ B`
82+
Cut(Box<Expression>, Box<Expression>),
8183
/// `U+0060`
8284
Unicode(String),
8385
}
@@ -128,6 +130,10 @@ impl Expression {
128130
| ExpressionKind::NegExpression(e) => {
129131
e.visit_nt(callback);
130132
}
133+
ExpressionKind::Cut(e1, e2) => {
134+
e1.visit_nt(callback);
135+
e2.visit_nt(callback);
136+
}
131137
ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => {
132138
for e in es {
133139
e.visit_nt(callback);

mdbook-spec/src/grammar/parser.rs

Lines changed: 95 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -173,18 +173,22 @@ impl Parser<'_> {
173173
match es.len() {
174174
0 => Ok(None),
175175
1 => Ok(Some(es.pop().unwrap())),
176-
_ => Ok(Some(Expression {
177-
kind: ExpressionKind::Alt(es),
178-
suffix: None,
179-
footnote: None,
180-
})),
176+
_ => {
177+
if let ExpressionKind::Cut(_, _) = es.last().unwrap().kind {
178+
bail!(self, "expected final arm to not contain cut operator");
179+
}
180+
Ok(Some(Expression::new_kind(ExpressionKind::Alt(es))))
181+
}
181182
}
182183
}
183184

184185
fn parse_seq(&mut self) -> Result<Option<Expression>> {
185186
let mut es = Vec::new();
186187
loop {
187188
self.space0();
189+
if self.peek() == Some(b'^') {
190+
return Ok(Some(self.parse_cut(es)?));
191+
}
188192
let Some(e) = self.parse_expr1()? else {
189193
break;
190194
};
@@ -201,6 +205,34 @@ impl Parser<'_> {
201205
}
202206
}
203207

208+
/// Parse cut (`^`) operator.
209+
fn parse_cut(&mut self, mut es: Vec<Expression>) -> Result<Expression> {
210+
self.expect("^", "expected `^`")?;
211+
let Some(last_expr) = es.last() else {
212+
bail!(self, "expected expression before cut operator");
213+
};
214+
match last_expr.kind {
215+
ExpressionKind::Optional(_)
216+
| ExpressionKind::Repeat(_)
217+
| ExpressionKind::RepeatNonGreedy(_)
218+
| ExpressionKind::RepeatRange(_, None | Some(0), _) => {
219+
bail!(self, "expected non-optional expression before cut operator");
220+
}
221+
_ => {}
222+
}
223+
let Some(rhs) = self.parse_seq()? else {
224+
bail!(self, "expected expression after cut operator");
225+
};
226+
let lhs = match es.len() {
227+
1 => es.pop().unwrap(),
228+
_ => Expression::new_kind(ExpressionKind::Sequence(es)),
229+
};
230+
Ok(Expression::new_kind(ExpressionKind::Cut(
231+
Box::new(lhs),
232+
Box::new(rhs),
233+
)))
234+
}
235+
204236
fn parse_expr1(&mut self) -> Result<Option<Expression>> {
205237
let Some(next) = self.peek() else {
206238
return Ok(None);
@@ -506,13 +538,62 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) {
506538
("", line_number + 1, 0)
507539
}
508540

509-
#[test]
510-
fn translate_tests() {
511-
assert_eq!(translate_position("", 0), ("", 0, 0));
512-
assert_eq!(translate_position("test", 0), ("test", 1, 1));
513-
assert_eq!(translate_position("test", 3), ("test", 1, 4));
514-
assert_eq!(translate_position("test", 4), ("test", 1, 5));
515-
assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
516-
assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
517-
assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
541+
#[cfg(test)]
542+
mod tests {
543+
use crate::grammar::Grammar;
544+
use crate::grammar::parser::{parse_grammar, translate_position};
545+
use std::path::Path;
546+
547+
#[test]
548+
fn test_translate() {
549+
assert_eq!(translate_position("", 0), ("", 0, 0));
550+
assert_eq!(translate_position("test", 0), ("test", 1, 1));
551+
assert_eq!(translate_position("test", 3), ("test", 1, 4));
552+
assert_eq!(translate_position("test", 4), ("test", 1, 5));
553+
assert_eq!(translate_position("test\ntest2", 4), ("test", 1, 5));
554+
assert_eq!(translate_position("test\ntest2", 5), ("test2", 2, 1));
555+
assert_eq!(translate_position("test\ntest2\n", 11), ("", 3, 0));
556+
}
557+
558+
fn parse(input: &str) -> Result<Grammar, String> {
559+
let mut grammar = Grammar::default();
560+
parse_grammar(input, &mut grammar, "test", Path::new("test.md"))
561+
.map_err(|e| e.to_string())?;
562+
Ok(grammar)
563+
}
564+
565+
#[test]
566+
fn test_cut() {
567+
let input = "Rule -> A ^ B | C";
568+
let grammar = parse(input).unwrap();
569+
grammar.productions.get("Rule").unwrap();
570+
}
571+
572+
#[test]
573+
fn test_cut_fail_final_arm() {
574+
let input = "Rule -> A | B ^ C";
575+
let err = parse(input).unwrap_err();
576+
assert!(err.contains("expected final arm to not contain cut operator"));
577+
}
578+
579+
#[test]
580+
fn test_cut_fail_optional_lhs() {
581+
let input = "Rule -> A* ^ B";
582+
let err = parse(input).unwrap_err();
583+
assert!(err.contains("expected non-optional expression before cut operator"));
584+
}
585+
586+
#[test]
587+
fn test_cut_fail_optional_lhs_group() {
588+
let input = "Rule -> (A B)* ^ C";
589+
let err = parse(input).unwrap_err();
590+
assert!(err.contains("expected non-optional expression before cut operator"));
591+
}
592+
593+
#[test]
594+
fn test_cut_fail_trailing() {
595+
let input = "Rule -> A ^";
596+
let err = parse(input).unwrap_err();
597+
assert!(err.contains("expected expression after cut operator"));
598+
}
518599
}

mdbook-spec/src/grammar/render_markdown.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ impl Expression {
8383
| ExpressionKind::Comment(_)
8484
| ExpressionKind::Charset(_)
8585
| ExpressionKind::NegExpression(_)
86+
| ExpressionKind::Cut(_, _)
8687
| ExpressionKind::Unicode(_) => &self.kind,
8788
}
8889
}
@@ -175,6 +176,11 @@ impl Expression {
175176
output.push('~');
176177
e.render_markdown(cx, output);
177178
}
179+
ExpressionKind::Cut(e1, e2) => {
180+
e1.render_markdown(cx, output);
181+
output.push_str(" ^ ");
182+
e2.render_markdown(cx, output);
183+
}
178184
ExpressionKind::Unicode(s) => {
179185
output.push_str("U+");
180186
output.push_str(s);

mdbook-spec/src/grammar/render_railroad.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,12 @@ impl Expression {
220220
let ch = node_for_nt(cx, "CHAR");
221221
Box::new(Except::new(Box::new(ch), n))
222222
}
223+
ExpressionKind::Cut(e1, e2) => {
224+
let n1 = e1.render_railroad(cx, stack)?;
225+
let n2 = e2.render_railroad(cx, stack)?;
226+
let lbox = LabeledBox::new(n2, Comment::new("no backtracking".to_string()));
227+
Box::new(Sequence::new(vec![n1, Box::new(lbox)]))
228+
}
223229
ExpressionKind::Unicode(s) => Box::new(Terminal::new(format!("U+{}", s))),
224230
};
225231
}

reference-dev-guide/src/grammar.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ Expr1 ->
6363
| Prose
6464
| Group
6565
| NegativeExpression
66+
| Cut
6667
6768
Unicode -> `U+` [`A`-`Z` `0`-`9`]4..4
6869
@@ -92,6 +93,8 @@ Prose -> `<` ~[`>` LF]+ `>`
9293
Group -> `(` ` `* Expression ` `* `)`
9394
9495
NegativeExpression -> `~` ( Charset | Terminal | NonTerminal )
96+
97+
Cut -> `^`
9598
```
9699

97100
The general format is a series of productions separated by blank lines. The expressions are as follows:
@@ -110,6 +113,7 @@ The general format is a series of productions separated by blank lines. The expr
110113
| Prose | \<any ASCII character except CR\> | An English description of what should be matched, surrounded in angle brackets. |
111114
| Group | (\`,\` Parameter)+ | Groups an expression for the purpose of precedence, such as applying a repetition operator to a sequence of other expressions. |
112115
| NegativeExpression | ~\[\` \` LF\] | Matches anything except the given Charset, Terminal, or Nonterminal. |
116+
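| Cut | ^ | The cut operator. Once the preceding expression matches, commits to the current alternative: the rest of the sequence must then match or parsing fails, with no backtracking to later alternatives. |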
113117
| Sequence | \`fn\` Name Parameters | A sequence of expressions that must match in order. |
114118
| Alternation | Expr1 \| Expr2 | Matches only one of the given expressions, separated by the vertical pipe character. |
115119
| Suffix | \_except \[LazyBooleanExpression\]\_ | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited markdown, but try to avoid anything except basics like links. |

src/notation.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets:
2424
| ~\[ ] | ~\[`b` `B`] | Any characters, except those listed |
2525
| ~`string` | ~`\n`, ~`*/` | Any characters, except this sequence |
2626
| ( ) | (`,` _Parameter_)<sup>?</sup> | Groups items |
27+
| ^ | `c"` ^ _CStringRest_ | Commit to an alternative ([cut operator]) |
2728
| U+xxxx | U+0060 | A single unicode character |
2829
| \<text\> | \<any ASCII char except CR\> | An English description of what should be matched |
2930
| Rule <sub>suffix</sub> | IDENTIFIER_OR_KEYWORD <sub>_except `crate`_</sub> | A modification to the previous rule |
@@ -52,6 +53,7 @@ r[notation.grammar.visualizations]
5253
Below each grammar block is a button to toggle the display of a [syntax diagram]. A square element is a non-terminal rule, and a rounded rectangle is a terminal.
5354

5455
[binary operators]: expressions/operator-expr.md#arithmetic-and-logical-binary-operators
56+
[cut operator]: https://kmizu.github.io/papers/paste513-mizushima.pdf
5557
[keywords]: keywords.md
5658
[syntax diagram]: https://en.wikipedia.org/wiki/Syntax_diagram
5759
[tokens]: tokens.md

src/tokens.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ r[lex.token.literal.str-raw.syntax]
241241
RAW_STRING_LITERAL -> `r` RAW_STRING_CONTENT SUFFIX?
242242
243243
RAW_STRING_CONTENT ->
244-
`"` ( ~CR )*? `"`
244+
`"` ^ ( ~CR )*? `"`
245245
| `#` RAW_STRING_CONTENT `#`
246246
```
247247

@@ -281,7 +281,7 @@ r[lex.token.byte]
281281
r[lex.token.byte.syntax]
282282
```grammar,lexer
283283
BYTE_LITERAL ->
284-
`b'` ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX?
284+
`b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX?
285285
286286
ASCII_FOR_CHAR ->
287287
<any ASCII (i.e. 0x00 to 0x7F) except `'`, `\`, LF, CR, or TAB>
@@ -305,7 +305,7 @@ r[lex.token.str-byte]
305305
r[lex.token.str-byte.syntax]
306306
```grammar,lexer
307307
BYTE_STRING_LITERAL ->
308-
`b"` ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX?
308+
`b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX?
309309
310310
ASCII_FOR_STRING ->
311311
<any ASCII (i.e. 0x00 to 0x7F) except `"`, `\`, or CR>
@@ -357,7 +357,7 @@ RAW_BYTE_STRING_LITERAL ->
357357
`br` RAW_BYTE_STRING_CONTENT SUFFIX?
358358
359359
RAW_BYTE_STRING_CONTENT ->
360-
`"` ASCII_FOR_RAW*? `"`
360+
`"` ^ ASCII_FOR_RAW*? `"`
361361
| `#` RAW_BYTE_STRING_CONTENT `#`
362362
363363
ASCII_FOR_RAW ->
@@ -401,7 +401,7 @@ r[lex.token.str-c]
401401
r[lex.token.str-c.syntax]
402402
```grammar,lexer
403403
C_STRING_LITERAL ->
404-
`c"` (
404+
`c"` ^ (
405405
~[`"` `\` CR NUL]
406406
| BYTE_ESCAPE _except `\0` or `\x00`_
407407
| UNICODE_ESCAPE _except `\u{0}`, `\u{00}`, …, `\u{000000}`_
@@ -480,7 +480,7 @@ RAW_C_STRING_LITERAL ->
480480
`cr` RAW_C_STRING_CONTENT SUFFIX?
481481
482482
RAW_C_STRING_CONTENT ->
483-
`"` ( ~[CR NUL] )*? `"`
483+
`"` ^ ( ~[CR NUL] )*? `"`
484484
| `#` RAW_C_STRING_CONTENT `#`
485485
```
486486

0 commit comments

Comments
 (0)