MLIR 22.0.0git
Parser.cpp
Go to the documentation of this file.
1//===- Parser.cpp - Matcher expression parser -----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Recursive parser implementation for the matcher expression grammar.
10//
11//===----------------------------------------------------------------------===//
12
13#include "Parser.h"
14
15#include <vector>
16
18
19// Simple structure to hold information for one token from the parser.
21 TokenInfo() = default;
22
23 // Method to set the kind and text of the token
24 void set(TokenKind newKind, llvm::StringRef newText) {
25 kind = newKind;
26 text = newText;
27 }
28
29 // Known identifiers.
30 static const char *const idExtract;
31
32 llvm::StringRef text;
36};
37
38const char *const Parser::TokenInfo::idExtract = "extract";
39
41public:
42 // Constructor with matcherCode and error
43 explicit CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
44 : code(matcherCode), startOfLine(matcherCode), error(error) {
45 nextToken = getNextToken();
46 }
47
48 // Constructor with matcherCode, error, and codeCompletionOffset
49 CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error,
50 unsigned codeCompletionOffset)
51 : code(matcherCode), startOfLine(matcherCode), error(error),
52 codeCompletionLocation(matcherCode.data() + codeCompletionOffset) {
53 nextToken = getNextToken();
54 }
55
56 // Peek at next token without consuming it
57 const TokenInfo &peekNextToken() const { return nextToken; }
58
59 // Consume and return the next token
61 TokenInfo thisToken = nextToken;
62 nextToken = getNextToken();
63 return thisToken;
64 }
65
66 // Skip any newline tokens
68 while (nextToken.kind == TokenKind::NewLine)
69 nextToken = getNextToken();
70 return nextToken;
71 }
72
73 // Consume and return next token, ignoring newlines
76 return nextToken.kind == TokenKind::Eof ? nextToken : consumeNextToken();
77 }
78
79 // Return kind of next token
80 TokenKind nextTokenKind() const { return nextToken.kind; }
81
82private:
83 // Helper function to get the first character as a new StringRef and drop it
84 // from the original string
85 llvm::StringRef firstCharacterAndDrop(llvm::StringRef &str) {
86 assert(!str.empty());
87 llvm::StringRef firstChar = str.substr(0, 1);
88 str = str.drop_front();
89 return firstChar;
90 }
91
92 // Get next token, consuming whitespaces and handling different token types
93 TokenInfo getNextToken() {
94 consumeWhitespace();
95 TokenInfo result;
96 result.range.start = currentLocation();
97
98 // Code completion case
99 if (codeCompletionLocation && codeCompletionLocation <= code.data()) {
101 llvm::StringRef(codeCompletionLocation, 0));
102 codeCompletionLocation = nullptr;
103 return result;
104 }
105
106 // End of file case
107 if (code.empty()) {
108 result.set(TokenKind::Eof, "");
109 return result;
110 }
111
112 // Switch to handle specific characters
113 switch (code[0]) {
114 case '#':
115 code = code.drop_until([](char c) { return c == '\n'; });
116 return getNextToken();
117 case ',':
118 result.set(TokenKind::Comma, firstCharacterAndDrop(code));
119 break;
120 case '.':
121 result.set(TokenKind::Period, firstCharacterAndDrop(code));
122 break;
123 case '\n':
124 ++line;
125 startOfLine = code.drop_front();
126 result.set(TokenKind::NewLine, firstCharacterAndDrop(code));
127 break;
128 case '(':
129 result.set(TokenKind::OpenParen, firstCharacterAndDrop(code));
130 break;
131 case ')':
132 result.set(TokenKind::CloseParen, firstCharacterAndDrop(code));
133 break;
134 case '"':
135 case '\'':
136 consumeStringLiteral(&result);
137 break;
138 case '0':
139 case '1':
140 case '2':
141 case '3':
142 case '4':
143 case '5':
144 case '6':
145 case '7':
146 case '8':
147 case '9':
148 consumeNumberLiteral(&result);
149 break;
150 default:
151 parseIdentifierOrInvalid(&result);
152 break;
153 }
154
155 result.range.end = currentLocation();
156 return result;
157 }
158
159 void consumeNumberLiteral(TokenInfo *result) {
160 StringRef original = code;
161 unsigned value = 0;
162 if (!code.consumeInteger(0, value)) {
163 size_t numConsumed = original.size() - code.size();
164 result->text = original.take_front(numConsumed);
166 result->value = static_cast<int64_t>(value);
167 return;
168 }
169 }
170
171 // Consume a string literal, handle escape sequences and missing closing
172 // quote.
173 void consumeStringLiteral(TokenInfo *result) {
174 bool inEscape = false;
175 const char marker = code[0];
176 for (size_t length = 1; length < code.size(); ++length) {
177 if (inEscape) {
178 inEscape = false;
179 continue;
180 }
181 if (code[length] == '\\') {
182 inEscape = true;
183 continue;
184 }
185 if (code[length] == marker) {
187 result->text = code.substr(0, length + 1);
188 result->value = code.substr(1, length - 1);
189 code = code.drop_front(length + 1);
190 return;
191 }
192 }
193 llvm::StringRef errorText = code;
194 code = code.drop_front(code.size());
195 SourceRange range;
196 range.start = result->range.start;
197 range.end = currentLocation();
198 error->addError(range, ErrorType::ParserStringError) << errorText;
199 result->kind = TokenKind::Error;
200 }
201
202 void parseIdentifierOrInvalid(TokenInfo *result) {
203 if (isalnum(code[0])) {
204 // Parse an identifier
205 size_t tokenLength = 1;
206
207 while (true) {
208 // A code completion location in/immediately after an identifier will
209 // cause the portion of the identifier before the code completion
210 // location to become a code completion token.
211 if (codeCompletionLocation == code.data() + tokenLength) {
212 codeCompletionLocation = nullptr;
214 result->text = code.substr(0, tokenLength);
215 code = code.drop_front(tokenLength);
216 return;
217 }
218 if (tokenLength == code.size() || !(isalnum(code[tokenLength])))
219 break;
220 ++tokenLength;
221 }
222 llvm::StringRef token = code.substr(0, tokenLength);
223 code = code.drop_front(tokenLength);
224 // Check if the identifier is a boolean literal
225 if (token == "true") {
226 result->text = "false";
228 result->value = true;
229 } else if (token == "false") {
230 result->text = "false";
232 result->value = false;
233 } else {
234 // Otherwise it is treated as a normal identifier
235 result->kind = TokenKind::Ident;
236 result->text = token;
237 }
238 } else {
240 result->text = code.substr(0, 1);
241 code = code.drop_front(1);
242 }
243 }
244
245 // Consume all leading whitespace from code, except newlines
246 void consumeWhitespace() { code = code.ltrim(" \t\v\f\r"); }
247
248 // Returns the current location in the source code
249 SourceLocation currentLocation() {
250 SourceLocation location;
251 location.line = line;
252 location.column = code.data() - startOfLine.data() + 1;
253 return location;
254 }
255
256 llvm::StringRef code;
257 llvm::StringRef startOfLine;
258 unsigned line = 1;
259 Diagnostics *error;
260 TokenInfo nextToken;
261 const char *codeCompletionLocation = nullptr;
262};
263
264Parser::Sema::~Sema() = default;
265
267 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
268 return {};
269}
270
271std::vector<MatcherCompletion>
275
276// Entry for the scope of a parser
278 Parser *parser;
279
281 parser->contextStack.emplace_back(c, 0u);
282 }
283
284 ~ScopedContextEntry() { parser->contextStack.pop_back(); }
285
286 void nextArg() { ++parser->contextStack.back().second; }
287};
288
289// Parse and validate expressions starting with an identifier.
290// This function can parse named values and matchers. In case of failure, it
291// will try to determine the user's intent to give an appropriate error message.
292bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
293 const TokenInfo nameToken = tokenizer->consumeNextToken();
294
295 if (tokenizer->nextTokenKind() != TokenKind::OpenParen) {
296 // Parse as a named value.
297 if (auto namedValue = namedValues ? namedValues->lookup(nameToken.text)
298 : VariantValue()) {
299
300 if (tokenizer->nextTokenKind() != TokenKind::Period) {
301 *value = namedValue;
302 return true;
303 }
304
305 if (!namedValue.isMatcher()) {
306 error->addError(tokenizer->peekNextToken().range,
308 return false;
309 }
310 }
311
312 if (tokenizer->nextTokenKind() == TokenKind::NewLine) {
313 error->addError(tokenizer->peekNextToken().range,
315 << "NewLine";
316 return false;
317 }
318
319 // If the syntax is correct and the name is not a matcher either, report
320 // an unknown named value.
321 if ((tokenizer->nextTokenKind() == TokenKind::Comma ||
322 tokenizer->nextTokenKind() == TokenKind::CloseParen ||
323 tokenizer->nextTokenKind() == TokenKind::NewLine ||
324 tokenizer->nextTokenKind() == TokenKind::Eof) &&
325 !sema->lookupMatcherCtor(nameToken.text)) {
326 error->addError(nameToken.range, ErrorType::RegistryValueNotFound)
327 << nameToken.text;
328 return false;
329 }
330 // Otherwise, fallback to the matcher parser.
331 }
332
333 tokenizer->skipNewlines();
334
335 assert(nameToken.kind == TokenKind::Ident);
336 TokenInfo openToken = tokenizer->consumeNextToken();
337 if (openToken.kind != TokenKind::OpenParen) {
338 error->addError(openToken.range, ErrorType::ParserNoOpenParen)
339 << openToken.text;
340 return false;
341 }
342
343 std::optional<MatcherCtor> ctor = sema->lookupMatcherCtor(nameToken.text);
344
345 // Parse as a matcher expression.
346 return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
347}
348
349bool Parser::parseChainedExpression(std::string &argument) {
350 // Parse the parenthesized argument to .extract("foo")
351 // Note: EOF is handled inside the consume functions and would fail below when
352 // checking token kind.
353 const TokenInfo openToken = tokenizer->consumeNextToken();
354 const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
355 const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();
356
357 if (openToken.kind != TokenKind::OpenParen) {
358 error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen);
359 return false;
360 }
361
362 if (argumentToken.kind != TokenKind::Literal ||
363 !argumentToken.value.isString()) {
364 error->addError(argumentToken.range,
366 return false;
367 }
368
369 if (closeToken.kind != TokenKind::CloseParen) {
370 error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen);
371 return false;
372 }
373
374 // If all checks passed, extract the argument and return true.
375 argument = argumentToken.value.getString();
376 return true;
377}
378
379// Parse the arguments of a matcher
380bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
381 const TokenInfo &nameToken, TokenInfo &endToken) {
382 ScopedContextEntry sce(this, ctor);
383
384 while (tokenizer->nextTokenKind() != TokenKind::Eof) {
385 if (tokenizer->nextTokenKind() == TokenKind::CloseParen) {
386 // end of args.
387 endToken = tokenizer->consumeNextToken();
388 break;
389 }
390
391 if (!args.empty()) {
392 // We must find a , token to continue.
393 TokenInfo commaToken = tokenizer->consumeNextToken();
394 if (commaToken.kind != TokenKind::Comma) {
395 error->addError(commaToken.range, ErrorType::ParserNoComma)
396 << commaToken.text;
397 return false;
398 }
399 }
400
401 ParserValue argValue;
402 tokenizer->skipNewlines();
403
404 argValue.text = tokenizer->peekNextToken().text;
405 argValue.range = tokenizer->peekNextToken().range;
406 if (!parseExpressionImpl(&argValue.value)) {
407 return false;
408 }
409
410 tokenizer->skipNewlines();
411 args.push_back(argValue);
412 sce.nextArg();
413 }
414
415 return true;
416}
417
418// Parse and validate a matcher expression.
419bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
420 const TokenInfo &openToken,
421 std::optional<MatcherCtor> ctor,
422 VariantValue *value) {
423 if (!ctor) {
424 error->addError(nameToken.range, ErrorType::RegistryMatcherNotFound)
425 << nameToken.text;
426 // Do not return here. We need to continue to give completion suggestions.
427 }
428
429 std::vector<ParserValue> args;
430 TokenInfo endToken;
431
432 tokenizer->skipNewlines();
433
434 if (!parseMatcherArgs(args, ctor.value_or(nullptr), nameToken, endToken)) {
435 return false;
436 }
437
438 // Check for the missing closing parenthesis
439 if (endToken.kind != TokenKind::CloseParen) {
440 error->addError(openToken.range, ErrorType::ParserNoCloseParen)
441 << nameToken.text;
442 return false;
443 }
444
445 std::string functionName;
446 if (tokenizer->peekNextToken().kind == TokenKind::Period) {
447 tokenizer->consumeNextToken();
448 TokenInfo chainCallToken = tokenizer->consumeNextToken();
449 if (chainCallToken.kind == TokenKind::CodeCompletion) {
450 addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract"));
451 return false;
452 }
453
454 if (chainCallToken.kind != TokenKind::Ident ||
455 chainCallToken.text != TokenInfo::idExtract) {
456 error->addError(chainCallToken.range,
458 return false;
459 }
460
461 if (chainCallToken.text == TokenInfo::idExtract &&
462 !parseChainedExpression(functionName))
463 return false;
464 }
465
466 if (!ctor)
467 return false;
468 // Merge the start and end infos.
469 SourceRange matcherRange = nameToken.range;
470 matcherRange.end = endToken.range.end;
471 VariantMatcher result = sema->actOnMatcherExpression(
472 *ctor, matcherRange, functionName, args, error);
473 if (result.isNull())
474 return false;
475 *value = result;
476 return true;
477}
478
479// If the prefix of this completion matches the completion token, add it to
480// completions minus the prefix.
481void Parser::addCompletion(const TokenInfo &compToken,
482 const MatcherCompletion &completion) {
483 if (llvm::StringRef(completion.typedText).starts_with(compToken.text)) {
484 completions.emplace_back(completion.typedText.substr(compToken.text.size()),
485 completion.matcherDecl);
486 }
487}
488
489std::vector<MatcherCompletion>
490Parser::getNamedValueCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
491 if (!namedValues)
492 return {};
493
494 std::vector<MatcherCompletion> result;
495 for (const auto &entry : *namedValues) {
496 std::string decl =
497 (entry.getValue().getTypeAsString() + " " + entry.getKey()).str();
498 result.emplace_back(entry.getKey(), decl);
499 }
500 return result;
501}
502
503void Parser::addExpressionCompletions() {
504 const TokenInfo compToken = tokenizer->consumeNextTokenIgnoreNewlines();
505 assert(compToken.kind == TokenKind::CodeCompletion);
506
507 // We cannot complete code if there is an invalid element on the context
508 // stack.
509 for (const auto &entry : contextStack) {
510 if (!entry.first)
511 return;
512 }
513
514 auto acceptedTypes = sema->getAcceptedCompletionTypes(contextStack);
515 for (const auto &completion : sema->getMatcherCompletions(acceptedTypes)) {
516 addCompletion(compToken, completion);
517 }
518
519 for (const auto &completion : getNamedValueCompletions(acceptedTypes)) {
520 addCompletion(compToken, completion);
521 }
522}
523
524// Parse an <Expresssion>
525bool Parser::parseExpressionImpl(VariantValue *value) {
526 switch (tokenizer->nextTokenKind()) {
528 *value = tokenizer->consumeNextToken().value;
529 return true;
530 case TokenKind::Ident:
531 return parseIdentifierPrefixImpl(value);
533 addExpressionCompletions();
534 return false;
535 case TokenKind::Eof:
536 error->addError(tokenizer->consumeNextToken().range,
538 return false;
539
540 case TokenKind::Error:
541 // This error was already reported by the tokenizer.
542 return false;
546 case TokenKind::Comma:
549 const TokenInfo token = tokenizer->consumeNextToken();
550 error->addError(token.range, ErrorType::ParserInvalidToken)
551 << (token.kind == TokenKind::NewLine ? "NewLine" : token.text);
552 return false;
553 }
554
555 llvm_unreachable("Unknown token kind.");
556}
557
558Parser::Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
559 const NamedValueMap *namedValues, Diagnostics *error)
560 : tokenizer(tokenizer),
561 sema(std::make_unique<RegistrySema>(matcherRegistry)),
562 namedValues(namedValues), error(error) {}
563
565
566std::optional<MatcherCtor>
567Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
568 return RegistryManager::lookupMatcherCtor(matcherName, matcherRegistry);
569}
570
572 MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
574 return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
575 error);
576}
577
579 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
581}
582
583std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
584 llvm::ArrayRef<ArgKind> acceptedTypes) {
585 return RegistryManager::getMatcherCompletions(acceptedTypes, matcherRegistry);
586}
587
588bool Parser::parseExpression(llvm::StringRef &code,
589 const Registry &matcherRegistry,
590 const NamedValueMap *namedValues,
591 VariantValue *value, Diagnostics *error) {
592 CodeTokenizer tokenizer(code, error);
593 Parser parser(&tokenizer, matcherRegistry, namedValues, error);
594 if (!parser.parseExpressionImpl(value))
595 return false;
596 auto nextToken = tokenizer.peekNextToken();
597 if (nextToken.kind != TokenKind::Eof &&
598 nextToken.kind != TokenKind::NewLine) {
599 error->addError(tokenizer.peekNextToken().range,
601 return false;
602 }
603 return true;
604}
605
606std::vector<MatcherCompletion>
607Parser::completeExpression(llvm::StringRef &code, unsigned completionOffset,
608 const Registry &matcherRegistry,
609 const NamedValueMap *namedValues) {
610 Diagnostics error;
611 CodeTokenizer tokenizer(code, &error, completionOffset);
612 Parser parser(&tokenizer, matcherRegistry, namedValues, &error);
613 VariantValue dummy;
614 parser.parseExpressionImpl(&dummy);
615
616 return parser.completions;
617}
618
619std::optional<DynMatcher> Parser::parseMatcherExpression(
620 llvm::StringRef &code, const Registry &matcherRegistry,
621 const NamedValueMap *namedValues, Diagnostics *error) {
622 VariantValue value;
623 if (!parseExpression(code, matcherRegistry, namedValues, &value, error))
624 return std::nullopt;
625 if (!value.isMatcher()) {
626 error->addError(SourceRange(), ErrorType::ParserNotAMatcher);
627 return std::nullopt;
628 }
629 std::optional<DynMatcher> result = value.getMatcher().getDynMatcher();
630 if (!result) {
632 << value.getTypeAsString();
633 }
634 return result;
635}
636
637} // namespace mlir::query::matcher::internal
static std::vector< MatcherCompletion > getMatcherCompletions(ArrayRef< ArgKind > acceptedTypes, const Registry &matcherRegistry)
static std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned > > context)
static std::optional< MatcherCtor > lookupMatcherCtor(llvm::StringRef matcherName, const Registry &matcherRegistry)
static VariantMatcher constructMatcher(MatcherCtor ctor, internal::SourceRange nameRange, llvm::StringRef functionName, ArrayRef< ParserValue > args, internal::Diagnostics *error)
std::optional< DynMatcher > getDynMatcher() const
const VariantMatcher & getMatcher() const
const llvm::StringRef & getString() const
ArgStream addError(SourceRange range, ErrorType error)
CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error, unsigned codeCompletionOffset)
Definition Parser.cpp:49
CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
Definition Parser.cpp:43
std::optional< MatcherCtor > lookupMatcherCtor(llvm::StringRef matcherName) override
Definition Parser.cpp:567
std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned > > context) override
Definition Parser.cpp:578
std::vector< MatcherCompletion > getMatcherCompletions(llvm::ArrayRef< ArgKind > acceptedTypes) override
Definition Parser.cpp:583
VariantMatcher actOnMatcherExpression(MatcherCtor Ctor, SourceRange NameRange, StringRef functionName, ArrayRef< ParserValue > Args, Diagnostics *Error) override
Definition Parser.cpp:571
virtual std::vector< MatcherCompletion > getMatcherCompletions(llvm::ArrayRef< ArgKind > acceptedTypes)
Definition Parser.cpp:272
virtual std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned > > Context)
Definition Parser.cpp:266
static bool parseExpression(llvm::StringRef &code, const Registry &matcherRegistry, const NamedValueMap *namedValues, VariantValue *value, Diagnostics *error)
Definition Parser.cpp:588
llvm::StringMap< VariantValue > NamedValueMap
Definition Parser.h:114
static std::vector< MatcherCompletion > completeExpression(llvm::StringRef &code, unsigned completionOffset, const Registry &matcherRegistry, const NamedValueMap *namedValues)
Definition Parser.cpp:607
static std::optional< DynMatcher > parseMatcherExpression(llvm::StringRef &matcherCode, const Registry &matcherRegistry, const NamedValueMap *namedValues, Diagnostics *error)
Definition Parser.cpp:619
const internal::MatcherDescriptor * MatcherCtor
void set(TokenKind newKind, llvm::StringRef newText)
Definition Parser.cpp:24