MLIR  21.0.0git
Parser.cpp
Go to the documentation of this file.
1 //===- Parser.cpp - Matcher expression parser -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Recursive parser implementation for the matcher expression grammar.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "Parser.h"
14 
15 #include <vector>
16 
18 
19 // Simple structure to hold information for one token from the parser.
21  TokenInfo() = default;
22 
23  // Method to set the kind and text of the token
24  void set(TokenKind newKind, llvm::StringRef newText) {
25  kind = newKind;
26  text = newText;
27  }
28 
29  // Known identifiers.
30  static const char *const ID_Extract;
31 
32  llvm::StringRef text;
36 };
37 
38 const char *const Parser::TokenInfo::ID_Extract = "extract";
39 
41 public:
42  // Constructor with matcherCode and error
43  explicit CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
44  : code(matcherCode), startOfLine(matcherCode), error(error) {
45  nextToken = getNextToken();
46  }
47 
48  // Constructor with matcherCode, error, and codeCompletionOffset
49  CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error,
50  unsigned codeCompletionOffset)
51  : code(matcherCode), startOfLine(matcherCode), error(error),
52  codeCompletionLocation(matcherCode.data() + codeCompletionOffset) {
53  nextToken = getNextToken();
54  }
55 
56  // Peek at next token without consuming it
57  const TokenInfo &peekNextToken() const { return nextToken; }
58 
59  // Consume and return the next token
61  TokenInfo thisToken = nextToken;
62  nextToken = getNextToken();
63  return thisToken;
64  }
65 
66  // Skip any newline tokens
68  while (nextToken.kind == TokenKind::NewLine)
69  nextToken = getNextToken();
70  return nextToken;
71  }
72 
73  // Consume and return next token, ignoring newlines
75  skipNewlines();
76  return nextToken.kind == TokenKind::Eof ? nextToken : consumeNextToken();
77  }
78 
79  // Return kind of next token
80  TokenKind nextTokenKind() const { return nextToken.kind; }
81 
82 private:
83  // Helper function to get the first character as a new StringRef and drop it
84  // from the original string
85  llvm::StringRef firstCharacterAndDrop(llvm::StringRef &str) {
86  assert(!str.empty());
87  llvm::StringRef firstChar = str.substr(0, 1);
88  str = str.drop_front();
89  return firstChar;
90  }
91 
92  // Get next token, consuming whitespaces and handling different token types
93  TokenInfo getNextToken() {
94  consumeWhitespace();
95  TokenInfo result;
96  result.range.start = currentLocation();
97 
98  // Code completion case
99  if (codeCompletionLocation && codeCompletionLocation <= code.data()) {
100  result.set(TokenKind::CodeCompletion,
101  llvm::StringRef(codeCompletionLocation, 0));
102  codeCompletionLocation = nullptr;
103  return result;
104  }
105 
106  // End of file case
107  if (code.empty()) {
108  result.set(TokenKind::Eof, "");
109  return result;
110  }
111 
112  // Switch to handle specific characters
113  switch (code[0]) {
114  case '#':
115  code = code.drop_until([](char c) { return c == '\n'; });
116  return getNextToken();
117  case ',':
118  result.set(TokenKind::Comma, firstCharacterAndDrop(code));
119  break;
120  case '.':
121  result.set(TokenKind::Period, firstCharacterAndDrop(code));
122  break;
123  case '\n':
124  ++line;
125  startOfLine = code.drop_front();
126  result.set(TokenKind::NewLine, firstCharacterAndDrop(code));
127  break;
128  case '(':
129  result.set(TokenKind::OpenParen, firstCharacterAndDrop(code));
130  break;
131  case ')':
132  result.set(TokenKind::CloseParen, firstCharacterAndDrop(code));
133  break;
134  case '"':
135  case '\'':
136  consumeStringLiteral(&result);
137  break;
138  case '0':
139  case '1':
140  case '2':
141  case '3':
142  case '4':
143  case '5':
144  case '6':
145  case '7':
146  case '8':
147  case '9':
148  consumeNumberLiteral(&result);
149  break;
150  default:
151  parseIdentifierOrInvalid(&result);
152  break;
153  }
154 
155  result.range.end = currentLocation();
156  return result;
157  }
158 
159  void consumeNumberLiteral(TokenInfo *result) {
160  StringRef original = code;
161  unsigned value = 0;
162  if (!code.consumeInteger(0, value)) {
163  size_t numConsumed = original.size() - code.size();
164  result->text = original.take_front(numConsumed);
165  result->kind = TokenKind::Literal;
166  result->value = static_cast<int64_t>(value);
167  return;
168  }
169  }
170 
171  // Consume a string literal, handle escape sequences and missing closing
172  // quote.
173  void consumeStringLiteral(TokenInfo *result) {
174  bool inEscape = false;
175  const char marker = code[0];
176  for (size_t length = 1; length < code.size(); ++length) {
177  if (inEscape) {
178  inEscape = false;
179  continue;
180  }
181  if (code[length] == '\\') {
182  inEscape = true;
183  continue;
184  }
185  if (code[length] == marker) {
186  result->kind = TokenKind::Literal;
187  result->text = code.substr(0, length + 1);
188  result->value = code.substr(1, length - 1);
189  code = code.drop_front(length + 1);
190  return;
191  }
192  }
193  llvm::StringRef errorText = code;
194  code = code.drop_front(code.size());
195  SourceRange range;
196  range.start = result->range.start;
197  range.end = currentLocation();
198  error->addError(range, ErrorType::ParserStringError) << errorText;
199  result->kind = TokenKind::Error;
200  }
201 
202  void parseIdentifierOrInvalid(TokenInfo *result) {
203  if (isalnum(code[0])) {
204  // Parse an identifier
205  size_t tokenLength = 1;
206 
207  while (true) {
208  // A code completion location in/immediately after an identifier will
209  // cause the portion of the identifier before the code completion
210  // location to become a code completion token.
211  if (codeCompletionLocation == code.data() + tokenLength) {
212  codeCompletionLocation = nullptr;
213  result->kind = TokenKind::CodeCompletion;
214  result->text = code.substr(0, tokenLength);
215  code = code.drop_front(tokenLength);
216  return;
217  }
218  if (tokenLength == code.size() || !(isalnum(code[tokenLength])))
219  break;
220  ++tokenLength;
221  }
222  llvm::StringRef token = code.substr(0, tokenLength);
223  code = code.drop_front(tokenLength);
224  // Check if the identifier is a boolean literal
225  if (token == "true") {
226  result->text = "false";
227  result->kind = TokenKind::Literal;
228  result->value = true;
229  } else if (token == "false") {
230  result->text = "false";
231  result->kind = TokenKind::Literal;
232  result->value = false;
233  } else {
234  // Otherwise it is treated as a normal identifier
235  result->kind = TokenKind::Ident;
236  result->text = token;
237  }
238  } else {
239  result->kind = TokenKind::InvalidChar;
240  result->text = code.substr(0, 1);
241  code = code.drop_front(1);
242  }
243  }
244 
245  // Consume all leading whitespace from code, except newlines
246  void consumeWhitespace() { code = code.ltrim(" \t\v\f\r"); }
247 
248  // Returns the current location in the source code
249  SourceLocation currentLocation() {
250  SourceLocation location;
251  location.line = line;
252  location.column = code.data() - startOfLine.data() + 1;
253  return location;
254  }
255 
256  llvm::StringRef code;
257  llvm::StringRef startOfLine;
258  unsigned line = 1;
259  Diagnostics *error;
260  TokenInfo nextToken;
261  const char *codeCompletionLocation = nullptr;
262 };
263 
264 Parser::Sema::~Sema() = default;
265 
267  llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
268  return {};
269 }
270 
271 std::vector<MatcherCompletion>
273  return {};
274 }
275 
276 // Entry for the scope of a parser
279 
280  ScopedContextEntry(Parser *parser, MatcherCtor c) : parser(parser) {
281  parser->contextStack.emplace_back(c, 0u);
282  }
283 
284  ~ScopedContextEntry() { parser->contextStack.pop_back(); }
285 
286  void nextArg() { ++parser->contextStack.back().second; }
287 };
288 
289 // Parse and validate expressions starting with an identifier.
290 // This function can parse named values and matchers. In case of failure, it
291 // will try to determine the user's intent to give an appropriate error message.
292 bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
293  const TokenInfo nameToken = tokenizer->consumeNextToken();
294 
295  if (tokenizer->nextTokenKind() != TokenKind::OpenParen) {
296  // Parse as a named value.
297  if (auto namedValue = namedValues ? namedValues->lookup(nameToken.text)
298  : VariantValue()) {
299 
300  if (tokenizer->nextTokenKind() != TokenKind::Period) {
301  *value = namedValue;
302  return true;
303  }
304 
305  if (!namedValue.isMatcher()) {
306  error->addError(tokenizer->peekNextToken().range,
308  return false;
309  }
310  }
311 
312  if (tokenizer->nextTokenKind() == TokenKind::NewLine) {
313  error->addError(tokenizer->peekNextToken().range,
315  << "NewLine";
316  return false;
317  }
318 
319  // If the syntax is correct and the name is not a matcher either, report
320  // an unknown named value.
321  if ((tokenizer->nextTokenKind() == TokenKind::Comma ||
322  tokenizer->nextTokenKind() == TokenKind::CloseParen ||
323  tokenizer->nextTokenKind() == TokenKind::NewLine ||
324  tokenizer->nextTokenKind() == TokenKind::Eof) &&
325  !sema->lookupMatcherCtor(nameToken.text)) {
326  error->addError(nameToken.range, ErrorType::RegistryValueNotFound)
327  << nameToken.text;
328  return false;
329  }
330  // Otherwise, fallback to the matcher parser.
331  }
332 
333  tokenizer->skipNewlines();
334 
335  assert(nameToken.kind == TokenKind::Ident);
336  TokenInfo openToken = tokenizer->consumeNextToken();
337  if (openToken.kind != TokenKind::OpenParen) {
338  error->addError(openToken.range, ErrorType::ParserNoOpenParen)
339  << openToken.text;
340  return false;
341  }
342 
343  std::optional<MatcherCtor> ctor = sema->lookupMatcherCtor(nameToken.text);
344 
345  // Parse as a matcher expression.
346  return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
347 }
348 
349 bool Parser::parseChainedExpression(std::string &argument) {
350  // Parse the parenthesized argument to .extract("foo")
351  // Note: EOF is handled inside the consume functions and would fail below when
352  // checking token kind.
353  const TokenInfo openToken = tokenizer->consumeNextToken();
354  const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
355  const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();
356 
357  if (openToken.kind != TokenKind::OpenParen) {
358  error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen);
359  return false;
360  }
361 
362  if (argumentToken.kind != TokenKind::Literal ||
363  !argumentToken.value.isString()) {
364  error->addError(argumentToken.range,
366  return false;
367  }
368 
369  if (closeToken.kind != TokenKind::CloseParen) {
370  error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen);
371  return false;
372  }
373 
374  // If all checks passed, extract the argument and return true.
375  argument = argumentToken.value.getString();
376  return true;
377 }
378 
379 // Parse the arguments of a matcher
380 bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
381  const TokenInfo &nameToken, TokenInfo &endToken) {
382  ScopedContextEntry sce(this, ctor);
383 
384  while (tokenizer->nextTokenKind() != TokenKind::Eof) {
385  if (tokenizer->nextTokenKind() == TokenKind::CloseParen) {
386  // end of args.
387  endToken = tokenizer->consumeNextToken();
388  break;
389  }
390 
391  if (!args.empty()) {
392  // We must find a , token to continue.
393  TokenInfo commaToken = tokenizer->consumeNextToken();
394  if (commaToken.kind != TokenKind::Comma) {
395  error->addError(commaToken.range, ErrorType::ParserNoComma)
396  << commaToken.text;
397  return false;
398  }
399  }
400 
401  ParserValue argValue;
402  tokenizer->skipNewlines();
403 
404  argValue.text = tokenizer->peekNextToken().text;
405  argValue.range = tokenizer->peekNextToken().range;
406  if (!parseExpressionImpl(&argValue.value)) {
407  return false;
408  }
409 
410  tokenizer->skipNewlines();
411  args.push_back(argValue);
412  sce.nextArg();
413  }
414 
415  return true;
416 }
417 
418 // Parse and validate a matcher expression.
419 bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
420  const TokenInfo &openToken,
421  std::optional<MatcherCtor> ctor,
422  VariantValue *value) {
423  if (!ctor) {
424  error->addError(nameToken.range, ErrorType::RegistryMatcherNotFound)
425  << nameToken.text;
426  // Do not return here. We need to continue to give completion suggestions.
427  }
428 
429  std::vector<ParserValue> args;
430  TokenInfo endToken;
431 
432  tokenizer->skipNewlines();
433 
434  if (!parseMatcherArgs(args, ctor.value_or(nullptr), nameToken, endToken)) {
435  return false;
436  }
437 
438  // Check for the missing closing parenthesis
439  if (endToken.kind != TokenKind::CloseParen) {
440  error->addError(openToken.range, ErrorType::ParserNoCloseParen)
441  << nameToken.text;
442  return false;
443  }
444 
445  std::string functionName;
446  if (tokenizer->peekNextToken().kind == TokenKind::Period) {
447  tokenizer->consumeNextToken();
448  TokenInfo chainCallToken = tokenizer->consumeNextToken();
449  if (chainCallToken.kind == TokenKind::CodeCompletion) {
450  addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract"));
451  return false;
452  }
453 
454  if (chainCallToken.kind != TokenKind::Ident ||
455  chainCallToken.text != TokenInfo::ID_Extract) {
456  error->addError(chainCallToken.range,
458  return false;
459  }
460 
461  if (chainCallToken.text == TokenInfo::ID_Extract &&
462  !parseChainedExpression(functionName))
463  return false;
464  }
465 
466  if (!ctor)
467  return false;
468  // Merge the start and end infos.
469  SourceRange matcherRange = nameToken.range;
470  matcherRange.end = endToken.range.end;
471  VariantMatcher result = sema->actOnMatcherExpression(
472  *ctor, matcherRange, functionName, args, error);
473  if (result.isNull())
474  return false;
475  *value = result;
476  return true;
477 }
478 
479 // If the prefix of this completion matches the completion token, add it to
480 // completions minus the prefix.
481 void Parser::addCompletion(const TokenInfo &compToken,
482  const MatcherCompletion &completion) {
483  if (llvm::StringRef(completion.typedText).starts_with(compToken.text)) {
484  completions.emplace_back(completion.typedText.substr(compToken.text.size()),
485  completion.matcherDecl);
486  }
487 }
488 
489 std::vector<MatcherCompletion>
490 Parser::getNamedValueCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
491  if (!namedValues)
492  return {};
493 
494  std::vector<MatcherCompletion> result;
495  for (const auto &entry : *namedValues) {
496  std::string decl =
497  (entry.getValue().getTypeAsString() + " " + entry.getKey()).str();
498  result.emplace_back(entry.getKey(), decl);
499  }
500  return result;
501 }
502 
503 void Parser::addExpressionCompletions() {
504  const TokenInfo compToken = tokenizer->consumeNextTokenIgnoreNewlines();
505  assert(compToken.kind == TokenKind::CodeCompletion);
506 
507  // We cannot complete code if there is an invalid element on the context
508  // stack.
509  for (const auto &entry : contextStack) {
510  if (!entry.first)
511  return;
512  }
513 
514  auto acceptedTypes = sema->getAcceptedCompletionTypes(contextStack);
515  for (const auto &completion : sema->getMatcherCompletions(acceptedTypes)) {
516  addCompletion(compToken, completion);
517  }
518 
519  for (const auto &completion : getNamedValueCompletions(acceptedTypes)) {
520  addCompletion(compToken, completion);
521  }
522 }
523 
524 // Parse an <Expresssion>
525 bool Parser::parseExpressionImpl(VariantValue *value) {
526  switch (tokenizer->nextTokenKind()) {
527  case TokenKind::Literal:
528  *value = tokenizer->consumeNextToken().value;
529  return true;
530  case TokenKind::Ident:
531  return parseIdentifierPrefixImpl(value);
533  addExpressionCompletions();
534  return false;
535  case TokenKind::Eof:
536  error->addError(tokenizer->consumeNextToken().range,
538  return false;
539 
540  case TokenKind::Error:
541  // This error was already reported by the tokenizer.
542  return false;
543  case TokenKind::NewLine:
546  case TokenKind::Comma:
547  case TokenKind::Period:
549  const TokenInfo token = tokenizer->consumeNextToken();
550  error->addError(token.range, ErrorType::ParserInvalidToken)
551  << (token.kind == TokenKind::NewLine ? "NewLine" : token.text);
552  return false;
553  }
554 
555  llvm_unreachable("Unknown token kind.");
556 }
557 
558 Parser::Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
559  const NamedValueMap *namedValues, Diagnostics *error)
560  : tokenizer(tokenizer),
561  sema(std::make_unique<RegistrySema>(matcherRegistry)),
562  namedValues(namedValues), error(error) {}
563 
565 
566 std::optional<MatcherCtor>
567 Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
568  return RegistryManager::lookupMatcherCtor(matcherName, matcherRegistry);
569 }
570 
572  MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
574  return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
575  error);
576 }
577 
579  llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
581 }
582 
583 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
584  llvm::ArrayRef<ArgKind> acceptedTypes) {
585  return RegistryManager::getMatcherCompletions(acceptedTypes, matcherRegistry);
586 }
587 
588 bool Parser::parseExpression(llvm::StringRef &code,
589  const Registry &matcherRegistry,
590  const NamedValueMap *namedValues,
591  VariantValue *value, Diagnostics *error) {
592  CodeTokenizer tokenizer(code, error);
593  Parser parser(&tokenizer, matcherRegistry, namedValues, error);
594  if (!parser.parseExpressionImpl(value))
595  return false;
596  auto nextToken = tokenizer.peekNextToken();
597  if (nextToken.kind != TokenKind::Eof &&
598  nextToken.kind != TokenKind::NewLine) {
599  error->addError(tokenizer.peekNextToken().range,
601  return false;
602  }
603  return true;
604 }
605 
606 std::vector<MatcherCompletion>
607 Parser::completeExpression(llvm::StringRef &code, unsigned completionOffset,
608  const Registry &matcherRegistry,
609  const NamedValueMap *namedValues) {
610  Diagnostics error;
611  CodeTokenizer tokenizer(code, &error, completionOffset);
612  Parser parser(&tokenizer, matcherRegistry, namedValues, &error);
613  VariantValue dummy;
614  parser.parseExpressionImpl(&dummy);
615 
616  return parser.completions;
617 }
618 
619 std::optional<DynMatcher> Parser::parseMatcherExpression(
620  llvm::StringRef &code, const Registry &matcherRegistry,
621  const NamedValueMap *namedValues, Diagnostics *error) {
622  VariantValue value;
623  if (!parseExpression(code, matcherRegistry, namedValues, &value, error))
624  return std::nullopt;
625  if (!value.isMatcher()) {
627  return std::nullopt;
628  }
629  std::optional<DynMatcher> result = value.getMatcher().getDynMatcher();
630  if (!result) {
632  << value.getTypeAsString();
633  }
634  return result;
635 }
636 
637 } // namespace mlir::query::matcher::internal
static std::vector< MatcherCompletion > getMatcherCompletions(ArrayRef< ArgKind > acceptedTypes, const Registry &matcherRegistry)
static std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned >> context)
static std::optional< MatcherCtor > lookupMatcherCtor(llvm::StringRef matcherName, const Registry &matcherRegistry)
static VariantMatcher constructMatcher(MatcherCtor ctor, internal::SourceRange nameRange, llvm::StringRef functionName, ArrayRef< ParserValue > args, internal::Diagnostics *error)
std::optional< DynMatcher > getDynMatcher() const
const VariantMatcher & getMatcher() const
ArgStream addError(SourceRange range, ErrorType error)
Definition: Diagnostics.cpp:20
CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error, unsigned codeCompletionOffset)
Definition: Parser.cpp:49
CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
Definition: Parser.cpp:43
std::optional< MatcherCtor > lookupMatcherCtor(llvm::StringRef matcherName) override
Definition: Parser.cpp:567
std::vector< MatcherCompletion > getMatcherCompletions(llvm::ArrayRef< ArgKind > acceptedTypes) override
Definition: Parser.cpp:583
std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned >> context) override
Definition: Parser.cpp:578
VariantMatcher actOnMatcherExpression(MatcherCtor Ctor, SourceRange NameRange, StringRef functionName, ArrayRef< ParserValue > Args, Diagnostics *Error) override
Definition: Parser.cpp:571
virtual std::vector< MatcherCompletion > getMatcherCompletions(llvm::ArrayRef< ArgKind > acceptedTypes)
Definition: Parser.cpp:272
virtual std::vector< ArgKind > getAcceptedCompletionTypes(llvm::ArrayRef< std::pair< MatcherCtor, unsigned >> Context)
Definition: Parser.cpp:266
static bool parseExpression(llvm::StringRef &code, const Registry &matcherRegistry, const NamedValueMap *namedValues, VariantValue *value, Diagnostics *error)
Definition: Parser.cpp:588
llvm::StringMap< VariantValue > NamedValueMap
Definition: Parser.h:114
static std::vector< MatcherCompletion > completeExpression(llvm::StringRef &code, unsigned completionOffset, const Registry &matcherRegistry, const NamedValueMap *namedValues)
Definition: Parser.cpp:607
static std::optional< DynMatcher > parseMatcherExpression(llvm::StringRef &matcherCode, const Registry &matcherRegistry, const NamedValueMap *namedValues, Diagnostics *error)
Definition: Parser.cpp:619
const internal::MatcherDescriptor * MatcherCtor
void set(TokenKind newKind, llvm::StringRef newText)
Definition: Parser.cpp:24