MLIR  22.0.0git
Lexer.cpp
Go to the documentation of this file.
1 //===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the lexer for the MLIR textual form.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "Lexer.h"
14 #include "Token.h"
16 #include "mlir/IR/Diagnostics.h"
17 #include "mlir/IR/Location.h"
18 #include "mlir/IR/MLIRContext.h"
19 #include "mlir/Support/LLVM.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/ADT/StringSwitch.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/SourceMgr.h"
25 #include <cassert>
26 #include <cctype>
27 
28 using namespace mlir;
29 
// Reports whether 'c' is one of the punctuation characters permitted inside
// an identifier suffix, i.e. one of: '$', '.', '_', '-'.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
35 
36 Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
37  AsmParserCodeCompleteContext *codeCompleteContext)
38  : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
39  auto bufferID = sourceMgr.getMainFileID();
40 
41  // Check to see if the main buffer contains the last buffer, and if so the
42  // last buffer should be used as main file for parsing.
43  if (sourceMgr.getNumBuffers() > 1) {
44  unsigned lastFileID = sourceMgr.getNumBuffers();
45  const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
46  const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
47  if (main->getBufferStart() <= last->getBufferStart() &&
48  main->getBufferEnd() >= last->getBufferEnd()) {
49  bufferID = lastFileID;
50  }
51  }
52  curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
53  curPtr = curBuffer.begin();
54 
55  // Set the code completion location if it was provided.
56  if (codeCompleteContext)
57  codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
58 }
59 
60 /// Encode the specified source location information into an attribute for
61 /// attachment to the IR.
63  auto &sourceMgr = getSourceMgr();
64  unsigned mainFileID = sourceMgr.getMainFileID();
65 
66  // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
67  // use it here.
68  auto &bufferInfo = sourceMgr.getBufferInfo(mainFileID);
69  unsigned lineNo = bufferInfo.getLineNumber(loc.getPointer());
70  unsigned column =
71  (loc.getPointer() - bufferInfo.getPointerForLineNumber(lineNo)) + 1;
72  auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
73 
74  return FileLineColLoc::get(context, buffer->getBufferIdentifier(), lineNo,
75  column);
76 }
77 
78 /// emitError - Emit an error message and return an Token::error token.
79 Token Lexer::emitError(const char *loc, const Twine &message) {
80  mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
81  message);
82  return formToken(Token::error, loc);
83 }
84 
86  const char *curBufferEnd = curBuffer.end();
87  while (true) {
88  const char *tokStart = curPtr;
89 
90  // Check to see if the current token is at the code completion location.
91  if (tokStart == codeCompleteLoc)
92  return formToken(Token::code_complete, tokStart);
93 
94  if (tokStart == curBufferEnd)
95  return formToken(Token::eof, tokStart);
96 
97  // Lex the next token.
98  switch (*curPtr++) {
99  default:
100  // Handle bare identifiers.
101  if (isalpha(curPtr[-1]))
102  return lexBareIdentifierOrKeyword(tokStart);
103 
104  // Unknown character, emit an error.
105  return emitError(tokStart, "unexpected character");
106 
107  case ' ':
108  case '\t':
109  case '\n':
110  case '\r':
111  // Handle whitespace.
112  continue;
113 
114  case '_':
115  // Handle bare identifiers.
116  return lexBareIdentifierOrKeyword(tokStart);
117 
118  case 0:
119  // This may either be a nul character in the source file or may be the EOF
120  // marker that llvm::MemoryBuffer guarantees will be there.
121  if (curPtr - 1 == curBufferEnd)
122  return formToken(Token::eof, tokStart);
123  continue;
124 
125  case ':':
126  return formToken(Token::colon, tokStart);
127  case ',':
128  return formToken(Token::comma, tokStart);
129  case '.':
130  return lexEllipsis(tokStart);
131  case '(':
132  return formToken(Token::l_paren, tokStart);
133  case ')':
134  return formToken(Token::r_paren, tokStart);
135  case '{':
136  if (*curPtr == '-' && *(curPtr + 1) == '#') {
137  curPtr += 2;
138  return formToken(Token::file_metadata_begin, tokStart);
139  }
140  return formToken(Token::l_brace, tokStart);
141  case '}':
142  return formToken(Token::r_brace, tokStart);
143  case '[':
144  return formToken(Token::l_square, tokStart);
145  case ']':
146  return formToken(Token::r_square, tokStart);
147  case '<':
148  return formToken(Token::less, tokStart);
149  case '>':
150  return formToken(Token::greater, tokStart);
151  case '=':
152  return formToken(Token::equal, tokStart);
153 
154  case '+':
155  return formToken(Token::plus, tokStart);
156  case '*':
157  return formToken(Token::star, tokStart);
158  case '-':
159  if (*curPtr == '>') {
160  ++curPtr;
161  return formToken(Token::arrow, tokStart);
162  }
163  return formToken(Token::minus, tokStart);
164 
165  case '?':
166  return formToken(Token::question, tokStart);
167 
168  case '|':
169  return formToken(Token::vertical_bar, tokStart);
170 
171  case '/':
172  if (*curPtr == '/') {
173  skipComment();
174  continue;
175  }
176  return formToken(Token::slash, tokStart);
177 
178  case '@':
179  return lexAtIdentifier(tokStart);
180 
181  case '#':
182  if (*curPtr == '-' && *(curPtr + 1) == '}') {
183  curPtr += 2;
184  return formToken(Token::file_metadata_end, tokStart);
185  }
186  [[fallthrough]];
187  case '!':
188  case '^':
189  case '%':
190  return lexPrefixedIdentifier(tokStart);
191  case '"':
192  return lexString(tokStart);
193 
194  case '0':
195  case '1':
196  case '2':
197  case '3':
198  case '4':
199  case '5':
200  case '6':
201  case '7':
202  case '8':
203  case '9':
204  return lexNumber(tokStart);
205  }
206  }
207 }
208 
209 /// Lex an '@foo' identifier.
210 ///
211 /// symbol-ref-id ::= `@` (bare-id | string-literal)
212 ///
213 Token Lexer::lexAtIdentifier(const char *tokStart) {
214  char cur = *curPtr++;
215 
216  // Try to parse a string literal, if present.
217  if (cur == '"') {
218  Token stringIdentifier = lexString(curPtr);
219  if (stringIdentifier.is(Token::error))
220  return stringIdentifier;
221  return formToken(Token::at_identifier, tokStart);
222  }
223 
224  // Otherwise, these always start with a letter or underscore.
225  if (!isalpha(cur) && cur != '_')
226  return emitError(curPtr - 1,
227  "@ identifier expected to start with letter or '_'");
228 
229  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
230  *curPtr == '$' || *curPtr == '.')
231  ++curPtr;
232  return formToken(Token::at_identifier, tokStart);
233 }
234 
235 /// Lex a bare identifier or keyword that starts with a letter.
236 ///
237 /// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
238 /// integer-type ::= `[su]?i[1-9][0-9]*`
239 ///
240 Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
241  // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
242  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
243  *curPtr == '$' || *curPtr == '.')
244  ++curPtr;
245 
246  // Check to see if this identifier is a keyword.
247  StringRef spelling(tokStart, curPtr - tokStart);
248 
249  auto isAllDigit = [](StringRef str) {
250  return llvm::all_of(str, llvm::isDigit);
251  };
252 
253  // Check for i123, si456, ui789.
254  if ((spelling.size() > 1 && tokStart[0] == 'i' &&
255  isAllDigit(spelling.drop_front())) ||
256  ((spelling.size() > 2 && tokStart[1] == 'i' &&
257  (tokStart[0] == 's' || tokStart[0] == 'u')) &&
258  isAllDigit(spelling.drop_front(2))))
259  return Token(Token::inttype, spelling);
260 
262 #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
263 #include "TokenKinds.def"
264  .Default(Token::bare_identifier);
265 
266  return Token(kind, spelling);
267 }
268 
269 /// Skip a comment line, starting with a '//'.
270 ///
271 /// TODO: add a regex for comments here and to the spec.
272 ///
273 void Lexer::skipComment() {
274  // Advance over the second '/' in a '//' comment.
275  assert(*curPtr == '/');
276  ++curPtr;
277 
278  const char *curBufferEnd = curBuffer.end();
279  while (true) {
280  if (curPtr == curBufferEnd)
281  return;
282 
283  switch (*curPtr++) {
284  case '\n':
285  case '\r':
286  // Newline is end of comment.
287  return;
288  case 0:
289  // If this is the end of the buffer, end the comment.
290  if (curPtr - 1 == curBufferEnd) {
291  --curPtr;
292  return;
293  }
294  [[fallthrough]];
295  default:
296  // Skip over other characters.
297  break;
298  }
299  }
300 }
301 
302 /// Lex an ellipsis.
303 ///
304 /// ellipsis ::= '...'
305 ///
306 Token Lexer::lexEllipsis(const char *tokStart) {
307  assert(curPtr[-1] == '.');
308 
309  if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
310  return emitError(curPtr, "expected three consecutive dots for an ellipsis");
311 
312  curPtr += 2;
313  return formToken(Token::ellipsis, tokStart);
314 }
315 
316 /// Lex a number literal.
317 ///
318 /// integer-literal ::= digit+ | `0x` hex_digit+
319 /// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
320 ///
321 Token Lexer::lexNumber(const char *tokStart) {
322  assert(isdigit(curPtr[-1]));
323 
324  // Handle the hexadecimal case.
325  if (curPtr[-1] == '0' && *curPtr == 'x') {
326  // If we see stuff like 0xi32, this is a literal `0` followed by an
327  // identifier `xi32`, stop after `0`.
328  if (!isxdigit(curPtr[1]))
329  return formToken(Token::integer, tokStart);
330 
331  curPtr += 2;
332  while (isxdigit(*curPtr))
333  ++curPtr;
334 
335  return formToken(Token::integer, tokStart);
336  }
337 
338  // Handle the normal decimal case.
339  while (isdigit(*curPtr))
340  ++curPtr;
341 
342  if (*curPtr != '.')
343  return formToken(Token::integer, tokStart);
344  ++curPtr;
345 
346  // Skip over [0-9]*([eE][-+]?[0-9]+)?
347  while (isdigit(*curPtr))
348  ++curPtr;
349 
350  if (*curPtr == 'e' || *curPtr == 'E') {
351  if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
352  ((curPtr[1] == '-' || curPtr[1] == '+') &&
353  isdigit(static_cast<unsigned char>(curPtr[2])))) {
354  curPtr += 2;
355  while (isdigit(*curPtr))
356  ++curPtr;
357  }
358  }
359  return formToken(Token::floatliteral, tokStart);
360 }
361 
362 /// Lex an identifier that starts with a prefix followed by suffix-id.
363 ///
364 /// attribute-id ::= `#` suffix-id
365 /// ssa-id ::= '%' suffix-id
366 /// block-id ::= '^' suffix-id
367 /// type-id ::= '!' suffix-id
368 /// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
369 /// id-punct ::= `$` | `.` | `_` | `-`
370 ///
371 Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
373  StringRef errorKind;
374  switch (*tokStart) {
375  case '#':
376  kind = Token::hash_identifier;
377  errorKind = "invalid attribute name";
378  break;
379  case '%':
380  kind = Token::percent_identifier;
381  errorKind = "invalid SSA name";
382  break;
383  case '^':
384  kind = Token::caret_identifier;
385  errorKind = "invalid block name";
386  break;
387  case '!':
388  kind = Token::exclamation_identifier;
389  errorKind = "invalid type identifier";
390  break;
391  default:
392  llvm_unreachable("invalid caller");
393  }
394 
395  // Parse suffix-id.
396  if (isdigit(*curPtr)) {
397  // If suffix-id starts with a digit, the rest must be digits.
398  while (isdigit(*curPtr))
399  ++curPtr;
400  } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
401  do {
402  ++curPtr;
403  } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
404  } else if (curPtr == codeCompleteLoc) {
405  return formToken(Token::code_complete, tokStart);
406  } else {
407  return emitError(curPtr - 1, errorKind);
408  }
409 
410  // Check for a code completion within the identifier.
411  if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
412  codeCompleteLoc <= curPtr) {
413  return Token(Token::code_complete,
414  StringRef(tokStart, codeCompleteLoc - tokStart));
415  }
416 
417  return formToken(kind, tokStart);
418 }
419 
420 /// Lex a string literal.
421 ///
422 /// string-literal ::= '"' [^"\n\f\v\r]* '"'
423 ///
424 /// TODO: define escaping rules.
425 Token Lexer::lexString(const char *tokStart) {
426  assert(curPtr[-1] == '"');
427 
428  const char *curBufferEnd = curBuffer.end();
429  while (true) {
430  // Check to see if there is a code completion location within the string. In
431  // these cases we generate a completion location and place the currently
432  // lexed string within the token. This allows for the parser to use the
433  // partially lexed string when computing the completion results.
434  if (curPtr == codeCompleteLoc)
435  return formToken(Token::code_complete, tokStart);
436 
437  switch (*curPtr++) {
438  case '"':
439  return formToken(Token::string, tokStart);
440  case 0:
441  // If this is a random nul character in the middle of a string, just
442  // include it. If it is the end of file, then it is an error.
443  if (curPtr - 1 != curBufferEnd)
444  continue;
445  [[fallthrough]];
446  case '\n':
447  case '\v':
448  case '\f':
449  return emitError(curPtr - 1, "expected '\"' in string literal");
450  case '\\':
451  // Handle explicitly a few escapes.
452  if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
453  ++curPtr;
454  else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
455  // Support \xx for two hex digits.
456  curPtr += 2;
457  else
458  return emitError(curPtr - 1, "unknown escape in string literal");
459  continue;
460 
461  default:
462  continue;
463  }
464  }
465 }
static bool isPunct(char c)
Definition: Lexer.cpp:32
union mlir::linalg::@1243::ArityGroupAndKind::Kind kind
This class provides an abstract interface into the parser for hooking in code completion events.
Definition: CodeComplete.h:24
SMLoc getCodeCompleteLoc() const
Return the source location used to provide code completion.
Definition: CodeComplete.h:29
static FileLineColLoc get(StringAttr filename, unsigned line, unsigned column)
Definition: Location.cpp:157
Token lexToken()
Definition: Lexer.cpp:85
const llvm::SourceMgr & getSourceMgr()
Definition: Lexer.h:28
Location getEncodedSourceLocation(SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition: Lexer.cpp:62
Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context, AsmParserCodeCompleteContext *codeCompleteContext)
Definition: Lexer.cpp:36
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:63
This represents a token in the MLIR syntax.
Definition: Token.h:20
bool is(Kind k) const
Definition: Token.h:38
Include the generated interface declarations.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.