MLIR 23.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the lexer for the MLIR textual form.
10//
11//===----------------------------------------------------------------------===//
12
13#include "Lexer.h"
14#include "Token.h"
16#include "mlir/IR/Diagnostics.h"
17#include "mlir/IR/Location.h"
18#include "mlir/IR/MLIRContext.h"
19#include "mlir/Support/LLVM.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/StringExtras.h"
22#include "llvm/ADT/StringSwitch.h"
23#include "llvm/Support/ErrorHandling.h"
24#include "llvm/Support/SourceMgr.h"
25#include <cassert>
26#include <cctype>
27
28using namespace mlir;
29
/// Reports whether `c` is one of the punctuation characters permitted inside
/// an identifier suffix, i.e. one of `$`, `.`, `_` or `-`.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
35
36Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
37 AsmParserCodeCompleteContext *codeCompleteContext)
38 : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
39 auto bufferID = sourceMgr.getMainFileID();
40
41 // Check to see if the main buffer contains the last buffer, and if so the
42 // last buffer should be used as main file for parsing.
43 if (sourceMgr.getNumBuffers() > 1) {
44 unsigned lastFileID = sourceMgr.getNumBuffers();
45 const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
46 const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
47 if (main->getBufferStart() <= last->getBufferStart() &&
48 main->getBufferEnd() >= last->getBufferEnd()) {
49 bufferID = lastFileID;
50 }
51 }
52 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
53 curPtr = curBuffer.begin();
54
55 // Set the code completion location if it was provided.
56 if (codeCompleteContext)
57 codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
58}
59
60/// Encode the specified source location information into an attribute for
61/// attachment to the IR.
63 auto &sourceMgr = getSourceMgr();
64 unsigned mainFileID = sourceMgr.getMainFileID();
65
66 auto [lineNo, column] = sourceMgr.getLineAndColumn(loc);
67 auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
68
69 return FileLineColLoc::get(context, buffer->getBufferIdentifier(), lineNo,
70 column);
71}
72
73/// emitError - Emit an error message and return an Token::error token.
74Token Lexer::emitError(const char *loc, const Twine &message) {
75 mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
76 message);
77 return formToken(Token::error, loc);
78}
79
81 const char *curBufferEnd = curBuffer.end();
82 while (true) {
83 const char *tokStart = curPtr;
84
85 // Check to see if the current token is at the code completion location.
86 if (tokStart == codeCompleteLoc)
87 return formToken(Token::code_complete, tokStart);
88
89 if (tokStart == curBufferEnd)
90 return formToken(Token::eof, tokStart);
91
92 // Lex the next token.
93 switch (*curPtr++) {
94 default:
95 // Handle bare identifiers.
96 if (isalpha(curPtr[-1]))
97 return lexBareIdentifierOrKeyword(tokStart);
98
99 // Unknown character, emit an error.
100 return emitError(tokStart, "unexpected character");
101
102 case ' ':
103 case '\t':
104 case '\n':
105 case '\r':
106 // Handle whitespace.
107 continue;
108
109 case '_':
110 // Handle bare identifiers.
111 return lexBareIdentifierOrKeyword(tokStart);
112
113 case 0:
114 // This may either be a nul character in the source file or may be the EOF
115 // marker that llvm::MemoryBuffer guarantees will be there.
116 if (curPtr - 1 == curBufferEnd)
117 return formToken(Token::eof, tokStart);
118 continue;
119
120 case ':':
121 return formToken(Token::colon, tokStart);
122 case ',':
123 return formToken(Token::comma, tokStart);
124 case '.':
125 return lexEllipsis(tokStart);
126 case '(':
127 return formToken(Token::l_paren, tokStart);
128 case ')':
129 return formToken(Token::r_paren, tokStart);
130 case '{':
131 if (*curPtr == '-' && *(curPtr + 1) == '#') {
132 curPtr += 2;
133 return formToken(Token::file_metadata_begin, tokStart);
134 }
135 return formToken(Token::l_brace, tokStart);
136 case '}':
137 return formToken(Token::r_brace, tokStart);
138 case '[':
139 return formToken(Token::l_square, tokStart);
140 case ']':
141 return formToken(Token::r_square, tokStart);
142 case '<':
143 return formToken(Token::less, tokStart);
144 case '>':
145 return formToken(Token::greater, tokStart);
146 case '=':
147 return formToken(Token::equal, tokStart);
148
149 case '+':
150 return formToken(Token::plus, tokStart);
151 case '*':
152 return formToken(Token::star, tokStart);
153 case '-':
154 if (*curPtr == '>') {
155 ++curPtr;
156 return formToken(Token::arrow, tokStart);
157 }
158 return formToken(Token::minus, tokStart);
159
160 case '?':
161 return formToken(Token::question, tokStart);
162
163 case '|':
164 return formToken(Token::vertical_bar, tokStart);
165
166 case '/':
167 if (*curPtr == '/') {
168 skipComment();
169 continue;
170 }
171 return formToken(Token::slash, tokStart);
172
173 case '@':
174 return lexAtIdentifier(tokStart);
175
176 case '#':
177 if (*curPtr == '-' && *(curPtr + 1) == '}') {
178 curPtr += 2;
179 return formToken(Token::file_metadata_end, tokStart);
180 }
181 [[fallthrough]];
182 case '!':
183 case '^':
184 case '%':
185 return lexPrefixedIdentifier(tokStart);
186 case '"':
187 return lexString(tokStart);
188
189 case '0':
190 case '1':
191 case '2':
192 case '3':
193 case '4':
194 case '5':
195 case '6':
196 case '7':
197 case '8':
198 case '9':
199 return lexNumber(tokStart);
200 }
201 }
202}
203
204/// Lex an '@foo' identifier.
205///
206/// symbol-ref-id ::= `@` (bare-id | string-literal)
207///
208Token Lexer::lexAtIdentifier(const char *tokStart) {
209 char cur = *curPtr++;
210
211 // Try to parse a string literal, if present.
212 if (cur == '"') {
213 Token stringIdentifier = lexString(curPtr);
214 if (stringIdentifier.is(Token::error))
215 return stringIdentifier;
216 return formToken(Token::at_identifier, tokStart);
217 }
218
219 // Otherwise, these always start with a letter or underscore.
220 if (!isalpha(cur) && cur != '_')
221 return emitError(curPtr - 1,
222 "@ identifier expected to start with letter or '_'");
223
224 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
225 *curPtr == '$' || *curPtr == '.')
226 ++curPtr;
227 return formToken(Token::at_identifier, tokStart);
228}
229
230/// Lex a bare identifier or keyword that starts with a letter.
231///
232/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
233/// integer-type ::= `[su]?i[1-9][0-9]*`
234///
235Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
236 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
237 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
238 *curPtr == '$' || *curPtr == '.')
239 ++curPtr;
240
241 // Check to see if this identifier is a keyword.
242 StringRef spelling(tokStart, curPtr - tokStart);
243
244 auto isAllDigit = [](StringRef str) {
245 return llvm::all_of(str, llvm::isDigit);
246 };
247
248 // Check for i123, si456, ui789.
249 if ((spelling.size() > 1 && tokStart[0] == 'i' &&
250 isAllDigit(spelling.drop_front())) ||
251 ((spelling.size() > 2 && tokStart[1] == 'i' &&
252 (tokStart[0] == 's' || tokStart[0] == 'u')) &&
253 isAllDigit(spelling.drop_front(2))))
254 return Token(Token::inttype, spelling);
255
257#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
258#include "TokenKinds.def"
259 .Default(Token::bare_identifier);
260
261 return Token(kind, spelling);
262}
263
264/// Skip a comment line, starting with a '//'.
265///
266/// TODO: add a regex for comments here and to the spec.
267///
268void Lexer::skipComment() {
269 // Advance over the second '/' in a '//' comment.
270 assert(*curPtr == '/');
271 ++curPtr;
272
273 const char *curBufferEnd = curBuffer.end();
274 while (true) {
275 if (curPtr == curBufferEnd)
276 return;
277
278 switch (*curPtr++) {
279 case '\n':
280 case '\r':
281 // Newline is end of comment.
282 return;
283 case 0:
284 // If this is the end of the buffer, end the comment.
285 if (curPtr - 1 == curBufferEnd) {
286 --curPtr;
287 return;
288 }
289 [[fallthrough]];
290 default:
291 // Skip over other characters.
292 break;
293 }
294 }
295}
296
297/// Lex an ellipsis.
298///
299/// ellipsis ::= '...'
300///
301Token Lexer::lexEllipsis(const char *tokStart) {
302 assert(curPtr[-1] == '.');
303
304 if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
305 return emitError(curPtr, "expected three consecutive dots for an ellipsis");
306
307 curPtr += 2;
308 return formToken(Token::ellipsis, tokStart);
309}
310
311/// Lex a number literal.
312///
313/// integer-literal ::= digit+ | `0x` hex_digit+
314/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
315///
316Token Lexer::lexNumber(const char *tokStart) {
317 assert(isdigit(curPtr[-1]));
318
319 // Handle the hexadecimal case.
320 if (curPtr[-1] == '0' && *curPtr == 'x') {
321 // If we see stuff like 0xi32, this is a literal `0` followed by an
322 // identifier `xi32`, stop after `0`.
323 if (!isxdigit(curPtr[1]))
324 return formToken(Token::integer, tokStart);
325
326 curPtr += 2;
327 while (isxdigit(*curPtr))
328 ++curPtr;
329
330 return formToken(Token::integer, tokStart);
331 }
332
333 // Handle the normal decimal case.
334 while (isdigit(*curPtr))
335 ++curPtr;
336
337 if (*curPtr != '.')
338 return formToken(Token::integer, tokStart);
339 ++curPtr;
340
341 // Skip over [0-9]*([eE][-+]?[0-9]+)?
342 while (isdigit(*curPtr))
343 ++curPtr;
344
345 if (*curPtr == 'e' || *curPtr == 'E') {
346 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
347 ((curPtr[1] == '-' || curPtr[1] == '+') &&
348 isdigit(static_cast<unsigned char>(curPtr[2])))) {
349 curPtr += 2;
350 while (isdigit(*curPtr))
351 ++curPtr;
352 }
353 }
354 return formToken(Token::floatliteral, tokStart);
355}
356
357/// Lex an identifier that starts with a prefix followed by suffix-id.
358///
359/// attribute-id ::= `#` suffix-id
360/// ssa-id ::= '%' suffix-id
361/// block-id ::= '^' suffix-id
362/// type-id ::= '!' suffix-id
363/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
364/// id-punct ::= `$` | `.` | `_` | `-`
365///
366Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
367 Token::Kind kind;
368 StringRef errorKind;
369 switch (*tokStart) {
370 case '#':
371 kind = Token::hash_identifier;
372 errorKind = "invalid attribute name";
373 break;
374 case '%':
375 kind = Token::percent_identifier;
376 errorKind = "invalid SSA name";
377 break;
378 case '^':
379 kind = Token::caret_identifier;
380 errorKind = "invalid block name";
381 break;
382 case '!':
383 kind = Token::exclamation_identifier;
384 errorKind = "invalid type identifier";
385 break;
386 default:
387 llvm_unreachable("invalid caller");
388 }
389
390 // Parse suffix-id.
391 if (isdigit(*curPtr)) {
392 // If suffix-id starts with a digit, the rest must be digits.
393 while (isdigit(*curPtr))
394 ++curPtr;
395 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
396 do {
397 ++curPtr;
398 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
399 } else if (curPtr == codeCompleteLoc) {
400 return formToken(Token::code_complete, tokStart);
401 } else {
402 return emitError(curPtr - 1, errorKind);
403 }
404
405 // Check for a code completion within the identifier.
406 if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
407 codeCompleteLoc <= curPtr) {
408 return Token(Token::code_complete,
409 StringRef(tokStart, codeCompleteLoc - tokStart));
410 }
411
412 return formToken(kind, tokStart);
413}
414
415/// Lex a string literal.
416///
417/// string-literal ::= '"' [^"\n\f\v\r]* '"'
418///
419/// TODO: define escaping rules.
420Token Lexer::lexString(const char *tokStart) {
421 assert(curPtr[-1] == '"');
422
423 const char *curBufferEnd = curBuffer.end();
424 while (true) {
425 // Check to see if there is a code completion location within the string. In
426 // these cases we generate a completion location and place the currently
427 // lexed string within the token. This allows for the parser to use the
428 // partially lexed string when computing the completion results.
429 if (curPtr == codeCompleteLoc)
430 return formToken(Token::code_complete, tokStart);
431
432 switch (*curPtr++) {
433 case '"':
434 return formToken(Token::string, tokStart);
435 case 0:
436 // If this is a random nul character in the middle of a string, just
437 // include it. If it is the end of file, then it is an error.
438 if (curPtr - 1 != curBufferEnd)
439 continue;
440 [[fallthrough]];
441 case '\n':
442 case '\v':
443 case '\f':
444 return emitError(curPtr - 1, "expected '\"' in string literal");
445 case '\\':
446 // Handle explicitly a few escapes.
447 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
448 ++curPtr;
449 else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
450 // Support \xx for two hex digits.
451 curPtr += 2;
452 else
453 return emitError(curPtr - 1, "unknown escape in string literal");
454 continue;
455
456 default:
457 continue;
458 }
459 }
460}
static bool isPunct(char c)
Definition Lexer.cpp:32
This class provides an abstract interface into the parser for hooking in code completion events.
static FileLineColLoc get(StringAttr filename, unsigned line, unsigned column)
Definition Location.cpp:157
Token lexToken()
Definition Lexer.cpp:80
Location getEncodedSourceLocation(SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition Lexer.cpp:62
const llvm::SourceMgr & getSourceMgr()
Definition Lexer.h:28
Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context, AsmParserCodeCompleteContext *codeCompleteContext)
Definition Lexer.cpp:36
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This represents a token in the MLIR syntax.
Definition Token.h:20
bool is(Kind k) const
Definition Token.h:38
Include the generated interface declarations.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
llvm::StringSwitch< T, R > StringSwitch
Definition LLVM.h:136