MLIR 22.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the lexer for the MLIR textual form.
10//
11//===----------------------------------------------------------------------===//
12
13#include "Lexer.h"
14#include "Token.h"
16#include "mlir/IR/Diagnostics.h"
17#include "mlir/IR/Location.h"
18#include "mlir/IR/MLIRContext.h"
19#include "mlir/Support/LLVM.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/StringExtras.h"
22#include "llvm/ADT/StringSwitch.h"
23#include "llvm/Support/ErrorHandling.h"
24#include "llvm/Support/SourceMgr.h"
25#include <cassert>
26#include <cctype>
27
28using namespace mlir;
29
/// Reports whether `c` is one of the punctuation characters accepted inside
/// a prefixed identifier suffix: `$`, `.`, `_` or `-`.
static bool isPunct(char c) {
  switch (c) {
  case '$':
  case '.':
  case '_':
  case '-':
    return true;
  default:
    return false;
  }
}
35
36Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
37 AsmParserCodeCompleteContext *codeCompleteContext)
38 : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
39 auto bufferID = sourceMgr.getMainFileID();
40
41 // Check to see if the main buffer contains the last buffer, and if so the
42 // last buffer should be used as main file for parsing.
43 if (sourceMgr.getNumBuffers() > 1) {
44 unsigned lastFileID = sourceMgr.getNumBuffers();
45 const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
46 const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
47 if (main->getBufferStart() <= last->getBufferStart() &&
48 main->getBufferEnd() >= last->getBufferEnd()) {
49 bufferID = lastFileID;
50 }
51 }
52 curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
53 curPtr = curBuffer.begin();
54
55 // Set the code completion location if it was provided.
56 if (codeCompleteContext)
57 codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
58}
59
60/// Encode the specified source location information into an attribute for
61/// attachment to the IR.
63 auto &sourceMgr = getSourceMgr();
64 unsigned mainFileID = sourceMgr.getMainFileID();
65
66 // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
67 // use it here.
68 auto &bufferInfo = sourceMgr.getBufferInfo(mainFileID);
69 unsigned lineNo = bufferInfo.getLineNumber(loc.getPointer());
70 unsigned column =
71 (loc.getPointer() - bufferInfo.getPointerForLineNumber(lineNo)) + 1;
72 auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);
73
74 return FileLineColLoc::get(context, buffer->getBufferIdentifier(), lineNo,
75 column);
76}
77
78/// emitError - Emit an error message and return an Token::error token.
79Token Lexer::emitError(const char *loc, const Twine &message) {
80 mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
81 message);
82 return formToken(Token::error, loc);
83}
84
86 const char *curBufferEnd = curBuffer.end();
87 while (true) {
88 const char *tokStart = curPtr;
89
90 // Check to see if the current token is at the code completion location.
91 if (tokStart == codeCompleteLoc)
92 return formToken(Token::code_complete, tokStart);
93
94 if (tokStart == curBufferEnd)
95 return formToken(Token::eof, tokStart);
96
97 // Lex the next token.
98 switch (*curPtr++) {
99 default:
100 // Handle bare identifiers.
101 if (isalpha(curPtr[-1]))
102 return lexBareIdentifierOrKeyword(tokStart);
103
104 // Unknown character, emit an error.
105 return emitError(tokStart, "unexpected character");
106
107 case ' ':
108 case '\t':
109 case '\n':
110 case '\r':
111 // Handle whitespace.
112 continue;
113
114 case '_':
115 // Handle bare identifiers.
116 return lexBareIdentifierOrKeyword(tokStart);
117
118 case 0:
119 // This may either be a nul character in the source file or may be the EOF
120 // marker that llvm::MemoryBuffer guarantees will be there.
121 if (curPtr - 1 == curBufferEnd)
122 return formToken(Token::eof, tokStart);
123 continue;
124
125 case ':':
126 return formToken(Token::colon, tokStart);
127 case ',':
128 return formToken(Token::comma, tokStart);
129 case '.':
130 return lexEllipsis(tokStart);
131 case '(':
132 return formToken(Token::l_paren, tokStart);
133 case ')':
134 return formToken(Token::r_paren, tokStart);
135 case '{':
136 if (*curPtr == '-' && *(curPtr + 1) == '#') {
137 curPtr += 2;
138 return formToken(Token::file_metadata_begin, tokStart);
139 }
140 return formToken(Token::l_brace, tokStart);
141 case '}':
142 return formToken(Token::r_brace, tokStart);
143 case '[':
144 return formToken(Token::l_square, tokStart);
145 case ']':
146 return formToken(Token::r_square, tokStart);
147 case '<':
148 return formToken(Token::less, tokStart);
149 case '>':
150 return formToken(Token::greater, tokStart);
151 case '=':
152 return formToken(Token::equal, tokStart);
153
154 case '+':
155 return formToken(Token::plus, tokStart);
156 case '*':
157 return formToken(Token::star, tokStart);
158 case '-':
159 if (*curPtr == '>') {
160 ++curPtr;
161 return formToken(Token::arrow, tokStart);
162 }
163 return formToken(Token::minus, tokStart);
164
165 case '?':
166 return formToken(Token::question, tokStart);
167
168 case '|':
169 return formToken(Token::vertical_bar, tokStart);
170
171 case '/':
172 if (*curPtr == '/') {
173 skipComment();
174 continue;
175 }
176 return formToken(Token::slash, tokStart);
177
178 case '@':
179 return lexAtIdentifier(tokStart);
180
181 case '#':
182 if (*curPtr == '-' && *(curPtr + 1) == '}') {
183 curPtr += 2;
184 return formToken(Token::file_metadata_end, tokStart);
185 }
186 [[fallthrough]];
187 case '!':
188 case '^':
189 case '%':
190 return lexPrefixedIdentifier(tokStart);
191 case '"':
192 return lexString(tokStart);
193
194 case '0':
195 case '1':
196 case '2':
197 case '3':
198 case '4':
199 case '5':
200 case '6':
201 case '7':
202 case '8':
203 case '9':
204 return lexNumber(tokStart);
205 }
206 }
207}
208
209/// Lex an '@foo' identifier.
210///
211/// symbol-ref-id ::= `@` (bare-id | string-literal)
212///
213Token Lexer::lexAtIdentifier(const char *tokStart) {
214 char cur = *curPtr++;
215
216 // Try to parse a string literal, if present.
217 if (cur == '"') {
218 Token stringIdentifier = lexString(curPtr);
219 if (stringIdentifier.is(Token::error))
220 return stringIdentifier;
221 return formToken(Token::at_identifier, tokStart);
222 }
223
224 // Otherwise, these always start with a letter or underscore.
225 if (!isalpha(cur) && cur != '_')
226 return emitError(curPtr - 1,
227 "@ identifier expected to start with letter or '_'");
228
229 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
230 *curPtr == '$' || *curPtr == '.')
231 ++curPtr;
232 return formToken(Token::at_identifier, tokStart);
233}
234
235/// Lex a bare identifier or keyword that starts with a letter.
236///
237/// bare-id ::= (letter|[_]) (letter|digit|[_$.])*
238/// integer-type ::= `[su]?i[1-9][0-9]*`
239///
240Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
241 // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
242 while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
243 *curPtr == '$' || *curPtr == '.')
244 ++curPtr;
245
246 // Check to see if this identifier is a keyword.
247 StringRef spelling(tokStart, curPtr - tokStart);
248
249 auto isAllDigit = [](StringRef str) {
250 return llvm::all_of(str, llvm::isDigit);
251 };
252
253 // Check for i123, si456, ui789.
254 if ((spelling.size() > 1 && tokStart[0] == 'i' &&
255 isAllDigit(spelling.drop_front())) ||
256 ((spelling.size() > 2 && tokStart[1] == 'i' &&
257 (tokStart[0] == 's' || tokStart[0] == 'u')) &&
258 isAllDigit(spelling.drop_front(2))))
259 return Token(Token::inttype, spelling);
260
262#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
263#include "TokenKinds.def"
264 .Default(Token::bare_identifier);
265
266 return Token(kind, spelling);
267}
268
269/// Skip a comment line, starting with a '//'.
270///
271/// TODO: add a regex for comments here and to the spec.
272///
273void Lexer::skipComment() {
274 // Advance over the second '/' in a '//' comment.
275 assert(*curPtr == '/');
276 ++curPtr;
277
278 const char *curBufferEnd = curBuffer.end();
279 while (true) {
280 if (curPtr == curBufferEnd)
281 return;
282
283 switch (*curPtr++) {
284 case '\n':
285 case '\r':
286 // Newline is end of comment.
287 return;
288 case 0:
289 // If this is the end of the buffer, end the comment.
290 if (curPtr - 1 == curBufferEnd) {
291 --curPtr;
292 return;
293 }
294 [[fallthrough]];
295 default:
296 // Skip over other characters.
297 break;
298 }
299 }
300}
301
302/// Lex an ellipsis.
303///
304/// ellipsis ::= '...'
305///
306Token Lexer::lexEllipsis(const char *tokStart) {
307 assert(curPtr[-1] == '.');
308
309 if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
310 return emitError(curPtr, "expected three consecutive dots for an ellipsis");
311
312 curPtr += 2;
313 return formToken(Token::ellipsis, tokStart);
314}
315
316/// Lex a number literal.
317///
318/// integer-literal ::= digit+ | `0x` hex_digit+
319/// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
320///
321Token Lexer::lexNumber(const char *tokStart) {
322 assert(isdigit(curPtr[-1]));
323
324 // Handle the hexadecimal case.
325 if (curPtr[-1] == '0' && *curPtr == 'x') {
326 // If we see stuff like 0xi32, this is a literal `0` followed by an
327 // identifier `xi32`, stop after `0`.
328 if (!isxdigit(curPtr[1]))
329 return formToken(Token::integer, tokStart);
330
331 curPtr += 2;
332 while (isxdigit(*curPtr))
333 ++curPtr;
334
335 return formToken(Token::integer, tokStart);
336 }
337
338 // Handle the normal decimal case.
339 while (isdigit(*curPtr))
340 ++curPtr;
341
342 if (*curPtr != '.')
343 return formToken(Token::integer, tokStart);
344 ++curPtr;
345
346 // Skip over [0-9]*([eE][-+]?[0-9]+)?
347 while (isdigit(*curPtr))
348 ++curPtr;
349
350 if (*curPtr == 'e' || *curPtr == 'E') {
351 if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
352 ((curPtr[1] == '-' || curPtr[1] == '+') &&
353 isdigit(static_cast<unsigned char>(curPtr[2])))) {
354 curPtr += 2;
355 while (isdigit(*curPtr))
356 ++curPtr;
357 }
358 }
359 return formToken(Token::floatliteral, tokStart);
360}
361
362/// Lex an identifier that starts with a prefix followed by suffix-id.
363///
364/// attribute-id ::= `#` suffix-id
365/// ssa-id ::= '%' suffix-id
366/// block-id ::= '^' suffix-id
367/// type-id ::= '!' suffix-id
368/// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
369/// id-punct ::= `$` | `.` | `_` | `-`
370///
371Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
372 Token::Kind kind;
373 StringRef errorKind;
374 switch (*tokStart) {
375 case '#':
376 kind = Token::hash_identifier;
377 errorKind = "invalid attribute name";
378 break;
379 case '%':
380 kind = Token::percent_identifier;
381 errorKind = "invalid SSA name";
382 break;
383 case '^':
384 kind = Token::caret_identifier;
385 errorKind = "invalid block name";
386 break;
387 case '!':
388 kind = Token::exclamation_identifier;
389 errorKind = "invalid type identifier";
390 break;
391 default:
392 llvm_unreachable("invalid caller");
393 }
394
395 // Parse suffix-id.
396 if (isdigit(*curPtr)) {
397 // If suffix-id starts with a digit, the rest must be digits.
398 while (isdigit(*curPtr))
399 ++curPtr;
400 } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
401 do {
402 ++curPtr;
403 } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
404 } else if (curPtr == codeCompleteLoc) {
405 return formToken(Token::code_complete, tokStart);
406 } else {
407 return emitError(curPtr - 1, errorKind);
408 }
409
410 // Check for a code completion within the identifier.
411 if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
412 codeCompleteLoc <= curPtr) {
413 return Token(Token::code_complete,
414 StringRef(tokStart, codeCompleteLoc - tokStart));
415 }
416
417 return formToken(kind, tokStart);
418}
419
420/// Lex a string literal.
421///
422/// string-literal ::= '"' [^"\n\f\v\r]* '"'
423///
424/// TODO: define escaping rules.
425Token Lexer::lexString(const char *tokStart) {
426 assert(curPtr[-1] == '"');
427
428 const char *curBufferEnd = curBuffer.end();
429 while (true) {
430 // Check to see if there is a code completion location within the string. In
431 // these cases we generate a completion location and place the currently
432 // lexed string within the token. This allows for the parser to use the
433 // partially lexed string when computing the completion results.
434 if (curPtr == codeCompleteLoc)
435 return formToken(Token::code_complete, tokStart);
436
437 switch (*curPtr++) {
438 case '"':
439 return formToken(Token::string, tokStart);
440 case 0:
441 // If this is a random nul character in the middle of a string, just
442 // include it. If it is the end of file, then it is an error.
443 if (curPtr - 1 != curBufferEnd)
444 continue;
445 [[fallthrough]];
446 case '\n':
447 case '\v':
448 case '\f':
449 return emitError(curPtr - 1, "expected '\"' in string literal");
450 case '\\':
451 // Handle explicitly a few escapes.
452 if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
453 ++curPtr;
454 else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
455 // Support \xx for two hex digits.
456 curPtr += 2;
457 else
458 return emitError(curPtr - 1, "unknown escape in string literal");
459 continue;
460
461 default:
462 continue;
463 }
464 }
465}
static bool isPunct(char c)
Definition Lexer.cpp:32
This class provides an abstract interface into the parser for hooking in code completion events.
static FileLineColLoc get(StringAttr filename, unsigned line, unsigned column)
Definition Location.cpp:157
Token lexToken()
Definition Lexer.cpp:85
Location getEncodedSourceLocation(SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition Lexer.cpp:62
const llvm::SourceMgr & getSourceMgr()
Definition Lexer.h:28
Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context, AsmParserCodeCompleteContext *codeCompleteContext)
Definition Lexer.cpp:36
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition Location.h:76
MLIRContext is the top-level object for a collection of MLIR operations.
Definition MLIRContext.h:63
This represents a token in the MLIR syntax.
Definition Token.h:20
bool is(Kind k) const
Definition Token.h:38
Include the generated interface declarations.
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
llvm::StringSwitch< T, R > StringSwitch
Definition LLVM.h:141