doxygen/AsmParser_2Lexer_8cpp_source.html

 //===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//

 //

 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

 // See https://llvm.org/LICENSE.txt for license information.

 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //

 //===----------------------------------------------------------------------===//

 //

 // This file implements the lexer for the MLIR textual form.

 //

 //===----------------------------------------------------------------------===//


 #include "Lexer.h"

 #include "Token.h"

 #include "mlir/AsmParser/CodeComplete.h"

 #include "mlir/IR/Diagnostics.h"

 #include "mlir/IR/Location.h"

 #include "mlir/IR/MLIRContext.h"

 #include "mlir/Support/LLVM.h"

 #include "llvm/ADT/STLExtras.h"

 #include "llvm/ADT/StringExtras.h"

 #include "llvm/ADT/StringSwitch.h"

 #include "llvm/Support/ErrorHandling.h"

 #include "llvm/Support/SourceMgr.h"

 #include <cassert>

 #include <cctype>


 using namespace mlir;


 // Reports whether 'c' is one of the punctuation characters permitted inside
 // identifiers, i.e. a member of the set [$._-]. Every other character yields
 // false.
 static bool isPunct(char c) {
   switch (c) {
   case '$':
   case '.':
   case '_':
   case '-':
     return true;
   default:
     return false;
   }
 }


 /// Construct a lexer over the main file of `sourceMgr`. The lexing cursor is
 /// placed at the start of that file's buffer. When `codeCompleteContext` is
 /// non-null, the pointer of its code completion location is recorded;
 /// otherwise the completion location stays null.
 Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
              AsmParserCodeCompleteContext *codeCompleteContext)
     : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
   // Point the cursor at the first character of the main file.
   const unsigned mainFileID = sourceMgr.getMainFileID();
   curBuffer = sourceMgr.getMemoryBuffer(mainFileID)->getBuffer();
   curPtr = curBuffer.begin();

   // Without a completion context there is nothing more to set up.
   if (!codeCompleteContext)
     return;
   codeCompleteLoc = codeCompleteContext->getCodeCompleteLoc().getPointer();
 }


 /// Translate the raw source position `loc` into a file/line/column `Location`
 /// that can be attached to the IR or to a diagnostic. The position is always
 /// resolved against the main file of the source manager; the column is
 /// 1-based.
 Location Lexer::getEncodedSourceLocation(SMLoc loc) {
   auto &srcMgr = getSourceMgr();
   const unsigned fileID = srcMgr.getMainFileID();
   const char *locPtr = loc.getPointer();

   // TODO: Fix performance issues in SourceMgr::getLineAndColumn so that we can
   //       use it here.
   auto &mainBufferInfo = srcMgr.getBufferInfo(fileID);
   const unsigned line = mainBufferInfo.getLineNumber(locPtr);
   const char *lineStart = mainBufferInfo.getPointerForLineNumber(line);
   // Distance from the start of the line, shifted to be 1-based.
   const unsigned col = (locPtr - lineStart) + 1;

   StringRef fileName = srcMgr.getMemoryBuffer(fileID)->getBufferIdentifier();
   return FileLineColLoc::get(context, fileName, line, col);
 }


 /// emitError - Report `message` as an error diagnostic located at `loc`, then
 /// hand back a Token::error token formed at `loc`.
 Token Lexer::emitError(const char *loc, const Twine &message) {
   Location errorLoc = getEncodedSourceLocation(SMLoc::getFromPointer(loc));
   mlir::emitError(errorLoc, message);
   return formToken(Token::error, loc);
 }


 /// Lex and return the next token from the current buffer.
 ///
 /// Whitespace, `//` comments and nul characters embedded in the buffer are
 /// skipped. A `code_complete` token is returned when a token would start
 /// exactly at the code completion location, an `eof` token when the nul at the
 /// end of the buffer is reached, and an error token (after emitting a
 /// diagnostic) for a character that cannot start any token.
 Token Lexer::lexToken() {
   while (true) {
     const char *tokStart = curPtr;

     // Check to see if the current token is at the code completion location.
     if (tokStart == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);

     // Lex the next token.
     switch (*curPtr++) {
     default:
       // Handle bare identifiers. The character is converted to
       // `unsigned char` first: this branch sees every byte that is not handled
       // below, including bytes >= 0x80, and passing a negative `char` to
       // isalpha is undefined behavior.
       if (isalpha(static_cast<unsigned char>(curPtr[-1])))
         return lexBareIdentifierOrKeyword(tokStart);

       // Unknown character, emit an error.
       return emitError(tokStart, "unexpected character");

     case ' ':
     case '\t':
     case '\n':
     case '\r':
       // Handle whitespace.
       continue;

     case '_':
       // Handle bare identifiers.
       return lexBareIdentifierOrKeyword(tokStart);

     case 0:
       // This may either be a nul character in the source file or may be the EOF
       // marker that llvm::MemoryBuffer guarantees will be there.
       if (curPtr - 1 == curBuffer.end())
         return formToken(Token::eof, tokStart);
       continue;

     case ':':
       return formToken(Token::colon, tokStart);
     case ',':
       return formToken(Token::comma, tokStart);
     case '.':
       return lexEllipsis(tokStart);
     case '(':
       return formToken(Token::l_paren, tokStart);
     case ')':
       return formToken(Token::r_paren, tokStart);
     case '{':
       // `{-#` opens a file metadata section; a lone `{` is a brace.
       if (*curPtr == '-' && *(curPtr + 1) == '#') {
         curPtr += 2;
         return formToken(Token::file_metadata_begin, tokStart);
       }
       return formToken(Token::l_brace, tokStart);
     case '}':
       return formToken(Token::r_brace, tokStart);
     case '[':
       return formToken(Token::l_square, tokStart);
     case ']':
       return formToken(Token::r_square, tokStart);
     case '<':
       return formToken(Token::less, tokStart);
     case '>':
       return formToken(Token::greater, tokStart);
     case '=':
       return formToken(Token::equal, tokStart);

     case '+':
       return formToken(Token::plus, tokStart);
     case '*':
       return formToken(Token::star, tokStart);
     case '-':
       // `->` is an arrow; a lone `-` is a minus.
       if (*curPtr == '>') {
         ++curPtr;
         return formToken(Token::arrow, tokStart);
       }
       return formToken(Token::minus, tokStart);

     case '?':
       return formToken(Token::question, tokStart);

     case '|':
       return formToken(Token::vertical_bar, tokStart);

     case '/':
       // `//` starts a comment that is skipped; a lone `/` is a slash.
       if (*curPtr == '/') {
         skipComment();
         continue;
       }
       return formToken(Token::slash, tokStart);

     case '@':
       return lexAtIdentifier(tokStart);

     case '#':
       // `#-}` closes a file metadata section; anything else starting with
       // '#' is lexed as a prefixed identifier.
       if (*curPtr == '-' && *(curPtr + 1) == '}') {
         curPtr += 2;
         return formToken(Token::file_metadata_end, tokStart);
       }
       [[fallthrough]];
     case '!':
     case '^':
     case '%':
       return lexPrefixedIdentifier(tokStart);
     case '"':
       return lexString(tokStart);

     case '0':
     case '1':
     case '2':
     case '3':
     case '4':
     case '5':
     case '6':
     case '7':
     case '8':
     case '9':
       return lexNumber(tokStart);
     }
   }
 }


 /// Lex an '@foo' identifier.
 ///
 ///   symbol-ref-id ::= `@` (bare-id | string-literal)
 ///
 /// `tokStart` points at the '@' and, on entry, `curPtr` at the character that
 /// follows it. Returns an `at_identifier` token formed from `tokStart`, or an
 /// error token if the quoted form fails to lex or the name does not start with
 /// a letter or '_'.
 Token Lexer::lexAtIdentifier(const char *tokStart) {
   // The <cctype> classifiers have undefined behavior when given a negative
   // `char` (any byte >= 0x80 where `char` is signed), so convert to
   // `unsigned char` before classifying.
   auto isLetter = [](char c) {
     return isalpha(static_cast<unsigned char>(c)) != 0;
   };
   auto isDecDigit = [](char c) {
     return isdigit(static_cast<unsigned char>(c)) != 0;
   };

   char cur = *curPtr++;

   // Try to parse a string literal, if present.
   if (cur == '"') {
     Token stringIdentifier = lexString(curPtr);
     if (stringIdentifier.is(Token::error))
       return stringIdentifier;
     return formToken(Token::at_identifier, tokStart);
   }

   // Otherwise, these always start with a letter or underscore.
   if (!isLetter(cur) && cur != '_')
     return emitError(curPtr - 1,
                      "@ identifier expected to start with letter or '_'");

   // Consume the remaining identifier characters: [0-9a-zA-Z_$.]*
   while (isLetter(*curPtr) || isDecDigit(*curPtr) || *curPtr == '_' ||
          *curPtr == '$' || *curPtr == '.')
     ++curPtr;
   return formToken(Token::at_identifier, tokStart);
 }


 /// Lex a bare identifier or keyword that starts with a letter.
 ///
 ///   bare-id ::= (letter|[_]) (letter|digit|[_$.])*
 ///   integer-type ::= `[su]?i[1-9][0-9]*`
 ///
 /// `tokStart` points at the first character of the identifier, which the
 /// caller has already consumed. The result is an `inttype` token for
 /// spellings of the form i<digits>, si<digits> or ui<digits>, the matching
 /// keyword token for spellings listed in TokenKinds.def, and a
 /// `bare_identifier` token otherwise.
 Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
   // The <cctype> classifiers have undefined behavior when given a negative
   // `char` (any byte >= 0x80 where `char` is signed), so convert to
   // `unsigned char` before classifying.
   auto isLetterOrDigit = [](char c) {
     const unsigned char uc = static_cast<unsigned char>(c);
     return isalpha(uc) || isdigit(uc);
   };

   // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
   while (isLetterOrDigit(*curPtr) || *curPtr == '_' || *curPtr == '$' ||
          *curPtr == '.')
     ++curPtr;

   // Check to see if this identifier is a keyword.
   StringRef spelling(tokStart, curPtr - tokStart);

   auto isAllDigit = [](StringRef str) {
     return llvm::all_of(str, llvm::isDigit);
   };

   // Check for i123, si456, ui789. Only "all digits after the prefix" is
   // verified here; a leading zero is not rejected.
   if ((spelling.size() > 1 && tokStart[0] == 'i' &&
        isAllDigit(spelling.drop_front())) ||
       ((spelling.size() > 2 && tokStart[1] == 'i' &&
         (tokStart[0] == 's' || tokStart[0] == 'u')) &&
        isAllDigit(spelling.drop_front(2))))
     return Token(Token::inttype, spelling);

   // Map the spelling to its keyword kind, defaulting to a plain identifier.
   Token::Kind kind = StringSwitch<Token::Kind>(spelling)
 #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
 #include "TokenKinds.def"
                          .Default(Token::bare_identifier);

   return Token(kind, spelling);
 }


 /// Skip a comment line, starting with a '//'.
 ///
 ///   TODO: add a regex for comments here and to the spec.
 ///
 /// On entry `curPtr` must point at the second '/'. On return it points just
 /// past the '\n' or '\r' that ended the comment, or at the nul that ends the
 /// buffer if the comment runs to the end of the buffer.
 void Lexer::skipComment() {
   // Step over the second '/' of the '//' introducer.
   assert(*curPtr == '/');
   ++curPtr;

   while (true) {
     const char c = *curPtr++;

     // A newline character terminates the comment.
     if (c == '\n' || c == '\r')
       return;

     // A nul only terminates the comment when it is the one at the end of the
     // buffer; in that case back up so the caller sees it again. Any other
     // nul is skipped like an ordinary character.
     if (c == 0 && curPtr - 1 == curBuffer.end()) {
       --curPtr;
       return;
     }
   }
 }


 /// Lex an ellipsis.
 ///
 ///   ellipsis ::= '...'
 ///
 /// The first '.' has already been consumed by the caller. If it is not
 /// followed by two more dots, an error is emitted at the current position.
 Token Lexer::lexEllipsis(const char *tokStart) {
   assert(curPtr[-1] == '.');

   // Short-circuiting keeps the reads from running once the end of the buffer
   // or a non-dot character has been seen.
   const bool twoMoreDots =
       curPtr != curBuffer.end() && curPtr[0] == '.' && curPtr[1] == '.';
   if (!twoMoreDots)
     return emitError(curPtr, "expected three consecutive dots for an ellipsis");

   curPtr += 2;
   return formToken(Token::ellipsis, tokStart);
 }


 /// Lex a number literal.
 ///
 ///   integer-literal ::= digit+ | `0x` hex_digit+
 ///   float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
 ///
 /// `tokStart` points at the first digit, which the caller has already
 /// consumed. Returns an `integer` token for decimal and hexadecimal literals
 /// and a `floatliteral` token when a '.' follows the leading digits. No sign
 /// is consumed here, since lexing starts at a digit.
 Token Lexer::lexNumber(const char *tokStart) {
   // The <cctype> classifiers have undefined behavior when given a negative
   // `char` (any byte >= 0x80 where `char` is signed), so convert to
   // `unsigned char` before classifying.
   auto isDecDigit = [](char c) {
     return isdigit(static_cast<unsigned char>(c)) != 0;
   };
   auto isHexDigit = [](char c) {
     return isxdigit(static_cast<unsigned char>(c)) != 0;
   };

   assert(isDecDigit(curPtr[-1]));

   // Handle the hexadecimal case.
   if (curPtr[-1] == '0' && *curPtr == 'x') {
     // If we see stuff like 0xi32, this is a literal `0` followed by an
     // identifier `xi32`, stop after `0`.
     if (!isHexDigit(curPtr[1]))
       return formToken(Token::integer, tokStart);

     // Skip the 'x' and the first hex digit, then any further hex digits.
     curPtr += 2;
     while (isHexDigit(*curPtr))
       ++curPtr;

     return formToken(Token::integer, tokStart);
   }

   // Handle the normal decimal case.
   while (isDecDigit(*curPtr))
     ++curPtr;

   // Without a '.', this is an integer.
   if (*curPtr != '.')
     return formToken(Token::integer, tokStart);
   ++curPtr;

   // Skip over [0-9]*([eE][-+]?[0-9]+)?
   while (isDecDigit(*curPtr))
     ++curPtr;

   if (*curPtr == 'e' || *curPtr == 'E') {
     // Only treat the 'e'/'E' as an exponent when at least one digit follows
     // it (optionally after a sign); otherwise it is left for the next token.
     if (isDecDigit(curPtr[1]) ||
         ((curPtr[1] == '-' || curPtr[1] == '+') && isDecDigit(curPtr[2]))) {
       curPtr += 2;
       while (isDecDigit(*curPtr))
         ++curPtr;
     }
   }
   return formToken(Token::floatliteral, tokStart);
 }


 /// Lex an identifier that starts with a prefix followed by suffix-id.
 ///
 ///   attribute-id  ::= `#` suffix-id
 ///   ssa-id        ::= '%' suffix-id
 ///   block-id      ::= '^' suffix-id
 ///   type-id       ::= '!' suffix-id
 ///   suffix-id     ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
 ///   id-punct      ::= `$` | `.` | `_` | `-`
 ///
 /// `tokStart` points at the prefix character, which the caller has already
 /// consumed. Returns a token of the kind selected by the prefix, a
 /// `code_complete` token when the code completion location lies at or within
 /// the identifier, or an error token when no valid suffix-id follows.
 Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
   // The <cctype> classifiers have undefined behavior when given a negative
   // `char` (any byte >= 0x80 where `char` is signed), so convert to
   // `unsigned char` before classifying.
   auto isLetter = [](char c) {
     return isalpha(static_cast<unsigned char>(c)) != 0;
   };
   auto isDecDigit = [](char c) {
     return isdigit(static_cast<unsigned char>(c)) != 0;
   };

   // Select the token kind and the diagnostic text from the prefix.
   Token::Kind kind;
   StringRef errorKind;
   switch (*tokStart) {
   case '#':
     kind = Token::hash_identifier;
     errorKind = "invalid attribute name";
     break;
   case '%':
     kind = Token::percent_identifier;
     errorKind = "invalid SSA name";
     break;
   case '^':
     kind = Token::caret_identifier;
     errorKind = "invalid block name";
     break;
   case '!':
     kind = Token::exclamation_identifier;
     errorKind = "invalid type identifier";
     break;
   default:
     llvm_unreachable("invalid caller");
   }

   // Parse suffix-id.
   if (isDecDigit(*curPtr)) {
     // If suffix-id starts with a digit, the rest must be digits.
     while (isDecDigit(*curPtr))
       ++curPtr;
   } else if (isLetter(*curPtr) || isPunct(*curPtr)) {
     do {
       ++curPtr;
     } while (isLetter(*curPtr) || isDecDigit(*curPtr) || isPunct(*curPtr));
   } else if (curPtr == codeCompleteLoc) {
     // Completion requested directly after the prefix character.
     return formToken(Token::code_complete, tokStart);
   } else {
     return emitError(curPtr - 1, errorKind);
   }

   // Check for a code completion within the identifier. The resulting token
   // covers only the text from the prefix up to the completion location.
   if (codeCompleteLoc && codeCompleteLoc >= tokStart &&
       codeCompleteLoc <= curPtr) {
     return Token(Token::code_complete,
                  StringRef(tokStart, codeCompleteLoc - tokStart));
   }

   return formToken(kind, tokStart);
 }


 /// Lex a string literal.
 ///
 ///   string-literal ::= '"' [^"\n\f\v\r]* '"'
 ///
 /// TODO: define escaping rules.
 ///
 /// The opening quote has already been consumed. Returns a `string` token on
 /// the closing quote, a `code_complete` token if the code completion location
 /// is reached first, and an error token for an unterminated literal or an
 /// unrecognized escape sequence.
 Token Lexer::lexString(const char *tokStart) {
   assert(curPtr[-1] == '"');

   while (true) {
     // When the code completion location falls inside the string, stop here
     // and return a completion token holding what has been lexed so far, so
     // that the parser can use the partial string to compute its results.
     if (curPtr == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);

     const char c = *curPtr++;

     // Closing quote: the literal is complete.
     if (c == '"')
       return formToken(Token::string, tokStart);

     // A nul only ends the literal when it is the one at the end of the
     // buffer; a nul in the middle of a string is simply included.
     const bool hitEndOfBuffer = c == 0 && curPtr - 1 == curBuffer.end();
     if (hitEndOfBuffer || c == '\n' || c == '\v' || c == '\f')
       return emitError(curPtr - 1, "expected '\"' in string literal");

     // Everything except a backslash is ordinary string content.
     if (c != '\\')
       continue;

     // Handle explicitly a few escapes.
     switch (*curPtr) {
     case '"':
     case '\\':
     case 'n':
     case 't':
       ++curPtr;
       continue;
     default:
       break;
     }

     // Support \xx for two hex digits.
     if (llvm::isHexDigit(curPtr[0]) && llvm::isHexDigit(curPtr[1])) {
       curPtr += 2;
       continue;
     }

     return emitError(curPtr - 1, "unknown escape in string literal");
   }
 }

CodeComplete.h

isPunct
static bool isPunct(char c)
Definition: Lexer.cpp:32

kind
union mlir::linalg::@1224::ArityGroupAndKind::Kind kind

Location.h

MLIRContext.h

Token.h

Lexer.h

llvm::StringSwitch
Definition: LLVM.h:78

mlir::AsmParserCodeCompleteContext
This class provides an abstract interface into the parser for hooking in code completion events.
Definition: CodeComplete.h:24

mlir::AsmParserCodeCompleteContext::getCodeCompleteLoc
SMLoc getCodeCompleteLoc() const
Return the source location used to provide code completion.
Definition: CodeComplete.h:29

mlir::FileLineColLoc::get
static FileLineColLoc get(StringAttr filename, unsigned line, unsigned column)
Definition: Location.cpp:157

mlir::Lexer::lexToken
Token lexToken()
Definition: Lexer.cpp:73

mlir::Lexer::getSourceMgr
const llvm::SourceMgr & getSourceMgr()
Definition: Lexer.h:28

mlir::Lexer::getEncodedSourceLocation
Location getEncodedSourceLocation(SMLoc loc)
Encode the specified source location information into a Location object for attachment to the IR or e...
Definition: Lexer.cpp:50

mlir::Lexer::Lexer
Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context, AsmParserCodeCompleteContext *codeCompleteContext)
Definition: Lexer.cpp:36

mlir::Location
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
Definition: Location.h:76

mlir::MLIRContext
MLIRContext is the top-level object for a collection of MLIR operations.
Definition: MLIRContext.h:60

mlir::Token
This represents a token in the MLIR syntax.
Definition: Token.h:20

mlir::Token::is
bool is(Kind k) const
Definition: Token.h:38

mlir::Token::Kind
Kind
Definition: Token.h:22

Diagnostics.h

LLVM.h

mlir
Include the generated interface declarations.
Definition: LocalAliasAnalysis.h:20

mlir::emitError
InFlightDiagnostic emitError(Location loc)
Utility method to emit an error message using this location.
Definition: Diagnostics.cpp:328