doxygen/Tools_2PDLL_2Parser_2Lexer_8cpp_source.html

//===- Lexer.cpp ----------------------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


#include "Lexer.h"

#include "mlir/Tools/PDLL/AST/Diagnostic.h"

#include "mlir/Tools/PDLL/Parser/CodeComplete.h"

#include "llvm/ADT/StringExtras.h"

#include "llvm/ADT/StringSwitch.h"

#include "llvm/Support/SourceMgr.h"


using namespace mlir;

using namespace mlir::pdll;


//===----------------------------------------------------------------------===//

// Token

//===----------------------------------------------------------------------===//


std::string Token::getStringValue() const {

  assert(getKind() == string || getKind() == string_block ||

         getKind() == code_complete_string);


  // Start by dropping the quotes.

  StringRef bytes = getSpelling();

  if (is(string))

    bytes = bytes.drop_front().drop_back();

  else if (is(string_block))

    bytes = bytes.drop_front(2).drop_back(2);


  std::string result;

  result.reserve(bytes.size());

  for (unsigned i = 0, e = bytes.size(); i != e;) {

    auto c = bytes[i++];

    if (c != '\\') {

      result.push_back(c);

      continue;

    }


    assert(i + 1 <= e && "invalid string should be caught by lexer");

    auto c1 = bytes[i++];

    switch (c1) {

    case '"':

    case '\\':

      result.push_back(c1);

      continue;

    case 'n':

      result.push_back('\n');

      continue;

    case 't':

      result.push_back('\t');

      continue;

    default:

      break;

    }


    assert(i + 1 <= e && "invalid string should be caught by lexer");

    auto c2 = bytes[i++];


    assert(llvm::isHexDigit(c1) && llvm::isHexDigit(c2) && "invalid escape");

    result.push_back((llvm::hexDigitValue(c1) << 4) | llvm::hexDigitValue(c2));

  }


  return result;

}


//===----------------------------------------------------------------------===//

// Lexer

//===----------------------------------------------------------------------===//


/// Construct a lexer positioned at the start of the main file of `mgr`.
/// `codeCompleteContext` may be null; when provided, its location becomes the
/// point at which code completion tokens are produced.
Lexer::Lexer(llvm::SourceMgr &mgr, ast::DiagnosticEngine &diagEngine,
             CodeCompleteContext *codeCompleteContext)
    : srcMgr(mgr), diagEngine(diagEngine), addedHandlerToDiagEngine(false),
      codeCompletionLocation(nullptr) {
  // Begin lexing at the first character of the main file's buffer.
  curBufferID = mgr.getMainFileID();
  curBuffer = srcMgr.getMemoryBuffer(curBufferID)->getBuffer();
  curPtr = curBuffer.begin();

  // Record where code completion was requested, if at all.
  if (codeCompleteContext)
    codeCompletionLocation =
        codeCompleteContext->getCodeCompleteLoc().getPointer();

  // An existing diagnostic handler is left alone.
  if (diagEngine.getHandlerFn())
    return;

  // Otherwise install a default handler that prints the diagnostic, followed
  // by each of its notes, through the SourceMgr. Remember that we did so in
  // order for the destructor to remove it again.
  diagEngine.setHandlerFn([this](const ast::Diagnostic &diag) {
    auto print = [this](const ast::Diagnostic &d) {
      srcMgr.PrintMessage(d.getLocation().Start, d.getSeverity(),
                          d.getMessage());
    };
    print(diag);
    for (const ast::Diagnostic &note : diag.getNotes())
      print(note);
  });
  addedHandlerToDiagEngine = true;
}


/// Remove the default diagnostic handler if, and only if, this lexer was the
/// one that installed it.
Lexer::~Lexer() {
  if (!addedHandlerToDiagEngine)
    return;
  diagEngine.setHandlerFn(nullptr);
}


/// Switch lexing to the file `filename`, recording the end of `includeLoc` as
/// the include location. Returns failure, leaving the lexer state untouched,
/// when the SourceMgr yields a zero buffer id for the file.
LogicalResult Lexer::pushInclude(StringRef filename, SMRange includeLoc) {
  // Receives the path the SourceMgr resolved; not otherwise used here.
  std::string resolvedPath;
  const int includeBufferID =
      srcMgr.AddIncludeFile(filename.str(), includeLoc.End, resolvedPath);
  if (includeBufferID == 0)
    return failure();

  // Continue lexing from the first character of the included buffer.
  curBufferID = includeBufferID;
  curBuffer = srcMgr.getMemoryBuffer(curBufferID)->getBuffer();
  curPtr = curBuffer.begin();
  return success();
}


/// Report `msg` at `loc` through the diagnostic engine and return an error
/// token that starts at the beginning of `loc`.
Token Lexer::emitError(SMRange loc, const Twine &msg) {
  diagEngine.emitError(loc, msg);

  const char *errorStart = loc.Start.getPointer();
  return formToken(Token::error, errorStart);
}


/// Report `msg` at `loc` with an attached `note` at `noteLoc`, and return an
/// error token that starts at the beginning of `loc`.
Token Lexer::emitErrorAndNote(SMRange loc, const Twine &msg, SMRange noteLoc,
                              const Twine &note) {
  // Kept as a single expression: the note is attached to the diagnostic
  // returned by emitError before that temporary goes away.
  diagEngine.emitError(loc, msg)->attachNote(note, noteLoc);

  const char *errorStart = loc.Start.getPointer();
  return formToken(Token::error, errorStart);
}


/// Report `msg` for the single character at `loc` by forwarding to the
/// range-based overload with the one-character range [loc, loc + 1).
Token Lexer::emitError(const char *loc, const Twine &msg) {
  const SMLoc rangeBegin = SMLoc::getFromPointer(loc);
  const SMLoc rangeEnd = SMLoc::getFromPointer(loc + 1);
  return emitError(SMRange(rangeBegin, rangeEnd), msg);
}


/// Consume and return the next character of the current buffer. Line endings
/// are normalised to '\n', an embedded nul is returned as 0, and the end of
/// the buffer is reported as EOF without advancing past it.
int Lexer::getNextChar() {
  const char consumed = *curPtr++;

  // Line endings: fold a two-character "\r\n" or "\n\r" pair into one
  // newline, but never merge two identical characters (those are two lines).
  if (consumed == '\n' || consumed == '\r') {
    const char following = *curPtr;
    const bool followingIsLineEnd = following == '\n' || following == '\r';
    if (followingIsLineEnd && following != consumed)
      ++curPtr;
    return '\n';
  }

  // Ordinary characters are returned as their unsigned value.
  if (consumed != 0)
    return static_cast<unsigned char>(consumed);

  // A nul that is not at the end of the buffer is just a stray nul in the
  // file.
  if (curPtr - 1 != curBuffer.end())
    return 0;

  // Otherwise this is the end of the buffer: step back so that subsequent
  // calls keep reporting EOF.
  --curPtr;
  return EOF;
}


/// Lex and return the next token from the current buffer.
///
/// Whitespace and `//` comments are skipped. At the end of an included
/// buffer an `eof` token is still returned, but the lexer is first switched
/// back to the parent buffer at the include location. Unrecognised characters
/// produce a diagnostic and an `error` token.
Token Lexer::lexToken() {
  while (true) {
    const char *tokStart = curPtr;

    // Check to see if this token is at the code completion location.
    if (tokStart == codeCompletionLocation)
      return formToken(Token::code_complete, tokStart);

    // This consumes one character, except at the end of the buffer where the
    // position is left unchanged and EOF is returned.
    int curChar = getNextChar();
    switch (curChar) {
    default:
      // Handle identifiers: [a-zA-Z_]
      if (isalpha(curChar) || curChar == '_')
        return lexIdentifier(tokStart);

      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");
    case EOF: {
      // Return EOF denoting the end of lexing.
      Token eof = formToken(Token::eof, tokStart);

      // If the current buffer was included from another one, resume lexing
      // the parent buffer at the include location.
      SMLoc parentIncludeLoc = srcMgr.getParentIncludeLoc(curBufferID);
      if (parentIncludeLoc.isValid()) {
        curBufferID = srcMgr.FindBufferContainingLoc(parentIncludeLoc);
        curBuffer = srcMgr.getMemoryBuffer(curBufferID)->getBuffer();
        curPtr = parentIncludeLoc.getPointer();
      }

      return eof;
    }

    // Lex punctuation.
    case '-':
      // A '-' is only valid as the start of '->'.
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return emitError(tokStart, "unexpected character");
    case ':':
      return formToken(Token::colon, tokStart);
    case ',':
      return formToken(Token::comma, tokStart);
    case '.':
      return formToken(Token::dot, tokStart);
    case '=':
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::equal_arrow, tokStart);
      }
      return formToken(Token::equal, tokStart);
    case ';':
      return formToken(Token::semicolon, tokStart);
    case '[':
      // `[{` opens a string block.
      if (*curPtr == '{') {
        ++curPtr;
        return lexString(tokStart, /*isStringBlock=*/true);
      }
      return formToken(Token::l_square, tokStart);
    case ']':
      return formToken(Token::r_square, tokStart);

    case '<':
      return formToken(Token::less, tokStart);
    case '>':
      return formToken(Token::greater, tokStart);
    case '{':
      return formToken(Token::l_brace, tokStart);
    case '}':
      return formToken(Token::r_brace, tokStart);
    case '(':
      return formToken(Token::l_paren, tokStart);
    case ')':
      return formToken(Token::r_paren, tokStart);
    case '/':
      // A '/' is only valid as the start of a '//' comment.
      if (*curPtr == '/') {
        lexComment();
        continue;
      }
      return emitError(tokStart, "unexpected character");

    // Ignore whitespace characters (and stray nul characters). Loop instead
    // of recursing so that long runs of whitespace cannot grow the call
    // stack.
    case 0:
    case ' ':
    case '\t':
    case '\n':
      continue;

    case '#':
      return lexDirective(tokStart);
    case '"':
      return lexString(tokStart, /*isStringBlock=*/false);

    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      return lexNumber(tokStart);
    }
  }
}


/// Skip the remainder of a `//` comment. On entry `curPtr` points at the
/// second '/'. On return it points just past the terminating '\n' or '\r',
/// or at the end of the buffer if the comment runs to the end of the file.
void Lexer::lexComment() {
  // Step over the second '/' of the comment introducer.
  assert(*curPtr == '/');
  ++curPtr;

  for (;;) {
    const char consumed = *curPtr++;

    // A line ending terminates the comment.
    if (consumed == '\n' || consumed == '\r')
      return;

    // So does the end of the buffer; back up so the terminator is seen again
    // by the caller. A nul anywhere else is skipped like any other character.
    if (consumed == 0 && curPtr - 1 == curBuffer.end()) {
      --curPtr;
      return;
    }
  }
}


/// Lex a directive token. `tokStart` points at the start of the token (the
/// '#' consumed by the caller); the spelling extends over the following
/// characters matching [0-9a-zA-Z_]*.
Token Lexer::lexDirective(const char *tokStart) {
  // Match the rest with an identifier regex: [0-9a-zA-Z_]*
  // The cast is required: passing a negative `char` (any byte >= 0x80 where
  // `char` is signed) to isalnum is undefined behaviour.
  while (isalnum(static_cast<unsigned char>(*curPtr)) || *curPtr == '_')
    ++curPtr;

  StringRef str(tokStart, curPtr - tokStart);
  return Token(Token::directive, str);
}


/// Lex an identifier or keyword. `tokStart` points at the first character,
/// already consumed by the caller. Spellings that match a keyword produce the
/// corresponding keyword token, a lone `_` produces `underscore`, and
/// everything else is an `identifier`.
Token Lexer::lexIdentifier(const char *tokStart) {
  // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  // The cast is required: passing a negative `char` (any byte >= 0x80 where
  // `char` is signed) to isalnum is undefined behaviour.
  while (isalnum(static_cast<unsigned char>(*curPtr)) || *curPtr == '_')
    ++curPtr;

  // Check to see if this identifier is a keyword.
  StringRef str(tokStart, curPtr - tokStart);
  Token::Kind kind = StringSwitch<Token::Kind>(str)
                         .Case("attr", Token::kw_attr)
                         .Case("Attr", Token::kw_Attr)
                         .Case("erase", Token::kw_erase)
                         .Case("let", Token::kw_let)
                         .Case("Constraint", Token::kw_Constraint)
                         .Case("not", Token::kw_not)
                         .Case("op", Token::kw_op)
                         .Case("Op", Token::kw_Op)
                         .Case("OpName", Token::kw_OpName)
                         .Case("Pattern", Token::kw_Pattern)
                         .Case("replace", Token::kw_replace)
                         .Case("return", Token::kw_return)
                         .Case("rewrite", Token::kw_rewrite)
                         .Case("Rewrite", Token::kw_Rewrite)
                         .Case("type", Token::kw_type)
                         .Case("Type", Token::kw_Type)
                         .Case("TypeRange", Token::kw_TypeRange)
                         .Case("Value", Token::kw_Value)
                         .Case("ValueRange", Token::kw_ValueRange)
                         .Case("with", Token::kw_with)
                         .Case("_", Token::underscore)
                         .Default(Token::identifier);
  return Token(kind, str);
}


/// Lex a decimal integer literal. `tokStart` points at the first digit,
/// already consumed by the caller; the token extends over all following
/// decimal digits.
Token Lexer::lexNumber(const char *tokStart) {
  // The casts are required: passing a negative `char` (any byte >= 0x80
  // where `char` is signed) to isdigit is undefined behaviour.
  assert(isdigit(static_cast<unsigned char>(curPtr[-1])));

  // Handle the normal decimal case.
  while (isdigit(static_cast<unsigned char>(*curPtr)))
    ++curPtr;

  return formToken(Token::integer, tokStart);
}


/// Lex the remainder of a string literal whose opening delimiter has already
/// been consumed. `tokStart` points at the opening delimiter. Regular strings
/// end at an unescaped '"' and may not span lines; string blocks
/// (`isStringBlock`) end at `}]` and may span lines.
Token Lexer::lexString(const char *tokStart, bool isStringBlock) {
  // Length of the opening delimiter: `[{` for blocks, `"` otherwise.
  const int openDelimSize = isStringBlock ? 2 : 1;

  for (;;) {
    // If the code completion location lies inside the string, stop here and
    // hand back what has been lexed so far, without the opening delimiter, so
    // the parser can use the partial string when computing completions.
    if (curPtr == codeCompletionLocation)
      return formToken(Token::code_complete_string, tokStart + openDelimSize);

    const char consumed = *curPtr++;
    switch (consumed) {
    case '"':
      // A quote is ordinary content inside a string block.
      if (isStringBlock)
        break;
      return formToken(Token::string, tokStart);

    case '}':
      // Only `}]` terminates a string block; elsewhere '}' is content.
      if (isStringBlock && *curPtr == ']') {
        ++curPtr;
        return formToken(Token::string_block, tokStart);
      }
      break;

    case 0: {
      // A nul in the middle of the buffer is kept as part of the string.
      if (curPtr - 1 != curBuffer.end())
        break;

      // Reaching the end of the buffer means the string was never closed.
      --curPtr;
      StringRef expectedEndStr = isStringBlock ? "}]" : "\"";
      return emitError(curPtr - 1,
                       "expected '" + expectedEndStr + "' in string literal");
    }

    case '\n':
    case '\v':
    case '\f':
      // Only string blocks are allowed to span multiple lines.
      if (isStringBlock)
        break;
      return emitError(curPtr - 1, "expected '\"' in string literal");

    case '\\': {
      // Accepted escapes: \" \\ \n \t, or two hex digits.
      const char escape = *curPtr;
      const bool isSimpleEscape =
          escape == '"' || escape == '\\' || escape == 'n' || escape == 't';
      if (isSimpleEscape) {
        ++curPtr;
        break;
      }
      if (llvm::isHexDigit(escape) && llvm::isHexDigit(curPtr[1])) {
        curPtr += 2;
        break;
      }
      return emitError(curPtr - 1, "unknown escape in string literal");
    }

    default:
      // Any other character is part of the string.
      break;
    }
  }
}

success
return success()

Diagnostic.h

result
result
Definition LinalgTransformOps.cpp:2097

false
false
Parses a map_entries map type from a string format back into its numeric value.
Definition OpenMPDialect.cpp:1915

nullptr
nullptr
Definition OpenMPToLLVMIRTranslation.cpp:1379

CodeComplete.h

Lexer.h

mlir::Lexer::Lexer
Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context, AsmParserCodeCompleteContext *codeCompleteContext)
Definition Lexer.cpp:36

mlir::Token::is
bool is(Kind k) const
Definition Token.h:38

mlir::Token::getStringValue
std::string getStringValue() const
Given a token containing a string literal, return its value, including removing the quote characters ...
Definition Token.cpp:86

mlir::Token::getKind
Kind getKind() const
Definition Token.h:37

mlir::Token::getSpelling
StringRef getSpelling() const
Definition Token.h:34

mlir::pdll::CodeCompleteContext
This class provides an abstract interface into the parser for hooking in code completion events.
Definition CodeComplete.h:30

mlir::pdll::CodeCompleteContext::getCodeCompleteLoc
SMLoc getCodeCompleteLoc() const
Return the location used to provide code completion.
Definition CodeComplete.h:35

mlir::pdll::Lexer::emitError
Token emitError(SMRange loc, const Twine &msg)
Emit an error to the lexer with the given location and message.
Definition Lexer.cpp:120

mlir::pdll::Lexer::emitErrorAndNote
Token emitErrorAndNote(SMRange loc, const Twine &msg, SMRange noteLoc, const Twine &note)
Definition Lexer.cpp:124

mlir::pdll::Lexer::lexToken
Token lexToken()
Lex the next token and return it.

mlir::pdll::Lexer::pushInclude
LogicalResult pushInclude(StringRef filename, SMRange includeLoc)
Push an include of the given file.
Definition Lexer.cpp:107

mlir::pdll::Lexer::~Lexer
~Lexer()
Definition Lexer.cpp:102

mlir::pdll::Token
Definition Lexer.h:32

mlir::pdll::Token::Kind
Kind
Definition Lexer.h:34

mlir::pdll::Token::kw_Pattern
@ kw_Pattern
Definition Lexer.h:61

mlir::pdll::Token::semicolon
@ semicolon
Definition Lexer.h:80

mlir::pdll::Token::kw_Value
@ kw_Value
Definition Lexer.h:68

mlir::pdll::Token::r_paren
@ r_paren
Definition Lexer.h:87

mlir::pdll::Token::dot
@ dot
Definition Lexer.h:77

mlir::pdll::Token::kw_attr
@ kw_attr
Definition Lexer.h:48

mlir::pdll::Token::l_paren
@ l_paren
Definition Lexer.h:86

mlir::pdll::Token::code_complete_string
@ code_complete_string
Token signifying a code completion location within a string.
Definition Lexer.h:41

mlir::pdll::Token::equal_arrow
@ equal_arrow
Definition Lexer.h:79

mlir::pdll::Token::kw_return
@ kw_return
Definition Lexer.h:63

mlir::pdll::Token::directive
@ directive
Tokens.
Definition Lexer.h:93

mlir::pdll::Token::kw_rewrite
@ kw_rewrite
Definition Lexer.h:64

mlir::pdll::Token::equal
@ equal
Definition Lexer.h:78

mlir::pdll::Token::error
@ error
Definition Lexer.h:37

mlir::pdll::Token::eof
@ eof
Markers.
Definition Lexer.h:36

mlir::pdll::Token::l_square
@ l_square
Definition Lexer.h:88

mlir::pdll::Token::underscore
@ underscore
Definition Lexer.h:90

mlir::pdll::Token::code_complete
@ code_complete
Token signifying a code completion location.
Definition Lexer.h:39

mlir::pdll::Token::arrow
@ arrow
Punctuation.
Definition Lexer.h:74

mlir::pdll::Token::kw_TypeRange
@ kw_TypeRange
Definition Lexer.h:67

mlir::pdll::Token::kw_erase
@ kw_erase
Definition Lexer.h:55

mlir::pdll::Token::string
@ string
Definition Lexer.h:97

mlir::pdll::Token::kw_Op
@ kw_Op
Definition Lexer.h:59

mlir::pdll::Token::l_brace
@ l_brace
Definition Lexer.h:84

mlir::pdll::Token::kw_let
@ kw_let
Definition Lexer.h:56

mlir::pdll::Token::kw_op
@ kw_op
Definition Lexer.h:49

mlir::pdll::Token::kw_Rewrite
@ kw_Rewrite
Definition Lexer.h:65

mlir::pdll::Token::kw_ValueRange
@ kw_ValueRange
Definition Lexer.h:69

mlir::pdll::Token::identifier
@ identifier
Definition Lexer.h:94

mlir::pdll::Token::less
@ less
Paired punctuation.
Definition Lexer.h:82

mlir::pdll::Token::kw_type
@ kw_type
Definition Lexer.h:50

mlir::pdll::Token::kw_Type
@ kw_Type
Definition Lexer.h:66

mlir::pdll::Token::greater
@ greater
Definition Lexer.h:83

mlir::pdll::Token::kw_Constraint
@ kw_Constraint
Definition Lexer.h:57

mlir::pdll::Token::colon
@ colon
Definition Lexer.h:75

mlir::pdll::Token::kw_not
@ kw_not
Definition Lexer.h:58

mlir::pdll::Token::kw_with
@ kw_with
Definition Lexer.h:70

mlir::pdll::Token::kw_replace
@ kw_replace
Definition Lexer.h:62

mlir::pdll::Token::r_square
@ r_square
Definition Lexer.h:89

mlir::pdll::Token::kw_OpName
@ kw_OpName
Definition Lexer.h:60

mlir::pdll::Token::r_brace
@ r_brace
Definition Lexer.h:85

mlir::pdll::Token::string_block
@ string_block
Definition Lexer.h:96

mlir::pdll::Token::integer
@ integer
Definition Lexer.h:95

mlir::pdll::Token::comma
@ comma
Definition Lexer.h:76

mlir::pdll::Token::kw_Attr
@ kw_Attr
General keywords.
Definition Lexer.h:54

mlir::pdll::ast::DiagnosticEngine
This class manages the construction and emission of PDLL diagnostics.
Definition Diagnostic.h:139

mlir::pdll
Definition Context.h:16

mlir
Include the generated interface declarations.
Definition AliasAnalysis.h:19

mlir::StringSwitch
llvm::StringSwitch< T, R > StringSwitch
Definition LLVM.h:141