Skip to content

[LLVM][IR] Switch from manual pointer incrementation to function in Lexer #152103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/include/llvm/AsmParser/LLLexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,12 @@ namespace llvm {
private:
lltok::Kind LexToken();

// Returns the closest pointer after `Ptr` that is the end of a label.
// Returns nullptr if `Ptr` doesn't point into a label.
const char *getLabelTail(const char *Ptr);
int getNextChar();
const char *skipNChars(unsigned N);
void advancePositionTo(const char *Ptr);
void SkipLineComment();
bool SkipCComment();
lltok::Kind ReadString(lltok::Kind kind);
Expand Down
135 changes: 75 additions & 60 deletions llvm/lib/AsmParser/LLLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,15 +155,6 @@ static bool isLabelChar(char C) {
C == '.' || C == '_';
}

/// isLabelTail - Walk forward from \p CurPtr over label characters. If a ':'
/// is reached, return the pointer just past it; if a character that is not a
/// label character is reached first, return nullptr.
static const char *isLabelTail(const char *CurPtr) {
  for (;; ++CurPtr) {
    const char C = *CurPtr;
    // A colon terminates the label; hand back the position right after it.
    if (C == ':')
      return CurPtr + 1;
    // Anything outside the label alphabet means this is not a label.
    if (!isLabelChar(C))
      return nullptr;
  }
}

//===----------------------------------------------------------------------===//
// Lexer definition.
//===----------------------------------------------------------------------===//
Expand All @@ -174,19 +165,35 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
CurPtr = CurBuf.begin();
}

/// Walk forward from \p Ptr over label characters, never reading at or past
/// CurBuf.end(). Returns the pointer just past the terminating ':' when one is
/// found; returns nullptr if a non-label character is hit first or the end of
/// the buffer is reached without seeing a ':'.
const char *LLLexer::getLabelTail(const char *Ptr) {
  for (const char *BufEnd = CurBuf.end(); Ptr != BufEnd; ++Ptr) {
    const char C = *Ptr;
    // A colon terminates the label; hand back the position right after it.
    if (C == ':')
      return Ptr + 1;
    // Anything outside the label alphabet means this is not a label.
    if (!isLabelChar(C))
      return nullptr;
  }
  // Ran out of buffer before finding the ':'.
  return nullptr;
}

int LLLexer::getNextChar() {
char CurChar = *CurPtr++;
switch (CurChar) {
default: return (unsigned char)CurChar;
case 0:
// A nul character in the stream is either the end of the current buffer or
// a random nul in the file. Disambiguate that here.
if (CurPtr-1 != CurBuf.end())
return 0; // Just whitespace.

// Otherwise, return end of file.
--CurPtr; // Another call to lex will return EOF again.
if (CurPtr == CurBuf.end())
return EOF;
return *CurPtr++;
}

/// Call getNextChar() \p N times, discarding the characters it returns, and
/// return the resulting value of CurPtr.
const char *LLLexer::skipNChars(unsigned N) {
  for (unsigned Consumed = 0; Consumed != N; ++Consumed)
    getNextChar();
  return CurPtr;
}

/// Move CurPtr until it equals \p Ptr. A target behind the current position
/// is reached by decrementing CurPtr one step at a time; a target ahead of it
/// is reached by repeatedly calling getNextChar().
void LLLexer::advancePositionTo(const char *Ptr) {
  // Rewind: step backwards while we are past the target.
  while (CurPtr > Ptr)
    --CurPtr;

  // Advance: consume characters through getNextChar() until we arrive.
  while (CurPtr != Ptr)
    getNextChar();
}

Expand Down Expand Up @@ -215,13 +222,13 @@ lltok::Kind LLLexer::LexToken() {
case '%': return LexPercent();
case '"': return LexQuote();
case '.':
if (const char *Ptr = isLabelTail(CurPtr)) {
CurPtr = Ptr;
if (const char *Ptr = getLabelTail(CurPtr)) {
advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr-1);
return lltok::LabelStr;
}
if (CurPtr[0] == '.' && CurPtr[1] == '.') {
CurPtr += 2;
skipNChars(2);
return lltok::dotdotdot;
}
return lltok::Error;
Expand Down Expand Up @@ -298,15 +305,15 @@ lltok::Kind LLLexer::LexAt() {
}

lltok::Kind LLLexer::LexDollar() {
if (const char *Ptr = isLabelTail(TokStart)) {
CurPtr = Ptr;
if (const char *Ptr = getLabelTail(TokStart)) {
advancePositionTo(Ptr);
StrVal.assign(TokStart, CurPtr - 1);
return lltok::LabelStr;
}

// Handle DollarStringConstant: $\"[^\"]*\"
if (CurPtr[0] == '"') {
++CurPtr;
getNextChar();

while (true) {
int CurChar = getNextChar();
Expand Down Expand Up @@ -358,11 +365,11 @@ bool LLLexer::ReadVarName() {
if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_') {
++CurPtr;
getNextChar();
while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_')
++CurPtr;
getNextChar();

StrVal.assign(NameStart, CurPtr);
return true;
Expand All @@ -376,7 +383,8 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
if (!isdigit(static_cast<unsigned char>(CurPtr[0])))
return lltok::Error;

for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
getNextChar())
/*empty*/;

uint64_t Val = atoull(TokStart + 1, CurPtr);
Expand All @@ -389,7 +397,7 @@ lltok::Kind LLLexer::LexUIntID(lltok::Kind Token) {
lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
// Handle StringConstant: \"[^\"]*\"
if (CurPtr[0] == '"') {
++CurPtr;
getNextChar();

while (true) {
int CurChar = getNextChar();
Expand Down Expand Up @@ -435,7 +443,7 @@ lltok::Kind LLLexer::LexQuote() {
return kind;

if (CurPtr[0] == ':') {
++CurPtr;
getNextChar();
if (StringRef(StrVal).contains(0)) {
LexError("NUL character is not allowed in names");
kind = lltok::Error;
Expand All @@ -455,11 +463,11 @@ lltok::Kind LLLexer::LexExclaim() {
if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
++CurPtr;
getNextChar();
while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
CurPtr[0] == '-' || CurPtr[0] == '$' ||
CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
++CurPtr;
getNextChar();

StrVal.assign(TokStart+1, CurPtr); // Skip !
UnEscapeLexed(StrVal);
Expand Down Expand Up @@ -495,7 +503,7 @@ lltok::Kind LLLexer::LexIdentifier() {
const char *IntEnd = CurPtr[-1] == 'i' ? nullptr : StartChar;
const char *KeywordEnd = nullptr;

for (; isLabelChar(*CurPtr); ++CurPtr) {
for (; isLabelChar(*CurPtr); getNextChar()) {
// If we decide this is an integer, remember the end of the sequence.
if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
IntEnd = CurPtr;
Expand All @@ -507,15 +515,16 @@ lltok::Kind LLLexer::LexIdentifier() {
// If we stopped due to a colon, unless we were directed to ignore it,
// this really is a label.
if (!IgnoreColonInIdentifiers && *CurPtr == ':') {
StrVal.assign(StartChar-1, CurPtr++);
StrVal.assign(StartChar - 1, CurPtr);
getNextChar();
return lltok::LabelStr;
}

// Otherwise, this wasn't a label. If this was valid as an integer type,
// return it.
if (!IntEnd) IntEnd = CurPtr;
if (IntEnd != StartChar) {
CurPtr = IntEnd;
advancePositionTo(IntEnd);
uint64_t NumBits = atoull(StartChar, CurPtr);
if (NumBits < IntegerType::MIN_INT_BITS ||
NumBits > IntegerType::MAX_INT_BITS) {
Expand All @@ -528,7 +537,7 @@ lltok::Kind LLLexer::LexIdentifier() {

// Otherwise, this was a letter sequence. See which keyword this is.
if (!KeywordEnd) KeywordEnd = CurPtr;
CurPtr = KeywordEnd;
advancePositionTo(KeywordEnd);
--StartChar;
StringRef Keyword(StartChar, CurPtr - StartChar);

Expand Down Expand Up @@ -1042,7 +1051,7 @@ lltok::Kind LLLexer::LexIdentifier() {
StringRef HexStr(TokStart + 3, len);
if (!all_of(HexStr, isxdigit)) {
// Bad token, return it as an error.
CurPtr = TokStart+3;
advancePositionTo(TokStart + 3);
return lltok::Error;
}
APInt Tmp(bits, HexStr, 16);
Expand All @@ -1055,12 +1064,12 @@ lltok::Kind LLLexer::LexIdentifier() {

// If this is "cc1234", return this as just "cc".
if (TokStart[0] == 'c' && TokStart[1] == 'c') {
CurPtr = TokStart+2;
advancePositionTo(TokStart + 2);
return lltok::kw_cc;
}

// Finally, if this isn't known, return an error.
CurPtr = TokStart+1;
advancePositionTo(TokStart + 1);
return lltok::Error;
}

Expand All @@ -1073,24 +1082,25 @@ lltok::Kind LLLexer::LexIdentifier() {
/// HexHalfConstant 0xH[0-9A-Fa-f]+
/// HexBFloatConstant 0xR[0-9A-Fa-f]+
lltok::Kind LLLexer::Lex0x() {
CurPtr = TokStart + 2;
advancePositionTo(TokStart + 2);

char Kind;
if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H' ||
CurPtr[0] == 'R') {
Kind = *CurPtr++;
Kind = *CurPtr;
getNextChar();
} else {
Kind = 'J';
}

if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) {
// Bad token, return it as an error.
CurPtr = TokStart+1;
advancePositionTo(TokStart + 1);
return lltok::Error;
}

while (isxdigit(static_cast<unsigned char>(CurPtr[0])))
++CurPtr;
getNextChar();

if (Kind == 'J') {
// HexFPConstant - Floating point constant represented in IEEE format as a
Expand Down Expand Up @@ -1145,9 +1155,9 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
if (!isdigit(static_cast<unsigned char>(TokStart[0])) &&
!isdigit(static_cast<unsigned char>(CurPtr[0]))) {
// Okay, this is not a number after the -, it's probably a label.
if (const char *End = isLabelTail(CurPtr)) {
if (const char *End = getLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
CurPtr = End;
advancePositionTo(End);
return lltok::LabelStr;
}

Expand All @@ -1157,13 +1167,13 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
// At this point, it is either a label, int or fp constant.

// Skip digits, we have at least one.
for (; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
for (; isdigit(static_cast<unsigned char>(CurPtr[0])); getNextChar())
/*empty*/;

// Check if this is a fully-numeric label:
if (isdigit(TokStart[0]) && CurPtr[0] == ':') {
uint64_t Val = atoull(TokStart, CurPtr);
++CurPtr; // Skip the colon.
getNextChar(); // Skip the colon.
if ((unsigned)Val != Val)
LexError("invalid value number (too large)");
UIntVal = unsigned(Val);
Expand All @@ -1172,9 +1182,9 @@ lltok::Kind LLLexer::LexDigitOrNegative() {

// Check to see if this really is a string label, e.g. "-1:".
if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
if (const char *End = isLabelTail(CurPtr)) {
if (const char *End = getLabelTail(CurPtr)) {
StrVal.assign(TokStart, End-1);
CurPtr = End;
advancePositionTo(End);
return lltok::LabelStr;
}
}
Expand All @@ -1188,17 +1198,19 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
return lltok::APSInt;
}

++CurPtr;
getNextChar();

// Skip over [0-9]*([eE][-+]?[0-9]+)?
while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
while (isdigit(static_cast<unsigned char>(CurPtr[0])))
getNextChar();

if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
((CurPtr[1] == '-' || CurPtr[1] == '+') &&
isdigit(static_cast<unsigned char>(CurPtr[2])))) {
CurPtr += 2;
while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
skipNChars(2);
while (isdigit(static_cast<unsigned char>(CurPtr[0])))
getNextChar();
}
}

Expand All @@ -1216,26 +1228,29 @@ lltok::Kind LLLexer::LexPositive() {
return lltok::Error;

// Skip digits.
for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
for (getNextChar(); isdigit(static_cast<unsigned char>(CurPtr[0]));
getNextChar())
/*empty*/;

// At this point, we need a '.'.
if (CurPtr[0] != '.') {
CurPtr = TokStart+1;
advancePositionTo(TokStart + 1);
return lltok::Error;
}

++CurPtr;
getNextChar();

// Skip over [0-9]*([eE][-+]?[0-9]+)?
while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
while (isdigit(static_cast<unsigned char>(CurPtr[0])))
getNextChar();

if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
((CurPtr[1] == '-' || CurPtr[1] == '+') &&
isdigit(static_cast<unsigned char>(CurPtr[2])))) {
CurPtr += 2;
while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
skipNChars(2);
while (isdigit(static_cast<unsigned char>(CurPtr[0])))
getNextChar();
}
}

Expand Down