Trusted answers to developer questions
Trusted Answers to Developer Questions

Related Tags

c

What is lexical analysis?

Ahmed Yasser

Grokking Modern System Design Interview for Engineers & Managers

Ace your System Design Interview and take your career to the next level. Learn to handle the design of applications like Netflix, Quora, Facebook, Uber, and many more in a 45-min interview. Learn the RESHADED framework for architecting web-scale applications by determining requirements, constraints, and assumptions before diving into a step-by-step design process.

Overview of lexical analysis

The lexical analyzer is the compiler's first phase, which converts the input into a series of tokens or lexemes.

The first phase of the compiler

There are different types of tokens:

  1. Keywords
  2. Operators
  3. Identifiers
  4. Constants
  5. Special Characters

The Lexical Analyzer divides into three main functions.

  1. Removes comments and white space
  2. Tokenization
  3. Generates an error

How to clear comments and white space

Suppose we're writing the following code in the C programming language:

// Declaring a variable x
int x;
// Initializing the variable x
x=2+3;
return x;

The lexical analyzer removes the comments and unnecessary white space from the code.

int x;x=2+3;return x;

Now lexical analyzer will perform tokenization as its next task.

Tokenization

In tokenization, the lexical analyzer will create a symbol table and place all the identifiers.

The valid tokens created from the code above are as follows:

Tokens



"int"

Keyword

"x"

Identifier

";"

Speical Character

"="

Operator

"+"

Operator

"2"

Constant

"3"

Constant

"return"

Keyword

How to generate an error

If there is any invalid token, we can identify it using a lexical analyzer, which generates the error messages by providing the row and column numbers.

The following line in the code of the C programming language helps with this:

integer x = 2;

It will generate the error message as there is no integer keyword present. And also, if it is an identifier, then there can't be two back-to-back (integer and x) identifiers with space between them; thus, the lexical analysis will generate an error message with the lines row and column number.

Code

Following is the program of lexical analyzer which generates valid as well as invalid tokens of C programming language.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
//Following are the functions for checking different tokens
bool isSpecial(char c) {
if (c == ' ' || c == '+' || c == '-' || c == '*' || c == '/' || c == ',' || c == ';' || c == '>' || c == '<' || c == '=' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') return (true);
return (false);
}
bool isOperator(char c){
if (c == '+' || c == '-' || c == '*' || c == '/' || c == '>' || c == '<' || c == '=') return (true);
return (false);
}
bool isIdentifier(char* in){
if (in[0] == '0' || in[0] == '1' || in[0] == '2' || in[0] == '3' || in[0] == '4' || in[0] == '5' || in[0] == '6' || in[0] == '7' || in[0] == '8' || in[0] == '9' || isSpecial(in[0]) == true) return (false);
return (true);
}
bool isKeyword(char* in) {
if (!strcmp(in, "if") || !strcmp(in, "else") || !strcmp(in, "while") || !strcmp(in, "do") || !strcmp(in, "break") || !strcmp(in, "continue") || !strcmp(in, "int") || !strcmp(in, "double") || !strcmp(in, "float") || !strcmp(in, "return") || !strcmp(in, "char") || !strcmp(in, "case") || !strcmp(in, "char") || !strcmp(in, "sizeof") || !strcmp(in, "long") || !strcmp(in, "short") || !strcmp(in, "typedef") || !strcmp(in, "switch") || !strcmp(in, "unsigned") || !strcmp(in, "void") || !strcmp(in, "static") || !strcmp(in, "inuct") || !strcmp(in, "goto")) return (true);
return (false);
}
bool isInteger(char* in) {
int i, len = strlen(in);
if (len == 0) return (false);
for (i = 0; i < len; i++) {
if (in[i] != '0' && in[i] != '1' && in[i] != '2'&& in[i] != '3' && in[i] != '4' && in[i] != '5' && in[i] != '6' && in[i] != '7' && in[i] != '8' && in[i] != '9' || (in[i] == '-' && i > 0)) return (false);
}
return (true);
}
bool isRealNumber(char* in) {
int i, len = strlen(in);
bool hasDecimal = false;
if (len == 0) return (false);
for (i = 0; i < len; i++) {
if (in[i] != '0' && in[i] != '1' && in[i] != '2' && in[i] != '3' && in[i] != '4' && in[i] != '5' && in[i] != '6' && in[i] != '7' && in[i] != '8' && in[i] != '9' && in[i] != '.' || (in[i] == '-' && i > 0)) return (false);
if (in[i] == '.') hasDecimal = true;
}
return (hasDecimal);
}
char* subString(char* in, int left, int right) {
int i;
char* subStr = (char*)malloc( sizeof(char) * (right - left + 2));
for (i = left; i <= right; i++)
subStr[i - left] = in[i];
subStr[right - left + 1] = '\0';
return (subStr);
}
//This is the main driver code for checking entered string
void lexicalAnalyzer(char* in) {
int left = 0, right = 0;
int length = strlen(in);
while (right <= length && left <= right) {
if (isSpecial(in[right]) == false)
right++;
if (isSpecial(in[right]) == true && left == right) {
if (isOperator(in[right]) == true) printf("Valid Operator : '%c'\n", in[right]);
else printf("Valid Special Character : '%c'\n", in[right]);
right++;
left = right;
} else if (isSpecial(in[right]) == true && left != right || (right == length && left != right)) {
char* subStr = subString(in, left, right - 1);
if (isKeyword(subStr) == true)
printf("Valid Keyword : '%s'\n", subStr);
else if ((isInteger(subStr) == true)|| (isRealNumber(subStr) == true))
printf("Valid Constant : '%s'\n", subStr);
else if (isIdentifier(subStr) == true && isSpecial(in[right - 1]) == false)
printf("Valid Identifier : '%s'\n", subStr);
else if (isIdentifier(subStr) == false && isOperator(in[right - 1]) == false)
printf("Invalid Identifier : '%s'\n", subStr);
left = right;
}
}
return;
}
//Driver Code
int main(){
char in[1000];
scanf("%[^\n]%*c", in);
printf("Entered Code is : '%s' \n", in);
lexicalAnalyzer(in);
return (0);
}

Enter the input below

Lexical analayzer token generator

In the code above:

  • Line 7–40: We have created six different functions to identify which of the token (keywords, operators, identifiers, constants, special characters) is in the input.
  • Line 41-48: In this substring function, we are returning the substring of the given input string char *in by providing its left and right indexes.
  • Line 51-76: This is the primary driver function that iterates over the input string in and checks all the valid tokens. If all the conditions are false, it identifies that token as an invalid identifier.

RELATED TAGS

c

CONTRIBUTOR

Ahmed Yasser
Copyright ©2022 Educative, Inc. All rights reserved

Grokking Modern System Design Interview for Engineers & Managers

Ace your System Design Interview and take your career to the next level. Learn to handle the design of applications like Netflix, Quora, Facebook, Uber, and many more in a 45-min interview. Learn the RESHADED framework for architecting web-scale applications by determining requirements, constraints, and assumptions before diving into a step-by-step design process.

Keep Exploring