What is lexical analysis?
Overview of lexical analysis
The lexical analyzer is the compiler's first phase, which converts the input into a series of tokens or lexemes.
There are different types of tokens:
- Keywords
- Operators
- Identifiers
- Constants
- Special Characters
The Lexical Analyzer divides into three main functions.
- Removes comments and white space
- Tokenization
- Generates an error
How to clear comments and white space
Suppose we're writing the following code in the C programming language:
// Declaring a variable xint x;// Initializing the variable xx=2+3;return x;
The lexical analyzer removes the comments and unnecessary white space from the code.
int x;x=2+3;return x;
Now lexical analyzer will perform tokenization as its next task.
Tokenization
In tokenization, the lexical analyzer will create a symbol table and place all the identifiers.
The valid tokens created from the code above are as follows:
Tokens
"int" | Keyword |
"x" | Identifier |
";" | Speical Character |
"=" | Operator |
"+" | Operator |
"2" | Constant |
"3" | Constant |
"return" | Keyword |
How to generate an error
If there is any invalid token, we can identify it using a lexical analyzer, which generates the error messages by providing the row and column numbers.
The following line in the code of the C programming language helps with this:
integer x = 2;
It will generate the error message as there is no integer keyword present. And also, if it is an identifier, then there can't be two back-to-back (integer and x) identifiers with space between them; thus, the lexical analysis will generate an error message with the lines row and column number.
Code
Following is the program of lexical analyzer which generates valid as well as invalid tokens of C programming language.
#include <stdbool.h>#include <stdio.h>#include <string.h>#include <stdlib.h>//Following are the functions for checking different tokensbool isSpecial(char c) {if (c == ' ' || c == '+' || c == '-' || c == '*' || c == '/' || c == ',' || c == ';' || c == '>' || c == '<' || c == '=' || c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}') return (true);return (false);}bool isOperator(char c){if (c == '+' || c == '-' || c == '*' || c == '/' || c == '>' || c == '<' || c == '=') return (true);return (false);}bool isIdentifier(char* in){if (in[0] == '0' || in[0] == '1' || in[0] == '2' || in[0] == '3' || in[0] == '4' || in[0] == '5' || in[0] == '6' || in[0] == '7' || in[0] == '8' || in[0] == '9' || isSpecial(in[0]) == true) return (false);return (true);}bool isKeyword(char* in) {if (!strcmp(in, "if") || !strcmp(in, "else") || !strcmp(in, "while") || !strcmp(in, "do") || !strcmp(in, "break") || !strcmp(in, "continue") || !strcmp(in, "int") || !strcmp(in, "double") || !strcmp(in, "float") || !strcmp(in, "return") || !strcmp(in, "char") || !strcmp(in, "case") || !strcmp(in, "char") || !strcmp(in, "sizeof") || !strcmp(in, "long") || !strcmp(in, "short") || !strcmp(in, "typedef") || !strcmp(in, "switch") || !strcmp(in, "unsigned") || !strcmp(in, "void") || !strcmp(in, "static") || !strcmp(in, "inuct") || !strcmp(in, "goto")) return (true);return (false);}bool isInteger(char* in) {int i, len = strlen(in);if (len == 0) return (false);for (i = 0; i < len; i++) {if (in[i] != '0' && in[i] != '1' && in[i] != '2'&& in[i] != '3' && in[i] != '4' && in[i] != '5' && in[i] != '6' && in[i] != '7' && in[i] != '8' && in[i] != '9' || (in[i] == '-' && i > 0)) return (false);}return (true);}bool isRealNumber(char* in) {int i, len = strlen(in);bool hasDecimal = false;if (len == 0) return (false);for (i = 0; i < len; i++) {if (in[i] != '0' && in[i] != '1' && in[i] != '2' && in[i] != '3' && in[i] != '4' && in[i] != '5' && in[i] != '6' && in[i] != '7' && in[i] != '8' && in[i] != '9' && in[i] != '.' || (in[i] == '-' && i > 0)) return (false);if (in[i] == '.') hasDecimal = true;}return (hasDecimal);}char* subString(char* in, int left, int right) {int i;char* subStr = (char*)malloc( sizeof(char) * (right - left + 2));for (i = left; i <= right; i++)subStr[i - left] = in[i];subStr[right - left + 1] = '\0';return (subStr);}//This is the main driver code for checking entered stringvoid lexicalAnalyzer(char* in) {int left = 0, right = 0;int length = strlen(in);while (right <= length && left <= right) {if (isSpecial(in[right]) == false)right++;if (isSpecial(in[right]) == true && left == right) {if (isOperator(in[right]) == true) printf("Valid Operator : '%c'\n", in[right]);else printf("Valid Special Character : '%c'\n", in[right]);right++;left = right;} else if (isSpecial(in[right]) == true && left != right || (right == length && left != right)) {char* subStr = subString(in, left, right - 1);if (isKeyword(subStr) == true)printf("Valid Keyword : '%s'\n", subStr);else if ((isInteger(subStr) == true)|| (isRealNumber(subStr) == true))printf("Valid Constant : '%s'\n", subStr);else if (isIdentifier(subStr) == true && isSpecial(in[right - 1]) == false)printf("Valid Identifier : '%s'\n", subStr);else if (isIdentifier(subStr) == false && isOperator(in[right - 1]) == false)printf("Invalid Identifier : '%s'\n", subStr);left = right;}}return;}//Driver Codeint main(){char in[1000];scanf("%[^\n]%*c", in);printf("Entered Code is : '%s' \n", in);lexicalAnalyzer(in);return (0);}
Enter the input below
In the code above:
- Line 7–40: We have created six different functions to identify which of the token (keywords, operators, identifiers, constants, special characters) is in the input.
- Line 41-48: In this substring function, we are returning the substring of the given input string
char *inby providing its left and right indexes. - Line 51-76: This is the primary driver function that iterates over the input string
inand checks all the valid tokens. If all the conditions are false, it identifies that token as an invalid identifier.
Free Resources