scripts/genksyms/lex.l

0001 /* SPDX-License-Identifier: GPL-2.0-or-later */
0002 /*
0003  * Lexical analysis for genksyms.
0004  * Copyright 1996, 1997 Linux International.
0005  *
0006  * New implementation contributed by Richard Henderson <rth@tamu.edu>
0007  * Based on original work by Bjorn Ekwall <bj0rn@blox.se>
0008  *
0009  * Taken from Linux modutils 2.4.22.
0010  */
0011
0012 %{
0013 /* C prologue for the generated scanner.  */
0014 #include <limits.h>
0015 #include <stdlib.h>
0016 #include <string.h>
0017 #include <ctype.h>
0018
0019 #include "genksyms.h"
0020 #include "parse.tab.h"
0021
0022 /* We've got a two-level lexer here.  We let flex do basic tokenization
0023    and then we categorize those basic tokens in the second stage.  */
0024 #define YY_DECL         static int yylex1(void)
0025 /* yylex1() is the flex-generated scanner; the hand-written yylex() below calls it.  */
0026 %}
0027 /* Identifiers; '$' is allowed anywhere in a name.  */
0028 IDENT                   [A-Za-z_\$][A-Za-z0-9_\$]*
0029 /* Integer constants: octal, decimal or hex, with an optional u, l, ul or lu suffix (either case).  NOTE(review): ll/LL is not matched, so 1ULL lexes as INT "1UL" followed by IDENT "L".  */
0030 O_INT                   0[0-7]*
0031 D_INT                   [1-9][0-9]*
0032 X_INT                   0[Xx][0-9A-Fa-f]+
0033 I_SUF                   [Uu]|[Ll]|[Uu][Ll]|[Ll][Uu]
0034 INT                     ({O_INT}|{D_INT}|{X_INT}){I_SUF}?
0035 /* Floating constants: a fraction with an optional exponent, or digits with a mandatory exponent; optional f/l suffix.  */
0036 FRAC                    ([0-9]*\.[0-9]+)|([0-9]+\.)
0037 EXP                     [Ee][+-]?[0-9]+
0038 F_SUF                   [FfLl]
0039 REAL                    ({FRAC}{EXP}?{F_SUF}?)|([0-9]+{EXP}{F_SUF}?)
0040 /* String and character constants, optionally prefixed with L; a backslash escapes the next character.  */
0041 STRING                  L?\"([^\\\"]*\\.)*[^\\\"]*\"
0042 CHAR                    L?\'([^\\\']*\\.)*[^\\\']*\'
0043 /* Two-character operators: the <op>= forms listed in the class, plus &&, ||, ->, << and >>.  NOTE(review): !=, ++ and -- are not listed and lex as single characters.  */
0044 MC_TOKEN                ([~%^&*+=|<>/-]=)|(&&)|("||")|(->)|(<<)|(>>)
0045
0046 /* We don't do multiple input files.  */
0047 %option noyywrap
0048 /* input() is not used by this scanner, so do not generate it.  */
0049 %option noinput
0050
0051 %%
0052  /* First-stage rules: each action returns a raw token code to yylex() or discards the text.  */
0053  /* A line marker (# <int> "file" ...) becomes FILENAME; any other line starting with '#' is skipped and counted.  */
0054  /* Keep track of our location in the original source files.  */
0055 ^#[ \t]+{INT}[ \t]+\"[^\"\n]+\".*\n     return FILENAME;
0056 ^#.*\n                                  cur_line++;
0057 \n                                      cur_line++;
0058  /* The FILENAME rule eats its newline without counting it; yylex() takes cur_line from the marker instead.  */
0059  /* Ignore all other whitespace.  */
0060 [ \t\f\v\r]+                            ;
0061
0062  /* Literals and identifiers; yylex() decides whether an IDENT is a reserved word or a typedef name.  */
0063 {STRING}                                return STRING;
0064 {CHAR}                                  return CHAR;
0065 {IDENT}                                 return IDENT;
0066
0067  /* The Pedant requires that the other C multi-character tokens be
0068     recognized as tokens.  We don't actually use them since we don't
0069     parse expressions, but we do want whitespace to be arranged
0070     around them properly.  */
0071 {MC_TOKEN}                              return OTHER;
0072 {INT}                                   return INT;
0073 {REAL}                                  return REAL;
0074  /* Three dots are one token.  */
0075 "..."                                   return DOTS;
0076
0077  /* All other tokens are single characters.  */
0078 .                                       return yytext[0];  /* the character itself is the token code */
0079
0080
0081 %%
0082
0083 /* Bring in the keyword recognizer.  */
0084
0085 #include "keywords.c"
0086
0087
0088 /* Macros to append to our phrase collection list.  */
0089
0090 /*
0091  * _APP(T, L) copies L+1 bytes from T (the text and its terminating NUL) into the unfilled node next_node, which becomes cur_node, and chains a newly allocated unfilled node in front of it.  L is evaluated twice.
0092  * We mark any token that equals a known enumerator as SYM_ENUM_CONST.
0093  * The parser will change this for struct and union tags later,
0094  * the only problem is struct and union members:
0095  *    enum e { a, b }; struct s { int a, b; }
0096  * but in this case, the only effect will be, that the ABI checksums become
0097  * more volatile, which is acceptable. Also, such collisions are quite rare,
0098  * so far it was only observed in include/linux/telephony.h.
0099  */
0099 #define _APP(T,L)       do {                                               \
0100                           cur_node = next_node;                            \
0101                           next_node = xmalloc(sizeof(*next_node));         \
0102                           next_node->next = cur_node;                      \
0103                           cur_node->string = memcpy(xmalloc((L)+1), (T), (L)+1); \
0104                           cur_node->tag =                                  \
0105                             find_symbol(cur_node->string, SYM_ENUM_CONST, 1)?\
0106                             SYM_ENUM_CONST : SYM_NORMAL ;                  \
0107                           cur_node->in_source_file = in_source_file;       \
0108                         } while (0)
0109 /* Append the text of the token flex matched last.  */
0110 #define APP             _APP(yytext, yyleng)
0111
0112
0113 /* The second stage lexer.  Here we incorporate knowledge of the state
0114    of the parser to tailor the tokens that are returned.  Returns 0 at end of input; line markers (FILENAME) are consumed here.
0115    Nested constructs are gathered into single *_PHRASE tokens; yylval is set to the address of the link to the newest appended node.  */
0116 int
0117 yylex(void)
0118 {
0119   static enum {
0120     ST_NOTSTARTED, ST_NORMAL, ST_ATTRIBUTE, ST_ASM, ST_TYPEOF, ST_TYPEOF_1,
0121     ST_BRACKET, ST_BRACE, ST_EXPRESSION, ST_STATIC_ASSERT,
0122   } lexstate = ST_NOTSTARTED;
0123   /* Countdowns set on struct/union/enum and decremented once per returned token (at fini).  */
0124   static int suppress_type_lookup, dont_want_brace_phrase;
0125   static struct string_list *next_node;  /* unfilled node; its ->next is the newest appended token */
0126   static char *source_file;  /* file name taken from the first line marker seen */
0127   /* count is the nesting depth of the construct being collected; it restarts at 0 on every call.  */
0128   int token, count = 0;
0129   struct string_list *cur_node;
0130   /* First call: allocate the initial unfilled list node.  */
0131   if (lexstate == ST_NOTSTARTED)
0132     {
0133       next_node = xmalloc(sizeof(*next_node));
0134       next_node->next = NULL;
0135       lexstate = ST_NORMAL;
0136     }
0137   /* Come back here whenever the token just read is absorbed rather than returned.  */
0138 repeat:
0139   token = yylex1();
0140   /* The flex scanner returns 0 at end of input.  */
0141   if (token == 0)
0142     return 0;
0143   else if (token == FILENAME)
0144     {
0145       char *file, *e;
0146
0147       /* Save the filename and line number for later error messages.  */
0148
0149       if (cur_filename)
0150         free(cur_filename);
0151       /* The FILENAME rule guarantees two double quotes; cut the name out of yytext in place.  */
0152       file = strchr(yytext, '\"')+1;
0153       e = strchr(file, '\"');
0154       *e = '\0';
0155       cur_filename = memcpy(xmalloc(e-file+1), file, e-file+1);
0156       cur_line = atoi(yytext+2);  /* skip '#' and one blank; atoi skips any further blanks */
0157       /* The first marker's name is remembered as source_file; in_source_file then says whether the current file matches it.  */
0158       if (!source_file) {
0159         source_file = xstrdup(cur_filename);
0160         in_source_file = 1;
0161       } else {
0162         in_source_file = (strcmp(cur_filename, source_file) == 0);
0163       }
0164
0165       goto repeat;
0166     }
0167   /* Dispatch on the collection state; the collecting states append tokens and loop until the construct closes.  */
0168   switch (lexstate)
0169     {
0170     case ST_NORMAL:
0171       switch (token)
0172         {
0173         case IDENT:
0174           APP;  /* appended first, so a keyword that opens a phrase is the phrase's first element */
0175           {
0176             int r = is_reserved_word(yytext, yyleng);
0177             if (r >= 0)
0178               {
0179                 switch (token = r)
0180                   {
0181                   case ATTRIBUTE_KEYW:
0182                     lexstate = ST_ATTRIBUTE;
0183                     count = 0;
0184                     goto repeat;
0185                   case ASM_KEYW:
0186                     lexstate = ST_ASM;
0187                     count = 0;
0188                     goto repeat;
0189                   case TYPEOF_KEYW:
0190                     lexstate = ST_TYPEOF;
0191                     count = 0;
0192                     goto repeat;
0193                   /* Both countdowns also tick at fini for this token: type lookup is skipped for the next returned token, and a '{' among the next two returned tokens is returned as itself.  */
0194                   case STRUCT_KEYW:
0195                   case UNION_KEYW:
0196                   case ENUM_KEYW:
0197                     dont_want_brace_phrase = 3;
0198                     suppress_type_lookup = 2;
0199                     goto fini;
0200                   /* No extra state; just skip the typedef lookup below.  */
0201                   case EXPORT_SYMBOL_KEYW:
0202                       goto fini;
0203
0204                   case STATIC_ASSERT_KEYW:
0205                     lexstate = ST_STATIC_ASSERT;
0206                     count = 0;
0207                     goto repeat;
0208                   }
0209               }
0210             if (!suppress_type_lookup)  /* also reached for reserved words without a case above */
0211               {
0212                 if (find_symbol(yytext, SYM_TYPEDEF, 1))
0213                   token = TYPE;
0214               }
0215           }
0216           break;
0217           /* '[' opens a BRACKET_PHRASE.  */
0218         case '[':
0219           APP;
0220           lexstate = ST_BRACKET;
0221           count = 1;
0222           goto repeat;
0223           /* '{' opens a BRACE_PHRASE unless dont_want_brace_phrase is still counting down.  */
0224         case '{':
0225           APP;
0226           if (dont_want_brace_phrase)
0227             break;
0228           lexstate = ST_BRACE;
0229           count = 1;
0230           goto repeat;
0231           /* The token is returned as itself; what follows is collected by ST_EXPRESSION on the next call.  */
0232         case '=': case ':':
0233           APP;
0234           lexstate = ST_EXPRESSION;
0235           break;
0236
0237         default:
0238           APP;
0239           break;
0240         }
0241       break;
0242       /* Append every token until a ')' brings the parenthesis depth back to 0.  */
0243     case ST_ATTRIBUTE:
0244       APP;
0245       switch (token)
0246         {
0247         case '(':
0248           ++count;
0249           goto repeat;
0250         case ')':
0251           if (--count == 0)
0252             {
0253               lexstate = ST_NORMAL;
0254               token = ATTRIBUTE_PHRASE;
0255               break;
0256             }
0257           goto repeat;
0258         default:
0259           goto repeat;
0260         }
0261       break;
0262       /* Same scheme as ST_ATTRIBUTE, producing ASM_PHRASE.  */
0263     case ST_ASM:
0264       APP;
0265       switch (token)
0266         {
0267         case '(':
0268           ++count;
0269           goto repeat;
0270         case ')':
0271           if (--count == 0)
0272             {
0273               lexstate = ST_NORMAL;
0274               token = ASM_PHRASE;
0275               break;
0276             }
0277           goto repeat;
0278         default:
0279           goto repeat;
0280         }
0281       break;
0282       /* Token right after typeof's first '('.  A reserved word or typedef name is pushed back together with the '(' and TYPEOF_KEYW is returned instead of a phrase.  */
0283     case ST_TYPEOF_1:
0284       if (token == IDENT)
0285         {
0286           if (is_reserved_word(yytext, yyleng) >= 0
0287               || find_symbol(yytext, SYM_TYPEDEF, 1))
0288             {
0289               yyless(0);
0290               unput('(');
0291               lexstate = ST_NORMAL;
0292               token = TYPEOF_KEYW;
0293               break;
0294             }
0295           _APP("(", 1);  /* the '(' was not appended when it was read in ST_TYPEOF */
0296         }
0297         lexstate = ST_TYPEOF;  /* NOTE(review): a non-IDENT token arrives here without "(" having been appended -- confirm this is intended */
0298         /* FALLTHRU */
0299
0300     case ST_TYPEOF:
0301       switch (token)
0302         {
0303         case '(':
0304           if ( ++count == 1 )  /* outermost '(': defer appending it until the next token is known */
0305             lexstate = ST_TYPEOF_1;
0306           else
0307             APP;
0308           goto repeat;
0309         case ')':
0310           APP;
0311           if (--count == 0)
0312             {
0313               lexstate = ST_NORMAL;
0314               token = TYPEOF_PHRASE;
0315               break;
0316             }
0317           goto repeat;
0318         default:
0319           APP;
0320           goto repeat;
0321         }
0322       break;
0323       /* Entered with count == 1 after '['; ends at the matching ']'.  */
0324     case ST_BRACKET:
0325       APP;
0326       switch (token)
0327         {
0328         case '[':
0329           ++count;
0330           goto repeat;
0331         case ']':
0332           if (--count == 0)
0333             {
0334               lexstate = ST_NORMAL;
0335               token = BRACKET_PHRASE;
0336               break;
0337             }
0338           goto repeat;
0339         default:
0340           goto repeat;
0341         }
0342       break;
0343       /* Entered with count == 1 after '{'; ends at the matching '}'.  */
0344     case ST_BRACE:
0345       APP;
0346       switch (token)
0347         {
0348         case '{':
0349           ++count;
0350           goto repeat;
0351         case '}':
0352           if (--count == 0)
0353             {
0354               lexstate = ST_NORMAL;
0355               token = BRACE_PHRASE;
0356               break;
0357             }
0358           goto repeat;
0359         default:
0360           goto repeat;
0361         }
0362       break;
0363       /* Collect the tokens that follow '=' or ':'; the phrase ends at an unnested ',', ';' or '}', which is pushed back.  */
0364     case ST_EXPRESSION:
0365       switch (token)
0366         {
0367         case '(': case '[': case '{':
0368           ++count;
0369           APP;
0370           goto repeat;
0371         case '}':
0372           /* is this the last line of an enum declaration? */
0373           if (count == 0)
0374             {
0375               /* Put back the token we just read so's we can find it again
0376                  after registering the expression.  */
0377               unput(token);
0378
0379               lexstate = ST_NORMAL;
0380               token = EXPRESSION_PHRASE;
0381               break;
0382             }
0383           /* FALLTHRU */
0384         case ')': case ']':
0385           --count;
0386           APP;
0387           goto repeat;
0388         case ',': case ';':
0389           if (count == 0)
0390             {
0391               /* Put back the token we just read so's we can find it again
0392                  after registering the expression.  */
0393               unput(token);
0394
0395               lexstate = ST_NORMAL;
0396               token = EXPRESSION_PHRASE;
0397               break;
0398             }
0399           APP;
0400           goto repeat;
0401         default:
0402           APP;
0403           goto repeat;
0404         }
0405       break;
0406       /* Same scheme as ST_ATTRIBUTE, producing STATIC_ASSERT_PHRASE.  */
0407     case ST_STATIC_ASSERT:
0408       APP;
0409       switch (token)
0410         {
0411         case '(':
0412           ++count;
0413           goto repeat;
0414         case ')':
0415           if (--count == 0)
0416             {
0417               lexstate = ST_NORMAL;
0418               token = STATIC_ASSERT_PHRASE;
0419               break;
0420             }
0421           goto repeat;
0422         default:
0423           goto repeat;
0424         }
0425       break;
0426       /* Not reachable with the values this function assigns to lexstate.  */
0427     default:
0428       exit(1);
0429     }
0430 fini:
0431   /* Common exit for every returned token: tick the countdowns and publish the list.  */
0432   if (suppress_type_lookup > 0)
0433     --suppress_type_lookup;
0434   if (dont_want_brace_phrase > 0)
0435     --dont_want_brace_phrase;
0436   /* next_node is the unfilled node, so its ->next is the newest appended token.  */
0437   yylval = &next_node->next;
0438
0439   return token;
0440 }