From 02a9d8a8b07530f50d27b6158329dd8d218d298b Mon Sep 17 00:00:00 2001
From: "C. McEnroe"
Date: Wed, 20 Jan 2021 18:01:04 -0500
Subject: Add messy sh lexer

Surprisingly seems to work for everything I looked at in my repos.
---
 bin/Makefile     |   1 +
 bin/hilex.c      |   1 +
 bin/hilex.h      |   1 +
 bin/man1/hilex.1 |  22 +++++--
 bin/sh.l         | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 196 insertions(+), 4 deletions(-)
 create mode 100644 bin/sh.l

diff --git a/bin/Makefile b/bin/Makefile
index b133232f..7e6b0b7d 100644
--- a/bin/Makefile
+++ b/bin/Makefile
@@ -83,6 +83,7 @@ OBJS.hilex += c11.o
 OBJS.hilex += hilex.o
 OBJS.hilex += make.o
 OBJS.hilex += mdoc.o
+OBJS.hilex += sh.o
 
 hilex: ${OBJS.hilex}
 	${CC} ${LDFLAGS} ${OBJS.$@} ${LDLIBS.$@} -o $@
diff --git a/bin/hilex.c b/bin/hilex.c
index 4952c7ad..8a03eb80 100644
--- a/bin/hilex.c
+++ b/bin/hilex.c
@@ -53,6 +53,7 @@ static const struct {
 	{ &LexC, "c", "[.][chlmy]$", NULL },
 	{ &LexMake, "make", "[.](mk|am)$|^Makefile$", NULL },
 	{ &LexMdoc, "mdoc", "[.][1-9]$", "^[.]Dd" },
+	{ &LexSh, "sh", "[.]sh$|^[.](profile|shrc)$", "^#!/bin/sh" },
 	{ &LexText, "text", "[.]txt$", NULL },
 };
 
diff --git a/bin/hilex.h b/bin/hilex.h
index 2c080e20..882b5f95 100644
--- a/bin/hilex.h
+++ b/bin/hilex.h
@@ -47,3 +47,4 @@ struct Lexer {
 extern const struct Lexer LexC;
 extern const struct Lexer LexMake;
 extern const struct Lexer LexMdoc;
+extern const struct Lexer LexSh;
diff --git a/bin/man1/hilex.1 b/bin/man1/hilex.1
index ffa20bcf..80b3155b 100644
--- a/bin/man1/hilex.1
+++ b/bin/man1/hilex.1
@@ -1,4 +1,4 @@
-.Dd January 18, 2021
+.Dd January 20, 2021
 .Dt HILEX 1
 .Os
 .
@@ -167,9 +167,8 @@ Inferred for
 files.
 .
 .It Cm make
-The BSD
-.Xr make 1
-language.
+BSD
+.Xr make 1 .
 Inferred for
 .Pa Makefile ,
 .Pa *.mk
@@ -187,6 +186,21 @@ files
 and files starting with
 .Dq .Dd .
 .
+.It Cm sh
+POSIX
+.Xr sh 1 .
+Since lexical analysis of
+the shell command language
+is effectively impossible,
+this is best-effort only.
+Inferred for
+.Pa *.sh ,
+.Pa .profile ,
+.Pa .shrc
+files
+and files starting with
+.Dq #!/bin/sh .
+.
 .It Cm text
 Plain text.
 Inferred for
diff --git a/bin/sh.l b/bin/sh.l
new file mode 100644
index 00000000..77dd2518
--- /dev/null
+++ b/bin/sh.l
@@ -0,0 +1,175 @@
+/* Copyright (C) 2021 C. McEnroe
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+%option prefix="sh"
+%option noyywrap
+
+%{
+#include <assert.h> /* NOTE(review): header name inferred from the assert() calls -- confirm */
+#include <string.h> /* NOTE(review): header name inferred from the str*() calls -- confirm */
+#include "hilex.h"
+
+enum { Cap = 64 }; /* capacity of the start-condition stack */
+static int len = 1; /* stack[0] is the zero-initialized bottom entry and is never popped */
+static int stack[Cap];
+static int push(int val) { /* records val if there is room; returns val either way */
+	if (len < Cap) stack[len++] = val;
+	return val;
+}
+static int pop(void) { /* drops the top entry unless it is the last one; returns the new top */
+	if (len > 1) len--;
+	return stack[len-1];
+}
+%}
+
+%s First
+%s Param Command Arith Backtick
+%x DQuote HereDocDel HereDoc HereDocLit
+
+word [[:alnum:]_.-]+
+param [^:=?+%#{}-]+
+reserved [!{}]|else|do|elif|for|done|fi|then|until|while|if|case|esac
+
+%%
+	static char *delimiter; /* heap copy of the here-document delimiter, freed when its end line matches */
+
+[[:blank:]]+ { return Normal; }
+
+"\\". { return Escape; }
+
+<INITIAL,First,DQuote,HereDoc,Param,Command,Arith>{
+	"$"[*@#?$!0-9-] |
+	"$"[_[:alpha:]][_[:alnum:]]* |
+	"${"[#]?{param}"}" {
+		return Subst; /* a name needs a leading letter or underscore, so a lone dollar sign is not a Subst */
+	}
+	"${"{param} {
+		BEGIN(push(Param)); /* no closing brace matched yet: the Param closer rule below pops */
+		return Subst;
+	}
+	"$(" {
+		BEGIN(push(Command));
+		return Subst;
+	}
+	"$((" {
+		BEGIN(push(Arith));
+		return Subst;
+	}
+	"`" {
+		BEGIN(push(Backtick)); /* NOTE(review): Backtick is assumed absent from this block's start conditions so a backquote closes there -- confirm */
+		return Subst;
+	}
+}
+<Param>"}" |
+<Command>")" |
+<Arith>"))" |
+<Backtick>"`" {
+	BEGIN(pop()); /* closing token of the substitution state currently on top of the stack */
+	return Subst;
+}
+
+[&();|]|"&&"|";;"|"||" {
+	BEGIN(push(First)); /* First: the next word may be a reserved word */
+	return Operator;
+}
+[0-9]?([<>]"&"?|">|"|">>"|"<>") {
+	return Operator;
+}
+
+^{reserved} { return Keyword; }
+<First>{
+	[[:blank:]]+ { return Normal; }
+	{reserved} {
+		BEGIN(pop());
+		return Keyword;
+	}
+	{word} {
+		BEGIN(pop()); /* an ordinary word ends the First state as well */
+		return Normal;
+	}
+}
+
+{word}/[[:blank:]]*"()" { return Ident; }
+
+[0-9]?("<<"|"<<-") {
+	BEGIN(push(HereDocDel)); /* the delimiter word is expected next */
+	return Operator;
+}
+<HereDocDel>{
+	[[:blank:]]+ { return Normal; }
+	{word} {
+		delimiter = strdup(yytext);
+		assert(delimiter);
+		BEGIN(pop(), push(HereDoc)); /* replace HereDocDel with the body state */
+		return Ident;
+	}
+	"'"{word}"'" {
+		delimiter = strndup(&yytext[1], strlen(yytext)-2); /* copy without the surrounding quotes */
+		assert(delimiter);
+		BEGIN(pop(), push(HereDocLit)); /* quoted delimiter: body state with no substitution rules */
+		return Ident;
+	}
+}
+<HereDoc,HereDocLit>{
+	^"\t"*{word} {
+		if (strcmp(&yytext[strspn(yytext, "\t")], delimiter)) REJECT; /* some other word: fall back to the body rules */
+		free(delimiter);
+		BEGIN(pop());
+		return Ident;
+	}
+}
+<HereDoc>{
+	[^$`\n]+ { return String; }
+	.|\n { return String; }
+}
+<HereDocLit>{
+	.+|\n { /* text and newline match separately so a delimiter line ties with the rule above, which is listed first */ return String; }
+}
+
+"'"[^'']*"'" { return String; }
+
+"\""/[^$`\\] {
+	BEGIN(push(DQuote));
+	yymore(); /* keep the quote: the next match is appended to it in yytext */
+}
+"\"" {
+	BEGIN(push(DQuote));
+	return String;
+}
+
+<DQuote>{
+	[^\\$`""]*"\"" {
+		BEGIN(pop());
+		return String;
+	}
+	"\\"[$`""\\\n] { return Escape; }
+	[^\\$`""]+|. { return String; }
+}
+
+"#".* { return Comment; }
+
+{word} { return Normal; }
+
+.|\n { return Normal; }
+
+%{
+	(void)yyunput; /* reference the generated helpers, presumably to avoid unused-function warnings */
+	(void)input;
+%}
+
+%%
+
+const struct Lexer LexSh = { yylex, &yyin, &yytext };
-- 
cgit 1.4.1