summary refs log tree commit diff
path: root/bin/hilex
diff options
context:
space:
mode:
author: June McEnroe <june@causal.agency> 2020-12-28 20:06:44 -0500
committer: June McEnroe <june@causal.agency> 2020-12-28 20:11:17 -0500
commit: ab54d9c9f162d07a62004bc0e9b958c345a03c07 (patch)
tree: 235c656a5673578e59985a21e78401e9ebda4304 /bin/hilex
parent: Add TOUR.7 (diff)
download: src-ab54d9c9f162d07a62004bc0e9b958c345a03c07.tar.gz
src-ab54d9c9f162d07a62004bc0e9b958c345a03c07.zip
Add initial version of hilex
hilex is meant to replace hi, based on lex rather than a mess of
overlapping regexps. I want to preserve hi's tagging abilities, but that
will require some amount of parsing/post-processing, which I'm not sure
how to approach yet.

Macro lexing for C still needs work, as I want to match strings and
comments inside macros.
Diffstat (limited to '')
-rw-r--r-- bin/hilex/.gitignore 2
-rw-r--r-- bin/hilex/Makefile 14
-rw-r--r-- bin/hilex/ansi.c 44
-rw-r--r-- bin/hilex/c.l 131
-rw-r--r-- bin/hilex/hilex.c 129
-rw-r--r-- bin/hilex/hilex.h 60
-rw-r--r-- bin/hilex/text.l 35
7 files changed, 415 insertions, 0 deletions
diff --git a/bin/hilex/.gitignore b/bin/hilex/.gitignore
new file mode 100644
index 00000000..f4c89460
--- /dev/null
+++ b/bin/hilex/.gitignore
@@ -0,0 +1,2 @@
+*.o
+hilex
diff --git a/bin/hilex/Makefile b/bin/hilex/Makefile
new file mode 100644
index 00000000..4a930757
--- /dev/null
+++ b/bin/hilex/Makefile
@@ -0,0 +1,14 @@
+CFLAGS += -std=c11 -Wall -Wextra -Wpedantic
+
+OBJS += ansi.o
+OBJS += c.o
+OBJS += hilex.o
+OBJS += text.o
+
+hilex: ${OBJS}
+	${CC} ${LDFLAGS} ${OBJS} ${LDLIBS} -o $@
+
+${OBJS}: hilex.h
+
+clean:
+	rm -f hilex ${OBJS}
diff --git a/bin/hilex/ansi.c b/bin/hilex/ansi.c
new file mode 100644
index 00000000..5ecd1f2a
--- /dev/null
+++ b/bin/hilex/ansi.c
@@ -0,0 +1,44 @@
+/* Copyright (C) 2020  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hilex.h"
+
+static const char *Color[ClassCap] = { // SGR parameters per class; NULL = unstyled
+	[Keyword] = "37", // white
+	[Macro] = "32", // green
+	[Comment] = "34", // blue
+	[String] = "36", // cyan
+	[StringFormat] = "36;1;96", // cyan, then bold + bright cyan where supported
+};
+
+// Print text in the color of its class, restarting the color on every line so
+// the output can be piped to less -R; classes without a color pass through.
+static void format(const char *opts[], enum Class class, const char *text) {
+	(void)opts;
+	const char *sgr = Color[class];
+	if (!sgr) { printf("%s", text); return; }
+	for (const char *eol; NULL != (eol = strchr(text, '\n')); text = eol + 1) {
+		int len = (int)(eol - text);
+		printf("\33[%sm%.*s\33[m\n", sgr, len, text);
+	}
+	if (text[0] != '\0') printf("\33[%sm%s\33[m", sgr, text);
+}
+
+const struct Formatter FormatANSI = { .format = format };
diff --git a/bin/hilex/c.l b/bin/hilex/c.l
new file mode 100644
index 00000000..159980e2
--- /dev/null
+++ b/bin/hilex/c.l
@@ -0,0 +1,131 @@
+/* Copyright (C) 2020  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+%option prefix="c11"
+%option noyywrap
+
+%{
+#include "hilex.h"
+%}
+
+%x MacroLine MacroInclude
+%x CharLiteral StringLiteral
+
+width "*"|[0-9]+
+
+%%
+
+[[:space:]]+ { return Normal; }
+
+([-+*/%&|^=!<>]|"<<"|">>")"="? |
+[=~.?:]|"++"|"--"|"&&"|"||"|"->" |
+sizeof|(_A|a)lignof {
+	return Operator; // sizeof, _Alignof and alignof are word-shaped operators
+}
+
+([1-9][0-9]*|"0"[0-7]*|"0x"[[:xdigit:]]+)([ulUL]{0,3}) |
+([0-9]+("."[0-9]*)?|"."[0-9]+)([eE][+-]?[0-9]+)?[flFL]? |
+"0x"[[:xdigit:]]*("."[[:xdigit:]]*)?([pP][+-]?[0-9]+)[flFL]? {
+	return Number; // decimal floats need a digit, so f or e5 stay identifiers
+}
+
+auto|break|case|const|continue|default|do|else|enum|extern|for|goto|if|inline |
+register|restrict|return|static|struct|switch|typedef|union|volatile|while |
+(_A|a)lignas|_Atomic|_Generic|(_N|n)oreturn|(_S|s)tatic_assert |
+(_T|t)hread_local {
+	return Keyword;
+}
+
+[_[:alpha:]][_[:alnum:]]* { return Identifier; }
+
+^"#" {
+	BEGIN(MacroLine);
+	return Macro;
+}
+^"#"[[:blank:]]*"include" {
+	BEGIN(MacroInclude);
+	return Macro;
+}
+<MacroLine,MacroInclude>{
+	"\n" {
+		BEGIN(0);
+		return Macro;
+	}
+	"\\\n" { return Macro; }
+	[^\\\n<"]+|. { return Macro; }
+}
+<MacroInclude>{
+	"<"[^>]+">" |
+	"\""[^"]+"\"" {
+		return String;
+	}
+}
+
+"//"([^\n]|"\\\n")* |
+"/*"([^*]|"*"+[^*/])*"*"+"/" {
+	return Comment; // block comments end at the first star-slash
+}
+
+[LUu]?"'" {
+	BEGIN(CharLiteral);
+	return String;
+}
+([LU]|u8?)?"\"" {
+	BEGIN(StringLiteral);
+	return String;
+}
+
+<CharLiteral,StringLiteral>{
+	"\\"['""?\\abfnrtv] |
+	"\\"([0-7]{1,3}) |
+	"\\x"([[:xdigit:]]{2}) |
+	"\\u"([[:xdigit:]]{4}) |
+	"\\U"([[:xdigit:]]{8}) {
+		return StringEscape;
+	}
+}
+<StringLiteral>{
+	"%%" |
+	"%"[ #+0-]*{width}?("."{width})?([Lhjltz]|hh|ll)?[AEFGXacdefginopsux] {
+		return StringFormat; // flags are space, #, +, 0 and - only
+	}
+}
+
+<CharLiteral>{
+	"'" {
+		BEGIN(0);
+		return String;
+	}
+	[^\\']+|. { return String; }
+}
+<StringLiteral>{
+	"\"" {
+		BEGIN(0);
+		return String;
+	}
+	[^%\\"]+|. { return String; }
+}
+
+. { return Normal; }
+
+%{
+	(void)yyunput;
+	(void)input;
+%}
+
+%%
+
+const struct Lexer LexC = { yylex, &yyin, &yytext };
diff --git a/bin/hilex/hilex.c b/bin/hilex/hilex.c
new file mode 100644
index 00000000..e973f0cd
--- /dev/null
+++ b/bin/hilex/hilex.c
@@ -0,0 +1,129 @@
+/* Copyright (C) 2020  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <err.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include "hilex.h"
+
+static const struct {
+	const struct Lexer *lexer;
+	const char *name; // exact name accepted by -l
+	const char *pattern; // POSIX extended regex tried against the file name
+} Lexers[] = {
+	{ &LexC, "c", "[.][chlmy]$" },
+	{ &LexText, "text", "[.]txt$" },
+};
+
+static const struct Lexer *parseLexer(const char *name) { // exact -l name lookup
+	size_t i = 0;
+	while (i < ARRAY_LEN(Lexers) && 0 != strcmp(name, Lexers[i].name)) i++;
+	if (i == ARRAY_LEN(Lexers)) errx(EX_USAGE, "unknown lexer %s", name);
+	return Lexers[i].lexer; // errx does not return, so i is in range here
+}
+
+// Pick the first lexer whose file name pattern matches name, or NULL if none.
+static const struct Lexer *matchLexer(const char *name) {
+	const struct Lexer *found = NULL;
+	for (size_t i = 0; !found && i < ARRAY_LEN(Lexers); ++i) {
+		regex_t regex;
+		int flags = REG_EXTENDED | REG_NOSUB;
+		int error = regcomp(&regex, Lexers[i].pattern, flags);
+		assert(!error); // the patterns are string literals from the table
+		if (!regexec(&regex, name, 0, NULL, 0)) found = Lexers[i].lexer;
+		regfree(&regex);
+	}
+	return found;
+}
+
+static const struct {
+	const struct Formatter *formatter;
+	const char *name; // exact name accepted by -f
+} Formatters[] = {
+	{ &FormatANSI, "ansi" }, // also the default when -f is not given
+	{ &FormatDebug, "debug" },
+};
+
+static const struct Formatter *parseFormatter(const char *name) { // -f lookup
+	size_t i = 0;
+	while (i < ARRAY_LEN(Formatters) && 0 != strcmp(name, Formatters[i].name)) i++;
+	if (i == ARRAY_LEN(Formatters)) errx(EX_USAGE, "unknown formatter %s", name);
+	return Formatters[i].formatter; // errx does not return, so i is in range here
+}
+
+static const char *ClassName[] = { // enum Class value -> its name as a string
+#define X(class) [class] = #class,
+	ENUM_CLASS
+#undef X
+};
+
+// Debug formatter: emit Class(text), requesting italics (SGR 3) before the text.
+static void debugFormat(const char *opts[], enum Class class, const char *text) {
+	printf("%s(\33[3m", ClassName[class]);
+	FormatANSI.format(opts, class, text);
+	fputs("\33[m)", stdout);
+}
+
+const struct Formatter FormatDebug = { .format = debugFormat };
+
+// usage: hilex [-t] [-f formatter] [-l lexer] [-n name] [file]
+// Lexes file (or stdin) and writes each token through the chosen formatter.
+int main(int argc, char *argv[]) {
+	bool text = false;
+	const char *name = NULL;
+	const struct Lexer *lexer = NULL;
+	const struct Formatter *formatter = &FormatANSI;
+
+	for (int opt; 0 < (opt = getopt(argc, argv, "f:l:n:t"));) {
+		switch (opt) {
+			break; case 'f': formatter = parseFormatter(optarg);
+			break; case 'l': lexer = parseLexer(optarg);
+			break; case 'n': name = optarg;
+			break; case 't': text = true;
+			// getopt has already reported the unknown or incomplete option.
+			break; default: return EX_USAGE;
+		}
+	}
+
+	const char *path = "(stdin)";
+	FILE *file = stdin;
+	if (optind < argc) {
+		path = argv[optind];
+		file = fopen(path, "r");
+		if (!file) err(EX_NOINPUT, "%s", path);
+	}
+
+	// Without -n, infer the lexer from the last component of the path.
+	const char *slash = strrchr(path, '/');
+	if (!name) name = (slash ? &slash[1] : path);
+	if (!lexer) lexer = matchLexer(name);
+	if (!lexer && text) lexer = &LexText; // -t: fall back to plain text
+	if (!lexer) errx(EX_USAGE, "cannot infer lexer for %s", name);
+
+	*lexer->in = file;
+	if (formatter->header) formatter->header(NULL);
+	for (enum Class class; None != (class = lexer->lex());) {
+		formatter->format(NULL, class, *lexer->text);
+	}
+	if (formatter->footer) formatter->footer(NULL);
+}
diff --git a/bin/hilex/hilex.h b/bin/hilex/hilex.h
new file mode 100644
index 00000000..63e5a43b
--- /dev/null
+++ b/bin/hilex/hilex.h
@@ -0,0 +1,60 @@
+/* Copyright (C) 2020  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+
+#define ARRAY_LEN(a) (sizeof(a) / sizeof((a)[0])) // a must be an array, not a pointer
+
+#define ENUM_CLASS \
+	X(None) \
+	X(Normal) \
+	X(Operator) \
+	X(Number) \
+	X(Keyword) \
+	X(Identifier) \
+	X(Macro) \
+	X(Comment) \
+	X(String) \
+	X(StringEscape) \
+	X(StringFormat)
+
+enum Class { // token classes, generated from ENUM_CLASS in declaration order
+#define X(class) class,
+	ENUM_CLASS
+#undef X
+	ClassCap, // one past the last class; used to size per-class tables
+};
+
+typedef int Lex(void); // returns the next token's enum Class; main stops on None
+struct Lexer {
+	Lex *lex; // scanner entry point
+	FILE **in; // address of the scanner's input stream; main stores the file here
+	char **text; // address of the scanner's pointer to the current token text
+};
+
+extern const struct Lexer LexC;
+extern const struct Lexer LexText;
+
+typedef void Header(const char *opts[]); // main currently passes NULL for opts
+typedef void Format(const char *opts[], enum Class class, const char *text);
+struct Formatter {
+	Header *header; // optional; called once before the first token if non-NULL
+	Format *format; // required; called once per token
+	Header *footer; // optional; called once after the last token if non-NULL
+};
+
+extern const struct Formatter FormatANSI;
+extern const struct Formatter FormatDebug;
diff --git a/bin/hilex/text.l b/bin/hilex/text.l
new file mode 100644
index 00000000..e3d64814
--- /dev/null
+++ b/bin/hilex/text.l
@@ -0,0 +1,35 @@
+/* Copyright (C) 2020  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+%option prefix="text"
+%option noyywrap
+
+%{
+#include "hilex.h"
+%}
+
+%%
+
+.*"\n"? { return Normal; }
+
+%{
+	(void)yyunput;
+	(void)input;
+%}
+
+%%
+
+const struct Lexer LexText = { yylex, &yyin, &yytext };