summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--bin/.gitignore1
-rw-r--r--bin/Makefile1
-rw-r--r--bin/README.74
-rw-r--r--bin/dehtml.l148
-rw-r--r--bin/man1/dehtml.135
5 files changed, 188 insertions, 1 deletions
diff --git a/bin/.gitignore b/bin/.gitignore
index be0a8004..d065f60e 100644
--- a/bin/.gitignore
+++ b/bin/.gitignore
@@ -6,6 +6,7 @@ bit
 bri
 c
 config.mk
+dehtml
 dtch
 ever
 fbatt
diff --git a/bin/Makefile b/bin/Makefile
index 14ad94eb..b3c8adea 100644
--- a/bin/Makefile
+++ b/bin/Makefile
@@ -11,6 +11,7 @@ BINS += beef
 BINS += bibsort
 BINS += bit
 BINS += c
+BINS += dehtml
 BINS += dtch
 BINS += glitch
 BINS += hilex
diff --git a/bin/README.7 b/bin/README.7
index 0ff46eab..441bf693 100644
--- a/bin/README.7
+++ b/bin/README.7
@@ -1,4 +1,4 @@
-.Dd April 26, 2021
+.Dd September  7, 2021
 .Dt BIN 7
 .Os "Causal Agency"
 .
@@ -26,6 +26,8 @@ calculator
 backlight brightness control
 .It Xr c 1
 run C statements
+.It Xr dehtml 1
+extract text from HTML
 .It Xr dtch 1
 detached sessions
 .It Xr ever 1
diff --git a/bin/dehtml.l b/bin/dehtml.l
new file mode 100644
index 00000000..f585b701
--- /dev/null
+++ b/bin/dehtml.l
@@ -0,0 +1,148 @@
+/* Copyright (C) 2021  C. McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+%option noyywrap
+
+%{
+enum Token { // kinds returned by the scanner rules; none is 0, the value that ends main's yylex() loop
+	Doctype = 1, // "<!DOCTYPE ...>"
+	Comment, // "<!-- ... -->"
+	TagOpen, // "<...>"
+	TagClose, // "</...>"
+	Entity, // "&...;"
+	Text, // run of characters other than '<', '&' and whitespace
+	Space, // run of whitespace
+};
+%}
+
+%%
+
+"<!DOCTYPE "[^>]*">" { /* document type declaration */ return Doctype; }
+"<!--"([^-]|-[^-]|--[^>])*"-->" { /* HTML comment */ return Comment; }
+"</"[^>]*">" { /* closing tag; the next rule matches the same text, this one is listed first */ return TagClose; }
+"<"[^>]*">" { /* any other "<...>" */ return TagOpen; }
+"&"[^;]*";" { /* '&' through the next ';' */ return Entity; }
+[^<&[:space:]]+ { /* run of ordinary characters */ return Text; }
+[[:space:]]+ { /* run of whitespace */ return Space; }
+
+%%
+
+#include <err.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <wchar.h>
+
+static const struct { // named entities that entity() can resolve
+	wchar_t ch; // character printed in place of the entity
+	const char *name; // full entity text, '&' and ';' included, as compared against yytext
+} Entities[] = {
+	{ L'&', "&amp;" },
+	{ L'<', "&lt;" },
+	{ L'>', "&gt;" },
+	{ L'"', "&quot;" },
+	{ L' ', "&nbsp;" }, // NOTE(review): confirm whether this literal is U+00A0 or an ASCII space
+	{ L'\u2014', "&mdash;" },
+	{ L'\u00A9', "&copy;" },
+};
+
+static void entity(void) { // print the character named by the entity token in yytext
+	wchar_t ch = 0; // stays 0 when the entity cannot be resolved
+	if (yytext[1] == '#') {
+		if (yytext[2] == 'x' || yytext[2] == 'X') { // HTML allows both "&#x" and "&#X"
+			ch = strtoul(&yytext[3], NULL, 16);
+		} else {
+			ch = strtoul(&yytext[2], NULL, 10);
+		}
+	} else {
+		for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
+			if (strcmp(Entities[i].name, yytext)) continue; // not this entry
+			ch = Entities[i].ch;
+			break;
+		}
+	}
+	if (ch) {
+		printf("%lc", (wint_t)ch);
+	} else { // unresolved: warn on stderr and pass the entity text through unchanged
+		warnx("unknown entity %s", yytext);
+		printf("%s", yytext);
+	}
+}
+
+static bool isTag(const char *tag) { // is the tag token in yytext an open or close tag named tag?
+	const char *ptr = &yytext[1]; // skip '<'
+	if (*ptr == '/') ptr++; // and the '/' of a closing tag
+	size_t len = strlen(tag);
+	if (strncasecmp(ptr, tag, len)) return false; // names compare ignoring case
+	ptr += len;
+	return *ptr && strchr(" \t\n\f\r/>", *ptr); // name must end here: whitespace, '/' or '>'
+}
+
+int main(int argc, char *argv[]) { // dehtml [-s] [file ...]: print the text of each HTML input
+	(void)input; // names the unused scanner helpers; presumably silences unused-function warnings
+	(void)yyunput;
+	setlocale(LC_CTYPE, ""); // entity() prints wide characters with %lc
+
+	bool collapse = false; // set by -s
+	for (int opt; 0 < (opt = getopt(argc, argv, "s"));) {
+		switch (opt) {
+			break; case 's': collapse = true;
+			break; default:  return EX_USAGE;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (!argc) argc++; // no operands: run one pass, where the NULL argv[0] selects stdin
+	for (int i = 0; i < argc; ++i) {
+		yyin = (argv[i] ? fopen(argv[i], "r") : stdin);
+		if (!yyin) err(EX_NOINPUT, "%s", argv[i]);
+		bool space = true; // previous printed token was whitespace; starts true to drop leading space under -s
+		bool discard = false; // inside <title>, <style> or <script>
+		bool pre = false; // inside <pre>
+		for (enum Token tok; (tok = yylex());) {
+			if (tok == TagOpen || tok == TagClose) {
+				if (isTag("title") || isTag("style") || isTag("script")) {
+					discard = (tok == TagOpen); // on at the open tag, off at the close tag
+				} else if (isTag("pre")) {
+					pre = (tok == TagOpen);
+				}
+			} else if (discard) {
+				continue;
+			} else if (tok == Entity) {
+				entity();
+				space = false;
+			} else if (tok == Text) {
+				printf("%s", yytext);
+				space = false;
+			} else if (tok == Space) {
+				if (collapse && !pre) {
+					if (space) continue; // whitespace was already printed
+					printf("%c", yytext[0]); // a whitespace run becomes its first character
+				} else {
+					printf("%s", yytext);
+				}
+				space = true;
+			}
+		}
+		if (yyin != stdin) fclose(yyin); // do not keep every operand open until exit
+	}
+}
diff --git a/bin/man1/dehtml.1 b/bin/man1/dehtml.1
new file mode 100644
index 00000000..a0c5a8c4
--- /dev/null
+++ b/bin/man1/dehtml.1
@@ -0,0 +1,35 @@
+.Dd September  7, 2021
+.Dt DEHTML 1
+.Os
+.
+.Sh NAME
+.Nm dehtml
+.Nd extract text from HTML
+.
+.Sh SYNOPSIS
+.Nm
+.Op Fl s
+.Op Ar
+.
+.Sh DESCRIPTION
+The
+.Nm
+utility extracts text
+from HTML documents.
+Text inside
+.Sy <title> ,
+.Sy <style>
+and
+.Sy <script>
+tags is discarded.
+Numeric and common named HTML entities
+are converted.
+.
+.Pp
+The arguments are as follows:
+.Bl -tag -width Ds
+.It Fl s
+Collapse whitespace outside of
+.Sy <pre>
+tags.
+.El