From a64fd47f6196f769f19a205885a8ca5a4a0388c5 Mon Sep 17 00:00:00 2001
From: June McEnroe <june@causal.agency>
Date: Tue, 7 Sep 2021 16:53:43 -0400
Subject: Add dehtml

---
Note: this copy of the patch was re-flowed from a rendering in which every
newline had been collapsed and anything resembling HTML markup had been
removed.  Angle-bracket addresses, #include operands, the scanner patterns
beginning with "<!" and "</", the entity names and <title> were rebuilt from
context; compare against the repository before applying.

 bin/.gitignore    |   1 +
 bin/Makefile      |   1 +
 bin/README.7      |   4 +-
 bin/dehtml.l      | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 bin/man1/dehtml.1 |  35 +++++++++++++
 5 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 bin/dehtml.l
 create mode 100644 bin/man1/dehtml.1

diff --git a/bin/.gitignore b/bin/.gitignore
index be0a8004..d065f60e 100644
--- a/bin/.gitignore
+++ b/bin/.gitignore
@@ -6,6 +6,7 @@ bit
 bri
 c
 config.mk
+dehtml
 dtch
 ever
 fbatt
diff --git a/bin/Makefile b/bin/Makefile
index 14ad94eb..b3c8adea 100644
--- a/bin/Makefile
+++ b/bin/Makefile
@@ -11,6 +11,7 @@ BINS += beef
 BINS += bibsort
 BINS += bit
 BINS += c
+BINS += dehtml
 BINS += dtch
 BINS += glitch
 BINS += hilex
diff --git a/bin/README.7 b/bin/README.7
index 0ff46eab..441bf693 100644
--- a/bin/README.7
+++ b/bin/README.7
@@ -1,4 +1,4 @@
-.Dd April 26, 2021
+.Dd September 7, 2021
 .Dt BIN 7
 .Os "Causal Agency"
 .
@@ -26,6 +26,8 @@ calculator
 backlight brightness control
 .It Xr c 1
 run C statements
+.It Xr dehtml 1
+extract text from HTML
 .It Xr dtch 1
 detached sessions
 .It Xr ever 1
diff --git a/bin/dehtml.l b/bin/dehtml.l
new file mode 100644
index 00000000..7793cdbc
--- /dev/null
+++ b/bin/dehtml.l
@@ -0,0 +1,148 @@
+/* Copyright (C) 2021  June McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+%option noyywrap
+
+%{
+enum Token {
+	Doctype = 1,
+	Comment,
+	TagOpen,
+	TagClose,
+	Entity,
+	Text,
+	Space,
+};
+%}
+
+%%
+
+"<!DOCTYPE "[^>]*">" { return Doctype; }
+"<!--"([^-]|-[^-]|--[^>])*"-->" { return Comment; }
+"</"[^>]*">" { return TagClose; }
+"<"[^>]*">" { return TagOpen; }
+"&"[^;]*";" { return Entity; }
+[^<&[:space:]]+ { return Text; }
+[[:space:]]+ { return Space; }
+
+%%
+
+#include <err.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <wchar.h>
+
+static const struct {
+	wchar_t ch;
+	const char *name;
+} Entities[] = {
+	{ L'&', "&amp;" },
+	{ L'<', "&lt;" },
+	{ L'>', "&gt;" },
+	{ L'"', "&quot;" },
+	{ L' ', "&nbsp;" },
+	{ L'\u2014', "&mdash;" },
+	{ L'\u00A9', "&copy;" },
+};
+
+static void entity(void) {
+	wchar_t ch = 0;
+	if (yytext[1] == '#') {
+		if (yytext[2] == 'x') {
+			ch = strtoul(&yytext[3], NULL, 16);
+		} else {
+			ch = strtoul(&yytext[2], NULL, 10);
+		}
+	} else {
+		for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
+			if (strcmp(Entities[i].name, yytext)) continue;
+			ch = Entities[i].ch;
+			break;
+		}
+	}
+	if (ch) {
+		printf("%lc", (wint_t)ch);
+	} else {
+		warnx("unknown entity %s", yytext);
+		printf("%s", yytext);
+	}
+}
+
+static bool isTag(const char *tag) {
+	const char *ptr = &yytext[1];
+	if (*ptr == '/') ptr++;
+	size_t len = strlen(tag);
+	if (strncasecmp(ptr, tag, len)) return false;
+	ptr += len;
+	return *ptr == ' ' || *ptr == '>';
+}
+
+int main(int argc, char *argv[]) {
+	(void)input;
+	(void)yyunput;
+	setlocale(LC_CTYPE, "");
+
+	bool collapse = 0;
+	for (int opt; 0 < (opt = getopt(argc, argv, "s"));) {
+		switch (opt) {
+			break; case 's': collapse = true;
+			break; default: return EX_USAGE;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (!argc) argc++;
+	for (int i = 0; i < argc; ++i) {
+		yyin = (argv[i] ? fopen(argv[i], "r") : stdin);
+		if (!yyin) err(EX_NOINPUT, "%s", argv[i]);
+
+		bool space = true;
+		bool discard = false;
+		bool pre = false;
+		for (enum Token tok; (tok = yylex());) {
+			if (tok == TagOpen || tok == TagClose) {
+				if (isTag("title") || isTag("style") || isTag("script")) {
+					discard = (tok == TagOpen);
+				} else if (isTag("pre")) {
+					pre = (tok == TagOpen);
+				}
+			} else if (discard) {
+				continue;
+			} else if (tok == Entity) {
+				entity();
+				space = false;
+			} else if (tok == Text) {
+				printf("%s", yytext);
+				space = false;
+			} else if (tok == Space) {
+				if (collapse && !pre) {
+					if (space) continue;
+					printf("%c", yytext[0]);
+				} else {
+					printf("%s", yytext);
+				}
+				space = true;
+			}
+		}
+	}
+}
diff --git a/bin/man1/dehtml.1 b/bin/man1/dehtml.1
new file mode 100644
index 00000000..a0c5a8c4
--- /dev/null
+++ b/bin/man1/dehtml.1
@@ -0,0 +1,35 @@
+.Dd September 7, 2021
+.Dt DEHTML 1
+.Os
+.
+.Sh NAME
+.Nm dehtml
+.Nd extract text from HTML
+.
+.Sh SYNOPSIS
+.Nm
+.Op Fl s
+.Op Ar
+.
+.Sh DESCRIPTION
+The
+.Nm
+utility extracts text
+from HTML documents.
+Text inside
+.Sy <title> ,
+.Sy <style>
+and
+.Sy <script>
+tags is discarded.
+Numeric and common named HTML entities
+are converted.
+.
+.Pp
+The arguments are as follows:
+.Bl -tag -width Ds
+.It Fl s
+Collapse whitespace outside of
+.Sy <pre>
+tags.
+.El
-- 
cgit 1.4.1