From f5235d92eeb7477fa459b7f5665873c0c23452ff Mon Sep 17 00:00:00 2001 From: "C. McEnroe" Date: Tue, 7 Sep 2021 16:53:43 -0400 Subject: Add dehtml --- bin/.gitignore | 1 + bin/Makefile | 1 + bin/README.7 | 4 +- bin/dehtml.l | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bin/man1/dehtml.1 | 35 +++++++++++++ 5 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 bin/dehtml.l create mode 100644 bin/man1/dehtml.1 (limited to 'bin') diff --git a/bin/.gitignore b/bin/.gitignore index be0a8004..d065f60e 100644 --- a/bin/.gitignore +++ b/bin/.gitignore @@ -6,6 +6,7 @@ bit bri c config.mk +dehtml dtch ever fbatt diff --git a/bin/Makefile b/bin/Makefile index 14ad94eb..b3c8adea 100644 --- a/bin/Makefile +++ b/bin/Makefile @@ -11,6 +11,7 @@ BINS += beef BINS += bibsort BINS += bit BINS += c +BINS += dehtml BINS += dtch BINS += glitch BINS += hilex diff --git a/bin/README.7 b/bin/README.7 index 0ff46eab..441bf693 100644 --- a/bin/README.7 +++ b/bin/README.7 @@ -1,4 +1,4 @@ -.Dd April 26, 2021 +.Dd September 7, 2021 .Dt BIN 7 .Os "Causal Agency" . @@ -26,6 +26,8 @@ calculator backlight brightness control .It Xr c 1 run C statements +.It Xr dehtml 1 +extract text from HTML .It Xr dtch 1 detached sessions .It Xr ever 1 diff --git a/bin/dehtml.l b/bin/dehtml.l new file mode 100644 index 00000000..f585b701 --- /dev/null +++ b/bin/dehtml.l @@ -0,0 +1,148 @@ +/* Copyright (C) 2021 C. McEnroe + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +%option noyywrap + +%{ +enum Token { + Doctype = 1, + Comment, + TagOpen, + TagClose, + Entity, + Text, + Space, +}; +%} + +%% + +"]*">" { return Doctype; } +"" { return Comment; } +"]*">" { return TagClose; } +"<"[^>]*">" { return TagOpen; } +"&"[^;]*";" { return Entity; } +[^<&[:space:]]+ { return Text; } +[[:space:]]+ { return Space; } + +%% + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct { + wchar_t ch; + const char *name; +} Entities[] = { + { L'&', "&" }, + { L'<', "<" }, + { L'>', ">" }, + { L'"', """ }, + { L' ', " " }, + { L'\u2014', "—" }, + { L'\u00A9', "©" }, +}; + +static void entity(void) { + wchar_t ch = 0; + if (yytext[1] == '#') { + if (yytext[2] == 'x') { + ch = strtoul(&yytext[3], NULL, 16); + } else { + ch = strtoul(&yytext[2], NULL, 10); + } + } else { + for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) { + if (strcmp(Entities[i].name, yytext)) continue; + ch = Entities[i].ch; + break; + } + } + if (ch) { + printf("%lc", (wint_t)ch); + } else { + warnx("unknown entity %s", yytext); + printf("%s", yytext); + } +} + +static bool isTag(const char *tag) { + const char *ptr = &yytext[1]; + if (*ptr == '/') ptr++; + size_t len = strlen(tag); + if (strncasecmp(ptr, tag, len)) return false; + ptr += len; + return *ptr == ' ' || *ptr == '>'; +} + +int main(int argc, char *argv[]) { + (void)input; + (void)yyunput; + setlocale(LC_CTYPE, ""); + + bool collapse = 0; + for (int opt; 0 < (opt = getopt(argc, argv, "s"));) { + switch (opt) { + break; case 's': collapse = true; + break; default: return EX_USAGE; + } + } + argc -= optind; + argv += optind; + + if (!argc) argc++; + for (int i = 0; i < argc; ++i) { + yyin = (argv[i] ? fopen(argv[i], "r") : stdin); + if (!yyin) err(EX_NOINPUT, "%s", argv[i]); + + bool space = true; + bool discard = false; + bool pre = false; + for (enum Token tok; (tok = yylex());) { + if (tok == TagOpen || tok == TagClose) { + if (isTag("title") || isTag("style") || isTag("script")) { + discard = (tok == TagOpen); + } else if (isTag("pre")) { + pre = (tok == TagOpen); + } + } else if (discard) { + continue; + } else if (tok == Entity) { + entity(); + space = false; + } else if (tok == Text) { + printf("%s", yytext); + space = false; + } else if (tok == Space) { + if (collapse && !pre) { + if (space) continue; + printf("%c", yytext[0]); + } else { + printf("%s", yytext); + } + space = true; + } + } + } +} diff --git a/bin/man1/dehtml.1 b/bin/man1/dehtml.1 new file mode 100644 index 00000000..a0c5a8c4 --- /dev/null +++ b/bin/man1/dehtml.1 @@ -0,0 +1,35 @@ +.Dd September 7, 2021 +.Dt DEHTML 1 +.Os +. +.Sh NAME +.Nm dehtml +.Nd extract text from HTML +. +.Sh SYNOPSIS +.Nm +.Op Fl s +.Op Ar +. +.Sh DESCRIPTION +The +.Nm +utility extracts text +from HTML documents. +Text inside +.Sy , +.Sy <style> +and +.Sy <script> +tags is discarded. +Numeric and common named HTML entities +are converted. +. +.Pp +The arguments are as follows: +.Bl -tag -width Ds +.It Fl s +Collapse whitespace outside of +.Sy <pre> +tags. +.El -- cgit 1.4.1 5</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=e7f73d66da51be79776c9c28ed4738a5635a1c8a&follow=1'>Error on invalid ISUPPORT values</a><span class='decoration'> <a class='tag-annotated-deco' href='/litterbox/tag/?h=1.2'>1.2</a></span></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:51:30 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/configure?h=1.5&id=b325846f8d12fc00275863c63a153064b0a2b4fd&follow=1'>Only set RCS on FreeBSD</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:42:45 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=b9afd7f8f8bbb799ec3e2f6c18edca1593c2c61b&follow=1'>Log bans and unbans</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:27:51 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=8a4a9999baa28b6ace84fc1a5be69f8c71f3f88e&follow=1'>Parse mode types from ISUPPORT</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:13:48 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/unscoop.c?h=1.5&id=5fce836036bc60d250b73b8883f02184435fb2c8&follow=1'>Add unscoop matchers for ban/unban events</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:08:50 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/Makefile?h=1.5&id=28a7c819fedc7188a557b450e5e7b692a9fa991e&follow=1'>Check unscoop regexps with make test</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 11:02:41 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/unscoop.c?h=1.5&id=5cd7060e8d9e1f18924c5f19e2be715a9ee7ff98&follow=1'>Add unscoop -n flag for checking regexps</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-05 10:45:17 -0400'>2020-04-05</span></td><td><a href='/litterbox/commit/scoop.c?h=1.5&id=a8c5d13cb21ef08f7d2b5a718b5c93113dfc6448&follow=1'>Add Ban and Unban event types</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-04-02 16:29:36 -0400'>2020-04-02</span></td><td><a href='/litterbox/commit/unscoop.c?h=1.5&id=d156d07f44536bca14c847f7f5313c956a625f96&follow=1'>Update style</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-03-31 18:55:09 -0400'>2020-03-31</span></td><td><a href='/litterbox/commit/unscoop.c?h=1.5&id=a1a944e22b6806cbba755176990d7f9d88c6cec4&follow=1'>Update unscoop catgirl matchers</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-03-31 18:54:37 -0400'>2020-03-31</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=5e3578a00f651d9ed04f6b9beda305408d6c09ce&follow=1'>Fix writing verbose to stderr</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-03-02 18:55:35 -0500'>2020-03-02</span></td><td><a href='/litterbox/commit/scoop.c?h=1.5&id=34daf56d67522d13375b1dca80d313955b85e7c2&follow=1'>Include <>/-/* around nicks in scoop coloring</a><span class='decoration'> <a class='tag-annotated-deco' href='/litterbox/tag/?h=1.1'>1.1</a></span></td><td>June McEnroe</td></tr> <tr><td><span title='2020-03-02 18:46:02 -0500'>2020-03-02</span></td><td><a href='/litterbox/commit/scoop.c?h=1.5&id=834fb3a3615c7546af0f35e8f0a0b31d923d7585&follow=1'>Replace .mk files with configure script</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-02-28 00:13:42 -0500'>2020-02-28</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=c0fafbc887a147ee77278a5bfd852e171aeb471c&follow=1'>Implement the causal.agency/consumer capability</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-02-22 00:51:04 -0500'>2020-02-22</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=08456da07fadec93973eb28d059d9382149f3d75&follow=1'>Include <>/-/* around nicks in coloring</a></td><td>June McEnroe</td></tr> <tr><td><span title='2020-02-22 00:49:04 -0500'>2020-02-22</span></td><td><a href='/litterbox/commit/litterbox.c?h=1.5&id=3789fea374c475c1b465281b853665f92c6e91f9&follow=1'>Use (almost) the full range of IRC colors for nicks</a></td><td>June McEnroe