From f5235d92eeb7477fa459b7f5665873c0c23452ff Mon Sep 17 00:00:00 2001
From: "C. McEnroe"
Date: Tue, 7 Sep 2021 16:53:43 -0400
Subject: Add dehtml

---
 bin/dehtml.l | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 bin/dehtml.l

(limited to 'bin/dehtml.l')

diff --git a/bin/dehtml.l b/bin/dehtml.l
new file mode 100644
index 00000000..f585b701
--- /dev/null
+++ b/bin/dehtml.l
@@ -0,0 +1,148 @@
+/* Copyright (C) 2021 C. McEnroe
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+%option noyywrap
+
+%{
+/* Token kinds returned by yylex(). Numbering starts at 1 because
+ * yylex() returns 0 at end of input, which ends the loop in main(). */
+enum Token {
+	Doctype = 1,
+	Comment,
+	TagOpen,
+	TagClose,
+	Entity,
+	Text,
+	Space,
+};
+%}
+
+/* Rule order matters: among matches of equal length lex picks the
+ * earliest rule, so the doctype, comment and closing-tag patterns have
+ * to precede the generic tag pattern.
+ *
+ * NOTE(review): the doctype, comment and closing-tag patterns were
+ * damaged in the copy of this patch under review (text between angle
+ * brackets had been stripped). They are reconstructed here; confirm
+ * them against the upstream file.
+ */
+
+%%
+
+"<!DOCTYPE "[^>]*">"	{ return Doctype; }
+"<!--"([^-]|-[^-]|--[^>])*"-->"	{ return Comment; }
+"</"[^>]*">"	{ return TagClose; }
+"<"[^>]*">"	{ return TagOpen; }
+"&"[^;]*";"	{ return Entity; }
+[^<&[:space:]]+	{ return Text; }
+[[:space:]]+	{ return Space; }
+
+%%
+
+#include <err.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <wchar.h>
+
+/* Named character references understood by entity(). Each name is the
+ * full source spelling, including the leading '&' and trailing ';',
+ * because it is compared against the whole Entity token in yytext. */
+static const struct {
+	wchar_t ch;
+	const char *name;
+} Entities[] = {
+	{ L'&', "&amp;" },
+	{ L'<', "&lt;" },
+	{ L'>', "&gt;" },
+	{ L'"', "&quot;" },
+	{ L' ', "&nbsp;" },
+	{ L'\u2014', "&mdash;" },
+	{ L'\u00A9', "&copy;" },
+};
+
+/* Print the character denoted by the Entity token currently in yytext.
+ *
+ * Numeric references ("&#NNN;" decimal, "&#xHHH;" or "&#XHHH;"
+ * hexadecimal) are converted with strtoul(); anything else is looked up
+ * in Entities. If no non-zero character results, a warning is issued
+ * with warnx() and the token is echoed unchanged. */
+static void entity(void) {
+	wchar_t ch = 0;
+	if (yytext[1] == '#') {
+		// HTML accepts either case for the hexadecimal marker.
+		if (yytext[2] == 'x' || yytext[2] == 'X') {
+			ch = strtoul(&yytext[3], NULL, 16);
+		} else {
+			ch = strtoul(&yytext[2], NULL, 10);
+		}
+	} else {
+		for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
+			if (strcmp(Entities[i].name, yytext) != 0) continue;
+			ch = Entities[i].ch;
+			break;
+		}
+	}
+	if (ch) {
+		printf("%lc", (wint_t)ch);
+	} else {
+		warnx("unknown entity %s", yytext);
+		printf("%s", yytext);
+	}
+}
+
+/* Report whether the tag token in yytext is named `tag`.
+ *
+ * The comparison ignores case and skips an optional '/' after the
+ * opening '<'. The name must be followed by whitespace, '/' or '>', so
+ * that "pre" does not match a longer name such as "prefix". */
+static bool isTag(const char *tag) {
+	const char *ptr = &yytext[1];
+	if (*ptr == '/') ptr++;
+	size_t len = strlen(tag);
+	if (strncasecmp(ptr, tag, len) != 0) return false;
+	ptr += len;
+	// Attributes may begin on a following line, so any whitespace ends
+	// the name. The NUL test keeps strchr() from matching its terminator.
+	return *ptr != '\0' && strchr(" \t\n\r\f/>", *ptr) != NULL;
+}
+
+/* Strip markup from each file operand (or standard input when there are
+ * none) and write the remaining text to standard output.
+ *
+ * Options:
+ *   -s  collapse each run of whitespace to its first character, except
+ *       inside <pre> elements.
+ *
+ * The contents of <title>, <style> and <script> elements are dropped.
+ * Exits with EX_USAGE on an unknown option and EX_NOINPUT when a file
+ * cannot be opened. */
+int main(int argc, char *argv[]) {
+	// Reference flex's static helpers, which are otherwise unused here
+	// (presumably to avoid unused-function warnings).
+	(void)input;
+	(void)yyunput;
+	setlocale(LC_CTYPE, "");
+
+	bool collapse = false;
+	for (int opt; 0 < (opt = getopt(argc, argv, "s"));) {
+		switch (opt) {
+		case 's':
+			collapse = true;
+			break;
+		default:
+			return EX_USAGE;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	// With no operands the loop still runs once; argv[0] is then the
+	// terminating NULL pointer, which selects standard input below.
+	if (!argc) argc++;
+	for (int i = 0; i < argc; ++i) {
+		yyin = (argv[i] ? fopen(argv[i], "r") : stdin);
+		if (!yyin) err(EX_NOINPUT, "%s", argv[i]);
+
+		// space:   the last thing written was whitespace; starts true so
+		//          leading whitespace is dropped when collapsing.
+		// discard: inside <title>, <style> or <script>.
+		// pre:     inside <pre>, where whitespace is kept verbatim.
+		bool space = true;
+		bool discard = false;
+		bool pre = false;
+		for (enum Token tok; (tok = yylex());) {
+			// Tags are examined even while discarding, so that the
+			// matching closing tag can end the discarded region.
+			if (tok == TagOpen || tok == TagClose) {
+				if (isTag("title") || isTag("style") || isTag("script")) {
+					discard = (tok == TagOpen);
+				} else if (isTag("pre")) {
+					pre = (tok == TagOpen);
+				}
+			} else if (discard) {
+				continue;
+			} else if (tok == Entity) {
+				entity();
+				space = false;
+			} else if (tok == Text) {
+				printf("%s", yytext);
+				space = false;
+			} else if (tok == Space) {
+				if (collapse && !pre) {
+					if (space) continue;
+					printf("%c", yytext[0]);
+				} else {
+					printf("%s", yytext);
+				}
+				space = true;
+			}
+		}
+
+		// Release each opened file before moving to the next operand.
+		if (yyin != stdin) fclose(yyin);
+	}
+	return EX_OK;
+}
-- 
cgit 1.4.1