From 8dddcdeb41e01736b14f415a0adf60819ed8594f Mon Sep 17 00:00:00 2001 From: June McEnroe Date: Thu, 5 Sep 2019 01:49:01 -0400 Subject: Add title --- bin/.gitignore | 1 + bin/Makefile | 9 +++- bin/README | 3 +- bin/bin.7 | 4 +- bin/man1/title.1 | 34 ++++++++++++++ bin/title.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 bin/man1/title.1 create mode 100644 bin/title.c diff --git a/bin/.gitignore b/bin/.gitignore index b31a9d08..78e94de3 100644 --- a/bin/.gitignore +++ b/bin/.gitignore @@ -30,6 +30,7 @@ scheme.png setopt shotty tags +title ttpre up when diff --git a/bin/Makefile b/bin/Makefile index 3eb71c62..1b038ea6 100644 --- a/bin/Makefile +++ b/bin/Makefile @@ -44,9 +44,11 @@ BINS_LINUX += fbatt BINS_LINUX += fbclock BINS_LINUX += psfed +BINS_CURL += title + BINS_TLS += relay -BINS = ${BINS_ANY} ${BINS_BSD} ${BINS_LINUX} ${BINS_TLS} +BINS = ${BINS_ANY} ${BINS_BSD} ${BINS_LINUX} ${BINS_CURL} ${BINS_TLS} MANS = ${BINS:%=man1/%.1} LINKS = ${LINKS_ANY} @@ -58,6 +60,8 @@ bsd: meta ${BINS_BSD} linux: meta ${BINS_LINUX} +curl: meta ${BINS_CURL} + tls: meta ${BINS_TLS} meta: tags .gitignore README @@ -74,6 +78,9 @@ open pbcopy pbpaste: pbd relay: relay.c ${CC} ${CFLAGS_tls} ${LDFLAGS_tls} $@.c ${LDLIBS_tls} -o $@ +title: title.c + ${CC} ${CFLAGS} ${LDFLAGS} $@.c ${LDLIBS} -lcurl -o $@ + # Headers fbatt.o fbclock.o: scheme.h diff --git a/bin/README b/bin/README index c534e7dc..bfbfa203 100644 --- a/bin/README +++ b/bin/README @@ -30,6 +30,7 @@ DESCRIPTION scheme(1) color scheme setopt(1) quoting getopt shotty(1) terminal capture + title(1) page titles ttpre(1) man output to HTML up(1) upload file when(1) date calculator @@ -39,4 +40,4 @@ DESCRIPTION png(3) basic PNG output -Causal Agency August 29, 2019 Causal Agency +Causal Agency September 3, 2019 Causal Agency diff --git a/bin/bin.7 b/bin/bin.7 index 4d2d5ac7..98717614 100644 --- a/bin/bin.7 +++ b/bin/bin.7 @@ -1,4 +1,4 @@ -.Dd August 29, 2019 +.Dd September 3, 2019 .Dt BIN 7 .Os "Causal Agency" . @@ -62,6 +62,8 @@ color scheme quoting getopt .It Xr shotty 1 terminal capture +.It Xr title 1 +page titles .It Xr ttpre 1 man output to HTML .It Xr up 1 diff --git a/bin/man1/title.1 b/bin/man1/title.1 new file mode 100644 index 00000000..927163fa --- /dev/null +++ b/bin/man1/title.1 @@ -0,0 +1,34 @@ +.Dd September 5, 2019 +.Dt TITLE 1 +.Os +. +.Sh NAME +.Nm title +.Nd page titles +. +.Sh SYNOPSIS +.Nm +.Op Ar url +. +.Sh DESCRIPTION +.Nm +fetches HTML page titles +over HTTP and HTTPS. +.Nm +scans standard input for URLs +and writes their titles to standard output. +If a +.Ar url +argument is given, +.Nm +exits after fetching its title. +. +.Sh EXAMPLES +.Bd -literal -offset indent +mkfifo snarf titles +relay irc.example.org 6697 snarf '#example' <>titles >snarf +title titles +.Ed +. +.Sh SEE ALSO +.Xr relay 1 diff --git a/bin/title.c b/bin/title.c new file mode 100644 index 00000000..ed9eeccc --- /dev/null +++ b/bin/title.c @@ -0,0 +1,138 @@ +/* Copyright (C) 2019 June McEnroe + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static regex_t regex(const char *pattern) { + regex_t regex; + int error = regcomp(®ex, pattern, REG_EXTENDED); + if (!error) return regex; + char buf[256]; + regerror(error, ®ex, buf, sizeof(buf)); + errx(EX_SOFTWARE, "regcomp: %s: %s", buf, pattern); +} + +static CURL *curl; +static bool html; +static struct { + char buf[8192]; + size_t len; +} body; + +static size_t handleHeader(char *buf, size_t size, size_t nitems, void *user) { + (void)user; + size_t len = size * nitems; + const char ContentType[] = "Content-Type: text/html"; + if (sizeof(ContentType) - 1 < len) len = sizeof(ContentType) - 1; + if (!strncasecmp(buf, ContentType, len)) html = true; + return size * nitems; +} + +static size_t handleBody(char *buf, size_t size, size_t nitems, void *user) { + (void)user; + size_t len = size * nitems; + size_t cap = sizeof(body.buf) - body.len; + size_t cpy = (len < cap ? len : cap); + memcpy(&body.buf[body.len], buf, cpy); + body.len += cpy; + return len; +} + +static const char *TitlePattern = "([^<]*)"; +static regex_t TitleRegex; + +static bool getTitle(const char *url) { + CURLcode code = curl_easy_setopt(curl, CURLOPT_URL, url); + if (code) { + warnx("CURLOPT_URL: %s", curl_easy_strerror(code)); + return false; + } + + html = false; + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + if ((code = curl_easy_perform(curl))) { + warnx("curl_easy_perform: %s", curl_easy_strerror(code)); + return false; + } + if (!html) return false; + + body.len = 0; + curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + if ((code = curl_easy_perform(curl))) { + warnx("curl_easy_perform: %s", curl_easy_strerror(code)); + return false; + } + body.buf[body.len - 1] = '\0'; + + regmatch_t match[2]; + int error = regexec(&TitleRegex, body.buf, 2, match, 0); + if (error == REG_NOMATCH) return false; + if (error) errx(EX_SOFTWARE, "regexec: %d", error); + + body.buf[match[1].rm_eo] = '\0'; + char *title = &body.buf[match[1].rm_so]; + + printf("%s\n", title); + return true; +} + +int main(int argc, char *argv[]) { + TitleRegex = regex(TitlePattern); + + CURLcode code = curl_global_init(CURL_GLOBAL_ALL); + if (code) errx(EX_OSERR, "curl_global_init: %s", curl_easy_strerror(code)); + + curl = curl_easy_init(); + if (!curl) errx(EX_SOFTWARE, "curl_easy_init"); + + curl_easy_setopt(curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L); + + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, handleHeader); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, handleBody); + + setlinebuf(stdout); + + if (argc > 1) { + return (getTitle(argv[1]) ? EX_OK : EX_DATAERR); + } + + regex_t urlRegex = regex("https?://[^[:space:]>\"]+"); + + char *buf = NULL; + size_t cap = 0; + while (0 < getline(&buf, &cap, stdin)) { + regmatch_t match = {0}; + for (char *url = buf; *url; url += match.rm_eo) { + int error = regexec(&urlRegex, url, 1, &match, 0); + if (error == REG_NOMATCH) break; + if (error) errx(EX_SOFTWARE, "regexec: %d", error); + + url[match.rm_eo] = '\0'; + getTitle(&url[match.rm_so]); + url[match.rm_eo] = ' '; + } + } +} -- cgit 1.4.1