summary refs log tree commit diff
path: root/bin/title.c
diff options
context:
space:
mode:
Diffstat (limited to 'bin/title.c')
-rw-r--r--bin/title.c211
1 files changed, 211 insertions, 0 deletions
diff --git a/bin/title.c b/bin/title.c
new file mode 100644
index 00000000..47ff720a
--- /dev/null
+++ b/bin/title.c
@@ -0,0 +1,211 @@
+/* Copyright (C) 2019  June McEnroe <june@causal.agency>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <curl/curl.h>
+#include <err.h>
+#include <locale.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <wchar.h>
+
+static regex_t regex(const char *pattern, int flags) {
+	regex_t regex;
+	int error = regcomp(&regex, pattern, REG_EXTENDED | flags);
+	if (!error) return regex;
+
+	char buf[256];
+	regerror(error, &regex, buf, sizeof(buf));
+	errx(EX_SOFTWARE, "regcomp: %s: %s", buf, pattern);
+}
+
+static const struct Entity {
+	wchar_t ch;
+	const char *name;
+} Entities[] = {
+	{ L'"', "&quot;" },
+	{ L'&', "&amp;" },
+	{ L'<', "&lt;" },
+	{ L'>', "&gt;" },
+	{ L'␤', "&#10;" },
+};
+
+static wchar_t entity(const char *name) {
+	for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
+		struct Entity entity = Entities[i];
+		if (strncmp(name, entity.name, strlen(entity.name))) continue;
+		return entity.ch;
+	}
+	if (!strncmp(name, "&#x", 3)) return strtoul(&name[3], NULL, 16);
+	if (!strncmp(name, "&#", 2)) return strtoul(&name[2], NULL, 10);
+	return 0;
+}
+
+static const char EntityPattern[] = {
+	"[[:space:]]+|&([[:alpha:]]+|#([[:digit:]]+|x[[:xdigit:]]+));"
+};
+static regex_t EntityRegex;
+
+static void showTitle(const char *title) {
+	regmatch_t match = {0};
+	for (; *title; title += match.rm_eo) {
+		if (regexec(&EntityRegex, title, 1, &match, 0)) break;
+		if (title[match.rm_so] != '&') {
+			printf("%.*s ", (int)match.rm_so, title);
+			continue;
+		}
+		wchar_t ch = entity(&title[match.rm_so]);
+		if (ch) {
+			printf("%.*s%lc", (int)match.rm_so, title, (wint_t)ch);
+		} else {
+			printf("%.*s", (int)match.rm_eo, title);
+		}
+	}
+	printf("%s\n", title);
+}
+
+static CURL *curl;
+static bool title;
+static struct {
+	char buf[64 * 1024];
+	size_t len;
+} body;
+
+// HE COMES
+static const char TitlePattern[] = "<title>([^<]*)</title>";
+static regex_t TitleRegex;
+
+static size_t handleBody(char *buf, size_t size, size_t nitems, void *user) {
+	(void)user;
+	size_t len = size * nitems;
+	size_t cap = sizeof(body.buf) - body.len - 1;
+	size_t new = (len < cap ? len : cap);
+	if (title || !new) return len;
+
+	memcpy(&body.buf[body.len], buf, new);
+	body.len += new;
+	body.buf[body.len] = '\0';
+
+	regmatch_t match[2];
+	if (regexec(&TitleRegex, body.buf, 2, match, 0)) return len;
+	body.buf[match[1].rm_eo] = '\0';
+	showTitle(&body.buf[match[1].rm_so]);
+	title = true;
+
+	return len;
+}
+
+static CURLcode fetchTitle(const char *url) {
+	CURLcode code = curl_easy_setopt(curl, CURLOPT_URL, url);
+	if (code) return code;
+
+	curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+	code = curl_easy_perform(curl);
+	if (code) return code;
+
+	char *type;
+	code = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &type);
+	if (code) return code;
+	if (!type || strncmp(type, "text/html", 9)) return CURLE_OK;
+
+	char *dest;
+	curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &dest);
+	dest = strdup(dest);
+	if (!dest) err(EX_OSERR, "strdup");
+
+	code = curl_easy_setopt(curl, CURLOPT_URL, dest);
+	if (code) return code;
+	free(dest);
+
+	body.len = 0;
+	title = false;
+	curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L);
+	code = curl_easy_perform(curl);
+	return code;
+}
+
+int main(int argc, char *argv[]) {
+	EntityRegex = regex(EntityPattern, 0);
+	TitleRegex = regex(TitlePattern, REG_ICASE);
+
+	setlocale(LC_CTYPE, "");
+	setlinebuf(stdout);
+
+	CURLcode code = curl_global_init(CURL_GLOBAL_ALL);
+	if (code) errx(EX_OSERR, "curl_global_init: %s", curl_easy_strerror(code));
+
+	curl = curl_easy_init();
+	if (!curl) errx(EX_SOFTWARE, "curl_easy_init");
+
+	static char error[CURL_ERROR_SIZE];
+	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, error);
+
+	curl_easy_setopt(curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
+	curl_easy_setopt(
+		curl, CURLOPT_USERAGENT,
+		"curl/7.54.0 facebookexternalhit/1.1 Twitterbot/1.0"
+	);
+	curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+	curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+	curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 3L);
+	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+
+	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, handleBody);
+
+	bool exclude = false;
+	regex_t excludeRegex;
+
+	int opt;
+	while (0 < (opt = getopt(argc, argv, "x:v"))) {
+		switch (opt) {
+			break; case 'x': {
+				exclude = true;
+				excludeRegex = regex(optarg, REG_NOSUB);
+			}
+			break; case 'v': curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
+			break; default:  return EX_USAGE;
+		}
+	}
+
+	if (optind < argc) {
+		code = fetchTitle(argv[optind]);
+		if (!code) return EX_OK;
+		errx(EX_DATAERR, "curl_easy_perform: %s", error);
+	}
+
+	char *buf = NULL;
+	size_t cap = 0;
+
+	regex_t urlRegex = regex("https?://([^[:space:]>\"()]|[(][^)]*[)])+", 0);
+	while (0 < getline(&buf, &cap, stdin)) {
+		regmatch_t match = {0};
+		for (char *ptr = buf; *ptr; ptr += match.rm_eo) {
+			if (regexec(&urlRegex, ptr, 1, &match, 0)) break;
+			ptr[match.rm_eo] = '\0';
+			const char *url = &ptr[match.rm_so];
+			if (!exclude || regexec(&excludeRegex, url, 0, NULL, 0)) {
+				code = fetchTitle(url);
+				if (code) warnx("curl_easy_perform: %s", error);
+			}
+			ptr[match.rm_eo] = ' ';
+		}
+	}
+	if (ferror(stdin)) err(EX_IOERR, "getline");
+}