Parse encoded-word and convert charsets

author: June McEnroe <june@causal.agency> 2020-04-16 16:40:22 -0400
committer: June McEnroe <june@causal.agency> 2020-04-16 16:40:22 -0400
commit: 1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8 (patch)
tree: 89904e68b3eb33ca707957b79ca60c6fc79139a4
parent: Add subject and recipient HTML classes (diff)
download: bubger-1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8.tar.gz
bubger-1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8.zip
3 files changed, 142 insertions, 5 deletions
diff --git a/Makefile b/Makefile
index 935f25e..b1db8d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,8 @@
 CFLAGS += -std=c11 -Wall -Wextra -Wpedantic
 LDLIBS = -ltls
 
+-include config.mk
+
 OBJS += archive.o
 OBJS += atom.o
 OBJS += concat.o
diff --git a/bubger.1 b/bubger.1
index 77f93fa..57d734f 100644
--- a/bubger.1
+++ b/bubger.1
@@ -1,4 +1,4 @@
-.Dd April 14, 2020
+.Dd April 16, 2020
 .Dt BUBGER 1
 .Os
 .
@@ -131,6 +131,15 @@ Rendered Atom, HTML and mboxrd files for each thread.
 .Re
 .It
 .Rs
+.%A K. Moore
+.%T Message Header Extensions for Non-ASCII Text
+.%I IETF
+.%N RFC 2047
+.%D November 1996
+.%U https://tools.ietf.org/html/rfc2047
+.Re
+.It
+.Rs
 .%A M. Crispin
 .%T INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 .%I IETF
@@ -160,6 +169,16 @@ Rendered Atom, HTML and mboxrd files for each thread.
 .Re
 .It
 .Rs
+.%A N. Freed
+.%A N. Borenstein
+.%T Format of Internet Message Bodies
+.%I IETF
+.%N RFC 2045
+.%D November 1996
+.%U https://tools.ietf.org/html/rfc2045
+.Re
+.It
+.Rs
 .%A T. Berners-Lee
 .%A L. Masinter
 .%A M. McCahill
diff --git a/decode.c b/decode.c
index 2e289fc..654cf22 100644
--- a/decode.c
+++ b/decode.c
@@ -15,18 +15,134 @@
  */
 
 #include <err.h>
+#include <errno.h>
+#include <iconv.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <strings.h>
 #include <sysexits.h>
 
 #include "archive.h"
 
-char *decodeHeader(const char *header) {
+struct Buffer { // Growable byte buffer that decoded text is appended to.
+	size_t cap; // Number of bytes allocated at ptr.
+	size_t len; // Number of bytes of ptr handed out so far.
+	char *ptr; // Heap storage obtained from malloc/realloc.
+};
+
+// Returns an empty buffer with room for at least cap bytes. Exits with
+// EX_OSERR if the allocation fails.
+static struct Buffer bufferAlloc(size_t cap) {
+	// malloc(0) is allowed to return NULL, which would be mistaken for an
+	// allocation failure below, so always request at least one byte.
+	if (!cap) cap = 1;
+	struct Buffer buf = {
+		.cap = cap,
+		.len = 0,
+		.ptr = malloc(cap),
+	};
+	if (!buf.ptr) err(EX_OSERR, "malloc");
+	return buf;
+}
+
+// Reserves len bytes at the end of the buffer, growing the allocation as
+// needed, and returns a pointer to the start of the reserved region. The
+// returned pointer may be invalidated by a later reservation that grows the
+// buffer. Exits with EX_OSERR if the buffer cannot be grown.
+static char *bufferDest(struct Buffer *buf, size_t len) {
+	// Compare against the free space rather than summing, so that a huge
+	// len cannot wrap around.
+	if (len > buf->cap - buf->len) {
+		// A single doubling is not enough when the request is larger than
+		// the current capacity, and doubling zero never grows at all.
+		size_t cap = (buf->cap ? buf->cap : 1);
+		while (len > cap - buf->len) {
+			if (cap > (size_t)-1 / 2) errx(EX_OSERR, "buffer too large");
+			cap *= 2;
+		}
+		char *ptr = realloc(buf->ptr, cap);
+		if (!ptr) err(EX_OSERR, "realloc");
+		buf->ptr = ptr;
+		buf->cap = cap;
+	}
+	char *dest = &buf->ptr[buf->len];
+	buf->len += len;
+	return dest;
+}
+
+// Appends len bytes starting at src to the end of the buffer.
+static void bufferCopy(struct Buffer *buf, const char *src, size_t len) {
+	memcpy(bufferDest(buf, len), src, len);
+}
+
+// Appends a NUL terminator and returns the buffer contents as a C string.
+// The terminator counts towards the buffer length.
+static char *bufferString(struct Buffer *buf) {
+	char *terminator = bufferDest(buf, 1);
+	*terminator = '\0';
+	// Read ptr only after reserving, since reserving may move the storage.
+	return buf->ptr;
+}
+
+// Converts len bytes of src from the named charset to UTF-8 and appends the
+// result to dst. If the charset is not supported, a warning is printed and
+// nothing is appended. If conversion fails part-way, a warning is printed
+// and only the bytes converted so far are appended.
+static void convertCharset(
+	struct Buffer *dst, const char *charset, const char *src, size_t len
+) {
+	iconv_t conv = iconv_open("utf-8", charset);
+	if (conv == (iconv_t)-1) {
+		warn("cannot convert from %s to utf-8", charset);
+		return;
+	}
+
+	// iconv advances a non-const input cursor; it does not write through it.
+	char *in = (char *)src;
+
+	// The output may be longer than the input, so whenever iconv runs out of
+	// room (E2BIG), reserve again with a little more padding and continue
+	// from where it stopped.
+	for (size_t pad = 0; len; ++pad) {
+		char *ptr = bufferDest(dst, len + pad);
+		size_t cap = dst->cap - (ptr - dst->ptr);
+		size_t n = iconv(conv, &in, &len, &ptr, &cap);
+		int error = errno;
+		// Shrink the length back to what was actually written before
+		// looking at the result, so that a failed conversion does not leave
+		// the reserved but unwritten bytes in the output.
+		dst->len = dst->cap - cap;
+		if (n == (size_t)-1 && error != E2BIG) {
+			errno = error;
+			warn("iconv");
+			break;
+		}
+	}
+
+	iconv_close(conv);
+}
+
+// Appends len bytes of src to dst after undoing the named transfer
+// encoding. Decoding is not implemented yet: the bytes are appended
+// unchanged, whatever the encoding.
+static void decodeEncoding(
+	struct Buffer *dst, const char *encoding, const char *src, size_t len
+) {
 	// TODO
-	char *dup = strdup(header);
-	if (!dup) err(EX_OSERR, "strdup");
-	return dup;
+	(void)encoding; // Unused until decoding is implemented.
+	bufferCopy(dst, src, len);
+}
+
+// Appends the decoded form of len bytes of src to dst. Text with no charset,
+// or in us-ascii or utf-8, is only passed through decodeEncoding; text in
+// any other charset is additionally converted to UTF-8.
+static void decode(
+	struct Buffer *dst, const char *encoding, const char *charset,
+	const char *src, size_t len
+) {
+	int needsConversion = (
+		charset &&
+		strcasecmp(charset, "us-ascii") != 0 &&
+		strcasecmp(charset, "utf-8") != 0
+	);
+	if (!needsConversion) {
+		decodeEncoding(dst, encoding, src, len);
+		return;
+	}
+
+	// TODO: Avoid copying if encoding is 8bit.
+	struct Buffer raw = bufferAlloc(len);
+	decodeEncoding(&raw, encoding, src, len);
+	convertCharset(dst, charset, raw.ptr, raw.len);
+	free(raw.ptr);
+}
+
+// Appends the decoded form of one candidate encoded-word to dst. src points
+// at len bytes of the form "=?charset?encoding?encoded-text?=". If the token
+// does not have that shape it is appended unchanged.
+static void decodeWord(struct Buffer *dst, const char *src, size_t len) {
+	// Work on a NUL-terminated private copy, since strsep writes into it.
+	struct Buffer word = bufferAlloc(len + 1);
+	bufferCopy(&word, src, len);
+
+	char *ptr = bufferString(&word);
+	strsep(&ptr, "?"); // Skip the leading "=".
+	char *charset = strsep(&ptr, "?");
+	char *encoding = strsep(&ptr, "?");
+	char *encoded = strsep(&ptr, "?");
+
+	if (charset && encoding && encoded && ptr && *ptr == '=') {
+		decode(dst, encoding, charset, encoded, strlen(encoded));
+		// Keep whatever follows the closing "?=" in the same token, such
+		// as trailing punctuation, rather than silently dropping it.
+		const char *rest = ptr + 1;
+		bufferCopy(dst, rest, strlen(rest));
+	} else {
+		bufferCopy(dst, src, len);
+	}
+
+	free(word.ptr);
+}
+
+// Returns a newly allocated copy of header in which every space-delimited
+// token beginning with "=?" has been passed through decodeWord. All other
+// text is copied unchanged. The caller owns the returned string.
+char *decodeHeader(const char *header) {
+	struct Buffer out = bufferAlloc(strlen(header) + 1);
+	for (const char *cursor = header; *cursor;) {
+		size_t span = strcspn(cursor, " ");
+		if (strncmp(cursor, "=?", 2) == 0) {
+			decodeWord(&out, cursor, span);
+		} else {
+			// Carry the delimiting space along with the plain token.
+			if (cursor[span] != '\0') span++;
+			bufferCopy(&out, cursor, span);
+		}
+		cursor += span;
+	}
+	return bufferString(&out);
+}
 
 int decodeContent(
author	June McEnroe <june@causal.agency>	2020-04-16 16:40:22 -0400
committer	June McEnroe <june@causal.agency>	2020-04-16 16:40:22 -0400
commit	1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8 (patch)
tree	89904e68b3eb33ca707957b79ca60c6fc79139a4
parent	Add subject and recipient HTML classes (diff)
download	bubger-1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8.tar.gz bubger-1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8.zip