From 1ec8758fc8f188d321ef3e6da09b0a91a7ed47b8 Mon Sep 17 00:00:00 2001
From: "C. McEnroe"
Date: Thu, 16 Apr 2020 16:40:22 -0400
Subject: Parse encoded-word and convert charsets

---
 Makefile |   2 +
 bubger.1 |  21 ++++++-
 decode.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 187 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 935f25e..b1db8d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,8 @@
 CFLAGS += -std=c11 -Wall -Wextra -Wpedantic
 LDLIBS = -ltls
 
+-include config.mk
+
 OBJS += archive.o
 OBJS += atom.o
 OBJS += concat.o
diff --git a/bubger.1 b/bubger.1
index 77f93fa..57d734f 100644
--- a/bubger.1
+++ b/bubger.1
@@ -1,4 +1,4 @@
-.Dd April 14, 2020
+.Dd April 16, 2020
 .Dt BUBGER 1
 .Os
 .
@@ -131,6 +131,15 @@ Rendered Atom, HTML and mboxrd files for each thread.
 .Re
 .It
 .Rs
+.%A K. Moore
+.%T Message Header Extensions for Non-ASCII Text
+.%I IETF
+.%N RFC 2047
+.%D November 1996
+.%U https://tools.ietf.org/html/rfc2047
+.Re
+.It
+.Rs
 .%A M. Crispin
 .%T INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 .%I IETF
@@ -160,6 +169,16 @@ Rendered Atom, HTML and mboxrd files for each thread.
 .Re
 .It
 .Rs
+.%A N. Freed
+.%A N. Borenstein
+.%T Format of Internet Message Bodies
+.%I IETF
+.%N RFC 2045
+.%D November 1996
+.%U https://tools.ietf.org/html/rfc2045
+.Re
+.It
+.Rs
 .%A T. Berners-Lee
 .%A L. Masinter
 .%A M. McCahill
diff --git a/decode.c b/decode.c
index 2e289fc..654cf22 100644
--- a/decode.c
+++ b/decode.c
@@ -15,18 +15,179 @@
  */
 
 #include <err.h>
+#include <errno.h>
+#include <iconv.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <strings.h>
 #include <sysexits.h>
 
 #include "archive.h"
 
-char *decodeHeader(const char *header) {
+// Growable byte buffer: the first len bytes of ptr are in use, out of cap
+// bytes allocated.
+struct Buffer {
+	size_t cap;
+	size_t len;
+	char *ptr;
+};
+
+// Returns an empty buffer with room for at least cap bytes. Exits the
+// program if the allocation fails.
+static struct Buffer bufferAlloc(size_t cap) {
+	// Never ask for zero bytes: malloc(0) is allowed to return NULL, which
+	// would be mistaken for an allocation failure.
+	if (!cap) cap = 1;
+	struct Buffer buf = {
+		.cap = cap,
+		.len = 0,
+		.ptr = malloc(cap),
+	};
+	if (!buf.ptr) err(EX_OSERR, "malloc");
+	return buf;
+}
+
+// Reserves len bytes at the end of buf and returns a pointer to them,
+// growing the allocation when needed (which may move buf->ptr). Exits the
+// program if the buffer cannot be grown.
+static char *bufferDest(struct Buffer *buf, size_t len) {
+	if (len > buf->cap - buf->len) {
+		// Double until the request fits; a single doubling is not enough
+		// when len is larger than the current capacity.
+		size_t cap = (buf->cap ? buf->cap : 1);
+		while (len > cap - buf->len) {
+			if (cap > (size_t)-1 / 2) errx(EX_OSERR, "buffer too large");
+			cap *= 2;
+		}
+		char *ptr = realloc(buf->ptr, cap);
+		if (!ptr) err(EX_OSERR, "realloc");
+		buf->ptr = ptr;
+		buf->cap = cap;
+	}
+	char *dest = &buf->ptr[buf->len];
+	buf->len += len;
+	return dest;
+}
+
+// Appends len bytes from src to buf.
+static void bufferCopy(struct Buffer *buf, const char *src, size_t len) {
+	char *dst = bufferDest(buf, len);
+	memcpy(dst, src, len);
+}
+
+// NUL-terminates buf and returns its storage as a C string. The
+// terminator is counted in buf->len.
+static char *bufferString(struct Buffer *buf) {
+	*bufferDest(buf, 1) = '\0';
+	return buf->ptr;
+}
+
+// Converts len bytes of src from charset to UTF-8 with iconv(3) and appends
+// the result to dst. If iconv_open fails, a warning is printed and nothing
+// is appended; if the conversion fails part way through, a warning is
+// printed and only the bytes converted so far are kept.
+static void convertCharset(
+	struct Buffer *dst, const char *charset, const char *src, size_t len
+) {
+	iconv_t conv = iconv_open("utf-8", charset);
+	if (conv == (iconv_t)-1) {
+		warn("cannot convert from %s to utf-8", charset);
+		return;
+	}
+
+	// Each pass reserves room for the remaining input plus a pad that grows
+	// by one, so a pass that runs out of room (E2BIG) is retried with more.
+	for (size_t pad = 0; len; ++pad) {
+		char *ptr = bufferDest(dst, len + pad);
+		size_t cap = dst->cap - (ptr - dst->ptr);
+		size_t n = iconv(conv, (char **)&src, &len, &ptr, &cap);
+		// Trim the reservation back to what iconv actually wrote before
+		// checking the result, so that unwritten bytes never end up in the
+		// output when the conversion fails.
+		dst->len = dst->cap - cap;
+		if (n == (size_t)-1 && errno != E2BIG) {
+			warn("iconv");
+			break;
+		}
+	}
+
+	iconv_close(conv);
+}
+
+// Placeholder for transfer decoding: encoding is ignored for now and the len
+// bytes of src are appended to dst unchanged.
+static void decodeEncoding(
+	struct Buffer *dst, const char *encoding, const char *src, size_t len
+) {
 	// TODO
-	char *dup = strdup(header);
-	if (!dup) err(EX_OSERR, "strdup");
-	return dup;
+	(void)encoding;
+	bufferCopy(dst, src, len);
+}
+
+// Appends src to dst after passing it through decodeEncoding. Unless charset
+// is NULL, "us-ascii" or "utf-8" (compared case-insensitively), the result
+// is also converted from charset to UTF-8.
+static void decode(
+	struct Buffer *dst, const char *encoding, const char *charset,
+	const char *src, size_t len
+) {
+	if (
+		!charset ||
+		!strcasecmp(charset, "us-ascii") ||
+		!strcasecmp(charset, "utf-8")
+	) {
+		decodeEncoding(dst, encoding, src, len);
+	} else {
+		// TODO: Avoid copying if encoding is 8bit.
+		struct Buffer decoded = bufferAlloc(len);
+		decodeEncoding(&decoded, encoding, src, len);
+		convertCharset(dst, charset, decoded.ptr, decoded.len);
+		free(decoded.ptr);
+	}
+}
+
+// Decodes one RFC 2047 encoded-word, "=?charset?encoding?text?=", of len
+// bytes at src and appends the result to dst. Input that does not have that
+// shape is appended unchanged.
+static void decodeWord(struct Buffer *dst, const char *src, size_t len) {
+	// strsep modifies its input, so split a NUL-terminated copy.
+	struct Buffer word = bufferAlloc(len + 1);
+	bufferCopy(&word, src, len);
+
+	char *ptr = bufferString(&word);
+	strsep(&ptr, "?"); // Skip the leading "=".
+	char *charset = strsep(&ptr, "?");
+	char *encoding = strsep(&ptr, "?");
+	char *encoded = strsep(&ptr, "?");
+
+	// The text after the fourth "?" must begin with the closing "=".
+	if (charset && encoding && encoded && ptr && *ptr == '=') {
+		decode(dst, encoding, charset, encoded, strlen(encoded));
+	} else {
+		bufferCopy(dst, src, len);
+	}
+
+	free(word.ptr);
+}
+
+// Returns a newly allocated copy of header in which every space-separated
+// token starting with "=?" has been passed through decodeWord. The caller
+// owns the returned string and releases it with free(3).
+char *decodeHeader(const char *header) {
+	struct Buffer buf = bufferAlloc(strlen(header) + 1);
+	while (*header) {
+		size_t len = strcspn(header, " ");
+		if (!strncmp(header, "=?", 2)) {
+			decodeWord(&buf, header, len);
+		} else {
+			// Plain token: copy it along with the space that ends it.
+			if (header[len]) len++;
+			bufferCopy(&buf, header, len);
+		}
+		header += len;
+	}
+	return bufferString(&buf);
 }
 
 int decodeContent(
-- 
cgit 1.4.1