summary refs log tree commit diff
path: root/unscoop.c
diff options
context:
space:
mode:
author June McEnroe <june@causal.agency> 2019-12-06 12:44:59 -0500
committer June McEnroe <june@causal.agency> 2019-12-06 12:44:59 -0500
commit 95394a0bcb641ca201289cfea373f9780d45ad8b (patch)
tree 6ca65a22bee4f4c339675f2749f340950861f11e /unscoop.c
parent Add eventsDelete trigger (diff)
download litterbox-95394a0bcb641ca201289cfea373f9780d45ad8b.tar.gz
litterbox-95394a0bcb641ca201289cfea373f9780d45ad8b.zip
Add deduplication function to unscoop
Diffstat (limited to 'unscoop.c')
-rw-r--r-- unscoop.c 48
1 files changed, 37 insertions, 11 deletions
diff --git a/unscoop.c b/unscoop.c
index 09a0e97..b39ee14 100644
--- a/unscoop.c
+++ b/unscoop.c
@@ -18,6 +18,7 @@
 #include <err.h>
 #include <regex.h>
 #include <sqlite3.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -171,22 +172,55 @@ bindMatch(sqlite3_stmt *stmt, int param, const char *str, regmatch_t match) {
 
 int main(int argc, char *argv[]) {
 	char *path = NULL;
+	bool dedup = false;
 	const char *network = NULL;
 	const char *context = NULL;
 	const struct Format *format = &Formats[0];
 
 	int opt;
-	while (0 < (opt = getopt(argc, argv, "C:N:d:f:"))) {
+	while (0 < (opt = getopt(argc, argv, "C:DN:d:f:"))) {
 		switch (opt) {
 			break; case 'C': context = optarg;
+			break; case 'D': dedup = true;
 			break; case 'N': network = optarg;
 			break; case 'd': path = optarg;
 			break; case 'f': format = formatParse(optarg);
 			break; default:  return EX_USAGE;
 		}
 	}
-	if (!network) errx(EX_USAGE, "network required");
-	if (!context) errx(EX_USAGE, "context required");
+	if (!dedup && !network) errx(EX_USAGE, "network required");
+	if (!dedup && !context) errx(EX_USAGE, "context required");
+
+	int flags = SQLITE_OPEN_READWRITE;
+	sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags));
+	if (!db) errx(EX_NOINPUT, "database not found");
+
+	if (dbVersion(db) != DatabaseVersion) {
+		errx(EX_CONFIG, "database needs migration");
+	}
+
+	if (dedup) {
+		if (sqlite3_libversion_number() < 3025000) {
+			errx(EX_CONFIG, "SQLite version 3.25.0 or newer required");
+		}
+		int error = sqlite3_exec(
+			db,
+			"WITH potentials AS ("
+			" SELECT events.id, events.id - first_value(events.id) OVER ("
+			"  PARTITION BY time, type, contextID, nick, target, message"
+			"  ORDER BY events.id"
+			" ) AS diff"
+			" FROM events JOIN names ON (names.id = nameID)"
+			"), duplicates AS (SELECT id FROM potentials WHERE diff > 50)"
+			"DELETE FROM events WHERE id IN duplicates;",
+			NULL, NULL, NULL
+		);
+		if (error) {
+			errx(EX_SOFTWARE, "sqlite3_exec: %s", sqlite3_errmsg(db));
+		}
+		printf("deleted %d events\n", sqlite3_changes(db));
+		return EX_OK;
+	}
 
 	for (size_t i = 0; i < format->len; ++i) {
 		struct Matcher *matcher = &format->matchers[i];
@@ -199,14 +233,6 @@ int main(int argc, char *argv[]) {
 		errx(EX_SOFTWARE, "regcomp: %s: %s", buf, matcher->pattern);
 	}
 
-	int flags = SQLITE_OPEN_READWRITE;
-	sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags));
-	if (!db) errx(EX_NOINPUT, "database not found");
-
-	if (dbVersion(db) != DatabaseVersion) {
-		errx(EX_CONFIG, "database needs migration");
-	}
-
 	sqlite3_stmt *insertNetwork = dbPrepare(
 		db, 0, "INSERT OR IGNORE INTO networks (name) VALUES ($network);"
 	);