about summary refs log tree commit diff
diff options
context:
space:
mode:
author June McEnroe <june@causal.agency> 2019-12-06 12:44:59 -0500
committer June McEnroe <june@causal.agency> 2019-12-06 12:44:59 -0500
commit 95394a0bcb641ca201289cfea373f9780d45ad8b (patch)
tree 6ca65a22bee4f4c339675f2749f340950861f11e
parent Add eventsDelete trigger (diff)
download litterbox-95394a0bcb641ca201289cfea373f9780d45ad8b.tar.gz
litterbox-95394a0bcb641ca201289cfea373f9780d45ad8b.zip
Add deduplication function to unscoop
-rw-r--r-- unscoop.1 28
-rw-r--r-- unscoop.c 48
2 files changed, 58 insertions, 18 deletions
diff --git a/unscoop.1 b/unscoop.1
index e9e8dbe..9acba7c 100644
--- a/unscoop.1
+++ b/unscoop.1
@@ -8,11 +8,14 @@
 .
 .Sh SYNOPSIS
 .Nm
-.Op Fl C Ar context
-.Op Fl N Ar network
+.Fl C Ar context
+.Fl N Ar network
 .Op Fl d Ar path
 .Op Fl f Ar format
 .Ar
+.Nm
+.Fl D
+.Op Fl d Ar path
 .
 .Sh DESCRIPTION
 The
@@ -26,16 +29,25 @@ The arguments are as follows:
 .Bl -tag -width Ds
 .It Fl C Ar context
 Set the channel or query name of the imported logs.
+Contexts beginning with
+.Sq #
+or
+.Sq &
+are assumed to be channels.
+.
+.It Fl D
+Delete duplicate events caused by overlapping imports.
+This operation requires SQLite version 3.25.0 or newer.
 .
 .It Fl N Ar network
 Set the network name of the imported logs.
 .
 .It Fl d Ar path
-Import into the database at
-.Ar path .
-See
-.Xr litterbox 1
-for the default database path.
+Set the database path.
+The database must have been initialized by
+.Xr litterbox 1 .
+The default path is as in
+.Xr litterbox 1 .
 .
 .It Fl f Ar format
 Set the input log format.
@@ -43,6 +55,8 @@ The following formats are supported:
 .Sy generic ,
 .Sy textual ,
 .Sy catgirl .
+The default format is
+.Sy generic .
 .Pp
 The
 .Sy generic
diff --git a/unscoop.c b/unscoop.c
index 09a0e97..b39ee14 100644
--- a/unscoop.c
+++ b/unscoop.c
@@ -18,6 +18,7 @@
 #include <err.h>
 #include <regex.h>
 #include <sqlite3.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -171,22 +172,55 @@ bindMatch(sqlite3_stmt *stmt, int param, const char *str, regmatch_t match) {
 
 int main(int argc, char *argv[]) {
 	char *path = NULL;
+	bool dedup = false;
 	const char *network = NULL;
 	const char *context = NULL;
 	const struct Format *format = &Formats[0];
 
 	int opt;
-	while (0 < (opt = getopt(argc, argv, "C:N:d:f:"))) {
+	while (0 < (opt = getopt(argc, argv, "C:DN:d:f:"))) {
 		switch (opt) {
 			break; case 'C': context = optarg;
+			break; case 'D': dedup = true;
 			break; case 'N': network = optarg;
 			break; case 'd': path = optarg;
 			break; case 'f': format = formatParse(optarg);
 			break; default:  return EX_USAGE;
 		}
 	}
-	if (!network) errx(EX_USAGE, "network required");
-	if (!context) errx(EX_USAGE, "context required");
+	if (!dedup && !network) errx(EX_USAGE, "network required");
+	if (!dedup && !context) errx(EX_USAGE, "context required");
+
+	int flags = SQLITE_OPEN_READWRITE;
+	sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags));
+	if (!db) errx(EX_NOINPUT, "database not found");
+
+	if (dbVersion(db) != DatabaseVersion) {
+		errx(EX_CONFIG, "database needs migration");
+	}
+
+	if (dedup) {
+		if (sqlite3_libversion_number() < 3025000) {
+			errx(EX_CONFIG, "SQLite version 3.25.0 or newer required");
+		}
+		int error = sqlite3_exec(
+			db,
+			"WITH potentials AS ("
+			" SELECT events.id, events.id - first_value(events.id) OVER ("
+			"  PARTITION BY time, type, contextID, nick, target, message"
+			"  ORDER BY events.id"
+			" ) AS diff"
+			" FROM events JOIN names ON (names.id = nameID)"
+			"), duplicates AS (SELECT id FROM potentials WHERE diff > 50)"
+			"DELETE FROM events WHERE id IN duplicates;",
+			NULL, NULL, NULL
+		);
+		if (error) {
+			errx(EX_SOFTWARE, "sqlite3_exec: %s", sqlite3_errmsg(db));
+		}
+		printf("deleted %d events\n", sqlite3_changes(db));
+		return EX_OK;
+	}
 
 	for (size_t i = 0; i < format->len; ++i) {
 		struct Matcher *matcher = &format->matchers[i];
@@ -199,14 +233,6 @@ int main(int argc, char *argv[]) {
 		errx(EX_SOFTWARE, "regcomp: %s: %s", buf, matcher->pattern);
 	}
 
-	int flags = SQLITE_OPEN_READWRITE;
-	sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags));
-	if (!db) errx(EX_NOINPUT, "database not found");
-
-	if (dbVersion(db) != DatabaseVersion) {
-		errx(EX_CONFIG, "database needs migration");
-	}
-
 	sqlite3_stmt *insertNetwork = dbPrepare(
 		db, 0, "INSERT OR IGNORE INTO networks (name) VALUES ($network);"
 	);