From 95394a0bcb641ca201289cfea373f9780d45ad8b Mon Sep 17 00:00:00 2001 From: "C. McEnroe" Date: Fri, 6 Dec 2019 12:44:59 -0500 Subject: Add deduplication function to unscoop --- unscoop.1 | 28 +++++++++++++++++++++------- unscoop.c | 48 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 18 deletions(-) diff --git a/unscoop.1 b/unscoop.1 index e9e8dbe..9acba7c 100644 --- a/unscoop.1 +++ b/unscoop.1 @@ -8,11 +8,14 @@ . .Sh SYNOPSIS .Nm -.Op Fl C Ar context -.Op Fl N Ar network +.Fl C Ar context +.Fl N Ar network .Op Fl d Ar path .Op Fl f Ar format .Ar +.Nm +.Fl D +.Op Fl d Ar path . .Sh DESCRIPTION The @@ -26,16 +29,25 @@ The arguments are as follows: .Bl -tag -width Ds .It Fl C Ar context Set the channel or query name of the imported logs. +Contexts beginning with +.Sq # +or +.Sq & +are assumed to be channels. +. +.It Fl D +Delete duplicate events caused by overlapping imports. +This operation requires SQLite version 3.25.0 or newer. . .It Fl N Ar network Set the network name of the imported logs. . .It Fl d Ar path -Import into the database at -.Ar path . -See -.Xr litterbox 1 -for the default database path. +Set the database path. +The database must have been initialized by +.Xr litterbox 1 . +The default path is as in +.Xr litterbox 1 . . .It Fl f Ar format Set the input log format. @@ -43,6 +55,8 @@ The following formats are supported: .Sy generic , .Sy textual , .Sy catgirl . +The default format is +.Sy generic . .Pp The .Sy generic diff --git a/unscoop.c b/unscoop.c index 09a0e97..b39ee14 100644 --- a/unscoop.c +++ b/unscoop.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -171,22 +172,55 @@ bindMatch(sqlite3_stmt *stmt, int param, const char *str, regmatch_t match) { int main(int argc, char *argv[]) { char *path = NULL; + bool dedup = false; const char *network = NULL; const char *context = NULL; const struct Format *format = &Formats[0]; int opt; - while (0 < (opt = getopt(argc, argv, "C:N:d:f:"))) { + while (0 < (opt = getopt(argc, argv, "C:DN:d:f:"))) { switch (opt) { break; case 'C': context = optarg; + break; case 'D': dedup = true; break; case 'N': network = optarg; break; case 'd': path = optarg; break; case 'f': format = formatParse(optarg); break; default: return EX_USAGE; } } - if (!network) errx(EX_USAGE, "network required"); - if (!context) errx(EX_USAGE, "context required"); + if (!dedup && !network) errx(EX_USAGE, "network required"); + if (!dedup && !context) errx(EX_USAGE, "context required"); + + int flags = SQLITE_OPEN_READWRITE; + sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags)); + if (!db) errx(EX_NOINPUT, "database not found"); + + if (dbVersion(db) != DatabaseVersion) { + errx(EX_CONFIG, "database needs migration"); + } + + if (dedup) { + if (sqlite3_libversion_number() < 3025000) { + errx(EX_CONFIG, "SQLite version 3.25.0 or newer required"); + } + int error = sqlite3_exec( + db, + "WITH potentials AS (" + " SELECT events.id, events.id - first_value(events.id) OVER (" + " PARTITION BY time, type, contextID, nick, target, message" + " ORDER BY events.id" + " ) AS diff" + " FROM events JOIN names ON (names.id = nameID)" + "), duplicates AS (SELECT id FROM potentials WHERE diff > 50)" + "DELETE FROM events WHERE id IN duplicates;", + NULL, NULL, NULL + ); + if (error) { + errx(EX_SOFTWARE, "sqlite3_exec: %s", sqlite3_errmsg(db)); + } + printf("deleted %d events\n", sqlite3_changes(db)); + return EX_OK; + } for (size_t i = 0; i < format->len; ++i) { struct Matcher *matcher = &format->matchers[i]; @@ -199,14 +233,6 @@ int main(int argc, char *argv[]) { errx(EX_SOFTWARE, "regcomp: %s: %s", buf, matcher->pattern); } - int flags = SQLITE_OPEN_READWRITE; - sqlite3 *db = (path ? dbOpen(path, flags) : dbFind(flags)); - if (!db) errx(EX_NOINPUT, "database not found"); - - if (dbVersion(db) != DatabaseVersion) { - errx(EX_CONFIG, "database needs migration"); - } - sqlite3_stmt *insertNetwork = dbPrepare( db, 0, "INSERT OR IGNORE INTO networks (name) VALUES ($network);" ); -- cgit 1.4.1