From 73dae54cf11a916f16ad4a80549d4e2c9b10036e Mon Sep 17 00:00:00 2001 From: Adrian Malacoda Date: Tue, 4 Aug 2020 02:29:42 -0500 Subject: [PATCH] Filter out sensitive information (emails, IP addresses) from messages table. --- dump_forum_data | 53 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/dump_forum_data b/dump_forum_data index 2cad504..3d58c29 100755 --- a/dump_forum_data +++ b/dump_forum_data @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import argparse +import os +import re from xml.etree import ElementTree from subprocess import check_output, check_call @@ -56,6 +58,13 @@ BOARDS = "smf_boards" TOPICS = "smf_topics" MESSAGES = "smf_messages" +# Dump filenames +STRUCTURE_DUMP = "structure.sql" +MISC_DUMP = "misc_data.sql" +CATEGORIES_DUMP = "categories.sql" +BOARDS_DUMP = "boards.sql" +TOPICS_DUMP = "threads.sql" + # Categories we are not interested in archiving. # `id_cat` in (1, 2) DO_NOT_ARCHIVE_CATEGORIES = [ @@ -80,6 +89,10 @@ DO_NOT_ARCHIVE_BOARDS = [ 123 # ????? 
] +# Regexes for sensitive information +EMAIL_REGEX = re.compile(r"'[^']+@[^']+'") +IP_REGEX = re.compile(r"'\d+\.\d+\.\d+\.\d+'") + class Database(): def __init__(self, host, database, username, password): self.host = host @@ -87,14 +100,21 @@ class Database(): self.username = username self.password = password - def dump(self, filename, tables, *args): - command = ["mysqldump"] + list(args) + self.auth() + [ - "--result-file={}".format(filename), - self.database - ] + tables + def dump(self, tables, filename, *args): + command = ["mysqldump"] + list(args) + self.auth() + + if filename: + command.append("--result-file={}".format(filename)) + + command.append(self.database) + command = command + tables print(">> {}".format(format_command(command))) - check_call(command) + if filename: + check_call(command) + return filename + else: + return check_output(command).strip().decode() def query(self, query): command = ["mysql"] + self.auth() + [ @@ -155,13 +175,24 @@ database = Database(args.host, args.db, args.username, args.password) # Select which topics we DON'T want, based on the board ids we don't want do_not_archive_thread_ids = [row['id_topic'] for row in database.query("SELECT id_topic FROM smf_topics WHERE id_board IN ({})".format(",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])))] -database.dump("structure.sql", DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], "--no-data") -database.dump("misc_data.sql", DUMP_ALL_DATA, "--no-create-info") +if not os.path.exists(STRUCTURE_DUMP): + database.dump(DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], STRUCTURE_DUMP, "--no-data") + +if not os.path.exists(MISC_DUMP): + database.dump(DUMP_ALL_DATA, MISC_DUMP, "--no-create-info") category_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_CATEGORIES]) board_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS]) thread_filter = ",".join([str(id) for id in do_not_archive_thread_ids]) 
-database.dump("categories.sql", [CATEGORIES], "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info") -database.dump("boards.sql", [BOARDS], "--where=NOT id_board in ({})".format(board_filter), "--no-create-info") -database.dump("threads.sql", [TOPICS, MESSAGES], "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info") \ No newline at end of file +if not os.path.exists(CATEGORIES_DUMP): + database.dump([CATEGORIES], CATEGORIES_DUMP, "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info") + +if not os.path.exists(BOARDS_DUMP): + database.dump([BOARDS], BOARDS_DUMP, "--where=NOT id_board in ({})".format(board_filter), "--no-create-info") + +with open(TOPICS_DUMP, "w", encoding="utf-8") as topics_dump: + dump_content = database.dump([TOPICS, MESSAGES], None, "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info") + dump_content = EMAIL_REGEX.sub("'*****@*****'", dump_content) + dump_content = IP_REGEX.sub("'***.***.***.***'", dump_content) + topics_dump.write(dump_content) \ No newline at end of file