Filter out sensitive information (emails, IP addresses) from the messages table.
This commit is contained in:
parent
eac65f6bda
commit
73dae54cf1
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
|
||||
from xml.etree import ElementTree
|
||||
from subprocess import check_output, check_call
|
||||
@ -56,6 +58,13 @@ BOARDS = "smf_boards"
|
||||
TOPICS = "smf_topics"
|
||||
MESSAGES = "smf_messages"
|
||||
|
||||
# Dump filenames
|
||||
STRUCTURE_DUMP = "structure.sql"
|
||||
MISC_DUMP = "misc_data.sql"
|
||||
CATEGORIES_DUMP = "categories.sql"
|
||||
BOARDS_DUMP = "boards.sql"
|
||||
TOPICS_DUMP = "threads.sql"
|
||||
|
||||
# Categories we are not interested in archiving.
|
||||
# `id_cat` in (1, 2)
|
||||
DO_NOT_ARCHIVE_CATEGORIES = [
|
||||
@ -80,6 +89,10 @@ DO_NOT_ARCHIVE_BOARDS = [
|
||||
123 # ?????
|
||||
]
|
||||
|
||||
# Regexes for sensitive information
|
||||
EMAIL_REGEX = re.compile(r"'[^']+@[^']+'")
|
||||
IP_REGEX = re.compile(r"'\d+\.\d+\.\d+\.\d+'")
|
||||
|
||||
class Database():
|
||||
def __init__(self, host, database, username, password):
|
||||
self.host = host
|
||||
@ -87,14 +100,21 @@ class Database():
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
def dump(self, filename, tables, *args):
|
||||
command = ["mysqldump"] + list(args) + self.auth() + [
|
||||
"--result-file={}".format(filename),
|
||||
self.database
|
||||
] + tables
|
||||
def dump(self, tables, filename, *args):
|
||||
command = ["mysqldump"] + list(args) + self.auth()
|
||||
|
||||
if filename:
|
||||
command.append("--result-file={}".format(filename))
|
||||
|
||||
command.append(self.database)
|
||||
command = command + tables
|
||||
|
||||
print(">> {}".format(format_command(command)))
|
||||
check_call(command)
|
||||
if filename:
|
||||
check_call(command)
|
||||
return filename
|
||||
else:
|
||||
return check_output(command).strip().decode()
|
||||
|
||||
def query(self, query):
|
||||
command = ["mysql"] + self.auth() + [
|
||||
@ -155,13 +175,24 @@ database = Database(args.host, args.db, args.username, args.password)
|
||||
# Select which topics we DON'T want, based on the board ids we don't want
|
||||
do_not_archive_thread_ids = [row['id_topic'] for row in database.query("SELECT id_topic FROM smf_topics WHERE id_board IN ({})".format(",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])))]
|
||||
|
||||
database.dump("structure.sql", DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], "--no-data")
|
||||
database.dump("misc_data.sql", DUMP_ALL_DATA, "--no-create-info")
|
||||
if not os.path.exists(STRUCTURE_DUMP):
|
||||
database.dump(DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], STRUCTURE_DUMP, "--no-data")
|
||||
|
||||
if not os.path.exists(MISC_DUMP):
|
||||
database.dump(DUMP_ALL_DATA, MISC_DUMP, "--no-create-info")
|
||||
|
||||
category_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_CATEGORIES])
|
||||
board_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])
|
||||
thread_filter = ",".join([str(id) for id in do_not_archive_thread_ids])
|
||||
|
||||
database.dump("categories.sql", [CATEGORIES], "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info")
|
||||
database.dump("boards.sql", [BOARDS], "--where=NOT id_board in ({})".format(board_filter), "--no-create-info")
|
||||
database.dump("threads.sql", [TOPICS, MESSAGES], "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info")
|
||||
if not os.path.exists(CATEGORIES_DUMP):
|
||||
database.dump([CATEGORIES], CATEGORIES_DUMP, "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info")
|
||||
|
||||
if not os.path.exists(BOARDS_DUMP):
|
||||
database.dump([BOARDS], BOARDS_DUMP, "--where=NOT id_board in ({})".format(board_filter), "--no-create-info")
|
||||
|
||||
with open(TOPICS_DUMP, "w", encoding="utf-8") as topics_dump:
|
||||
dump_content = database.dump([TOPICS, MESSAGES], None, "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info")
|
||||
dump_content = EMAIL_REGEX.sub("'*****@*****'", dump_content)
|
||||
dump_content = IP_REGEX.sub("'***.***.***.***'", dump_content)
|
||||
topics_dump.write(dump_content)
|
Loading…
x
Reference in New Issue
Block a user