epilogue/forum/dump_forum_data
2020-09-01 01:31:39 -05:00

203 lines
8.2 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import os
import re
from xml.etree import ElementTree
from subprocess import check_output, check_call
# Tables we don't want data from. These are usually populated with default data
# from an SMF install, or hold private/sensitive data (members, personal
# messages, logs, bans, sessions) that must not end up in the public archive.
# Fix: removed a stray empty string that was in this list — mysqldump would
# have received "" as a table name and aborted the structure dump.
DUMP_STRUCTURE_ONLY = [
    "smf_admin_info_files", "smf_ads", "smf_ads_settings", "smf_approval_queue", "smf_awards", "smf_ban_groups", "smf_ban_items",
    "smf_bbcodes", "smf_board_permissions", "smf_buddies", "smf_calendar", "smf_calendar_holidays", "smf_cf_fields", "smf_cf_forms",
    "smf_collapsed_categories", "smf_custom_actions", "smf_custom_fields", "smf_down_cat", "smf_down_catperm", "smf_down_comment",
    "smf_down_creport", "smf_down_custom_field", "smf_down_custom_field_data", "smf_down_file", "smf_down_groupquota", "smf_down_rating",
    "smf_down_report", "smf_down_userquota", "smf_global_announcements", "smf_global_announcements_boards", "smf_groupmods",
    "smf_group_moderators", "smf_login", "smf_log_actions", "smf_log_activity", "smf_log_banned", "smf_log_boards", "smf_log_comments",
    "smf_log_digest", "smf_log_errors", "smf_log_floodcontrol", "smf_log_group_requests", "smf_log_issues", "smf_log_karma",
    "smf_log_mark_read", "smf_log_member_notices", "smf_log_notify", "smf_log_notify_projects", "smf_log_online", "smf_log_packages",
    "smf_log_polls", "smf_log_projects", "smf_log_project_mark_read", "smf_log_reported", "smf_log_reported_comments",
    "smf_log_scheduled_tasks", "smf_log_search_messages", "smf_log_search_results", "smf_log_search_subjects", "smf_log_search_topics",
    "smf_log_spider_hits", "smf_log_spider_stats", "smf_log_subscribed", "smf_log_topics", "smf_mail_queue", "smf_membergroups", "smf_members",
    "smf_moderators", "smf_openid_assoc", "smf_package_servers", "smf_permissions", "smf_permission_profiles", "smf_personal_messages",
    "smf_picture_comments", "smf_pm_attachments", "smf_pm_recipients", "smf_pm_rules", "smf_profile_albums", "smf_profile_comments",
    "smf_profile_pictures", "smf_scheduled_tasks", "smf_sessions", "smf_settings", "smf_smileys", "smf_spiders", "smf_subscriptions", "smf_themes"
]
# Tables dumped with their full contents. Besides the live forum content,
# this keeps the data of several retired subsystems that is still of
# historical interest. Order matters only insofar as it is the order the
# tables are handed to mysqldump.
DUMP_ALL_DATA = [
    # current forum content
    "smf_polls", "smf_poll_choices",
    "smf_message_icons", "smf_attachments",
    # pre-wiki (orange glove)
    "comments", "dirs", "docs", "glitchdex", "glitchdex2",
    "groups", "old", "staff", "statsbydex",
    # felblog (old blogging system)
    "smf_felblog_categories", "smf_felblog_cmnt_log", "smf_felblog_comments",
    "smf_felblog_content", "smf_felblog_cont_log", "smf_felblog_manager",
    "smf_felblog_ratings", "smf_felblog_settings",
    # arcade system
    "smf_games", "smf_games_category", "smf_games_challenge",
    "smf_games_favorite", "smf_games_high", "smf_games_rate",
    "smf_games_settings", "smf_games_tournament", "smf_games_tournament_players",
    "smf_games_tournament_results", "smf_games_tournament_scores",
    # shop system
    "smf_shop_inventory", "smf_shop_items", "smf_shop_shops",
    # project management system (we had this?)
    "smf_projects", "smf_projects_settings", "smf_project_developer",
    "smf_project_permissions", "smf_project_profiles", "smf_project_timeline",
    "smf_project_trackers", "smf_project_versions", "smf_issues",
    "smf_issue_attachments", "smf_issue_category", "smf_issue_comments",
    "smf_issue_tags",
    # used for the IRC bridge
    "smf_slartibartfast",
]
# Special tables we need to filter: these hold the actual board/thread content
# and are dumped through the NOT IN (...) exclusion filters built below.
CATEGORIES = "smf_categories"
BOARDS = "smf_boards"
TOPICS = "smf_topics"
MESSAGES = "smf_messages"
# Output filenames for the individual dump stages. Note that TOPICS and
# MESSAGES are written together into TOPICS_DUMP after redaction.
STRUCTURE_DUMP = "structure.sql"
MISC_DUMP = "misc_data.sql"
CATEGORIES_DUMP = "categories.sql"
BOARDS_DUMP = "boards.sql"
TOPICS_DUMP = "threads.sql"
# Categories we are not interested in archiving.
# Used to build a `NOT id_cat in (...)` --where filter for the categories dump.
DO_NOT_ARCHIVE_CATEGORIES = [
7, # Links
12, # Epsilon: ?????
6, # Sigma: Higher Access
8 # Omega: Garbage
]
# Boards we are not interested in archiving.
# Used to build a `NOT id_board in (...)` --where filter for the boards dump;
# topics belonging to these boards are also excluded from the thread dump.
DO_NOT_ARCHIVE_BOARDS = [
24, 94, 118, 121, # Links
40, # Exclusive Board
65, # Requests for Moderatorship
66, # Requests for Membership+
67, # Requests for Distinguished Membership
23, # M.A.S.K. HQ (Staff Board)
22, # Admins Only Board
89, # Test Board
86, # Omega Archives
51, 37, 79, 26, 47, 44, 45, 99, 93, 119, 96, # NOTE(review): undocumented ids — presumably private/staff boards; confirm against the live board list
62, # Submit-A-Glitch Archives
3, 4, 5, 57, 58, 59, 38, 54, 63, 64, # NOTE(review): undocumented ids
68, 69, 70, 81, 82, 83, # NOTE(review): undocumented ids
28, # The Dumpster Out Back
123 # ?????
]
# Regexes for sensitive information, used to redact the thread dump text.
# Matches an entire single-quoted SQL string literal containing an e-mail
# address (anything with an '@' and no whitespace/quote on either side).
EMAIL_REGEX = re.compile(r"'[^'\s]+@[^'\s]+'")
# Matches an entire single-quoted dotted-quad IPv4 address.
# NOTE(review): IPv6 addresses, if any are stored, would NOT be redacted.
IP_REGEX = re.compile(r"'\d+\.\d+\.\d+\.\d+'")
class Database:
    """Thin wrapper around the `mysqldump` and `mysql` command-line clients."""

    def __init__(self, host, database, username, password):
        # Connection parameters. The password is handed to the client tools on
        # their command line (it is stripped from our own log output by
        # format_command, but remains visible in the process list).
        self.host = host
        self.database = database
        self.username = username
        self.password = password

    def dump(self, tables, filename, *args):
        """Run mysqldump over `tables`, forwarding `args` as extra flags.

        When `filename` is truthy the dump is written there via
        --result-file and the filename is returned; otherwise the dump text
        is captured from stdout and returned as a stripped str.
        """
        cmd = ["mysqldump", *args, *self.auth()]
        if filename:
            cmd.append("--result-file={}".format(filename))
        cmd.append(self.database)
        cmd.extend(tables)
        print(">> {}".format(format_command(cmd)))
        if not filename:
            return check_output(cmd).strip().decode()
        check_call(cmd)
        return filename

    def query(self, query):
        """Execute a SQL statement via `mysql --xml`; return rows as dicts."""
        cmd = ["mysql", *self.auth(), "--xml", self.database, "-e", query]
        print(">> {}".format(format_command(cmd)))
        root = ElementTree.fromstring(check_output(cmd).strip())
        rows = []
        for row_element in root:
            rows.append(row_from_xml(row_element))
        return rows

    def auth(self):
        """Credential flags shared by the mysql and mysqldump invocations."""
        return [
            "--user=" + self.username,
            "--password=" + self.password,
            "--host=" + self.host,
        ]
def format_command(command):
    """Render `command` for logging, dropping any --password argument."""
    redacted = [part for part in command if not part.startswith("--password")]
    return str(redacted)
def row_from_xml(element):
    """Turn one <row> element of `mysql --xml` output into a {column: text} dict."""
    return {field.attrib['name']: field.text for field in element}
# --- Command-line interface ---------------------------------------------
# NOTE(review): everything from here down runs at import time — the script
# has no `if __name__ == "__main__"` guard and is meant to be run directly.
parser = argparse.ArgumentParser(description="Forum scraper")
parser.add_argument(
    "--host",
    dest="host",
    default="127.0.0.1",
    help="Database host"
)
parser.add_argument(
    "--db",
    dest="db",
    default="glitchcity",
    help="Database name"
)
parser.add_argument(
    "--username",
    dest="username",
    default="glitchcity",
    help="Database username"
)
parser.add_argument(
    "--password",
    dest="password",
    required=True,  # the only mandatory argument; everything else has defaults
    help="Database password"
)
args = parser.parse_args()
# Single connection wrapper used for every dump/query below.
database = Database(args.host, args.db, args.username, args.password)
# Select which topics we DON'T want, based on the board ids we don't want.
# NOTE(review): if DO_NOT_ARCHIVE_BOARDS were ever empty this would produce an
# invalid `IN ()` clause — fine while the exclusion lists stay non-empty.
do_not_archive_thread_ids = [row['id_topic'] for row in database.query("SELECT id_topic FROM smf_topics WHERE id_board IN ({})".format(",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])))]
# Schema (no rows) for every table of interest. Each file-based dump is
# skipped when its output already exists, so the script can be re-run after a
# partial failure without repeating completed stages.
if not os.path.exists(STRUCTURE_DUMP):
    database.dump(DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], STRUCTURE_DUMP, "--no-data")
# Full row data for the non-sensitive tables (rows only; schema is above).
if not os.path.exists(MISC_DUMP):
    database.dump(DUMP_ALL_DATA, MISC_DUMP, "--no-create-info")
# Comma-separated id lists for mysqldump's --where exclusion filters.
category_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_CATEGORIES])
board_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])
thread_filter = ",".join([str(id) for id in do_not_archive_thread_ids])
if not os.path.exists(CATEGORIES_DUMP):
    database.dump([CATEGORIES], CATEGORIES_DUMP, "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info")
if not os.path.exists(BOARDS_DUMP):
    database.dump([BOARDS], BOARDS_DUMP, "--where=NOT id_board in ({})".format(board_filter), "--no-create-info")
# Topics and messages are captured from stdout (filename=None) so e-mail and
# IPv4 addresses can be redacted before anything touches disk.
# NOTE(review): unlike the dumps above, this stage is NOT guarded by
# os.path.exists, so threads.sql is regenerated on every run — presumably
# intentional since it is the redacted output; confirm before changing.
with open(TOPICS_DUMP, "w", encoding="utf-8") as topics_dump:
    dump_content = database.dump([TOPICS, MESSAGES], None, "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info")
    dump_content = EMAIL_REGEX.sub("'*****@*****'", dump_content)
    dump_content = IP_REGEX.sub("'***.***.***.***'", dump_content)
    topics_dump.write(dump_content)