#!/usr/bin/env python3
|
|
import argparse
|
|
import os
|
|
import re
|
|
|
|
from xml.etree import ElementTree
|
|
from subprocess import check_output, check_call
|
|
|
|
# Tables we don't want data from. These are usually populated with default data from an SMF install,
# or contain private data (members, PMs, logs, sessions) that must not end up in the archive.
# Note: an accidental empty-string entry was removed — mysqldump treats every argument after the
# database name as a table name, and "" would make the dump fail with "Couldn't find table".
DUMP_STRUCTURE_ONLY = [
    "smf_admin_info_files", "smf_ads", "smf_ads_settings", "smf_approval_queue", "smf_awards", "smf_ban_groups", "smf_ban_items",
    "smf_bbcodes", "smf_board_permissions", "smf_buddies", "smf_calendar", "smf_calendar_holidays", "smf_cf_fields", "smf_cf_forms",
    "smf_collapsed_categories", "smf_custom_actions", "smf_custom_fields", "smf_down_cat", "smf_down_catperm", "smf_down_comment",
    "smf_down_creport", "smf_down_custom_field", "smf_down_custom_field_data", "smf_down_file", "smf_down_groupquota", "smf_down_rating",
    "smf_down_report", "smf_down_userquota", "smf_global_announcements", "smf_global_announcements_boards", "smf_groupmods",
    "smf_group_moderators", "smf_login", "smf_log_actions", "smf_log_activity", "smf_log_banned", "smf_log_boards", "smf_log_comments",
    "smf_log_digest", "smf_log_errors", "smf_log_floodcontrol", "smf_log_group_requests", "smf_log_issues", "smf_log_karma",
    "smf_log_mark_read", "smf_log_member_notices", "smf_log_notify", "smf_log_notify_projects", "smf_log_online", "smf_log_packages",
    "smf_log_polls", "smf_log_projects", "smf_log_project_mark_read", "smf_log_reported", "smf_log_reported_comments",
    "smf_log_scheduled_tasks", "smf_log_search_messages", "smf_log_search_results", "smf_log_search_subjects", "smf_log_search_topics",
    "smf_log_spider_hits", "smf_log_spider_stats", "smf_log_subscribed", "smf_log_topics", "smf_mail_queue", "smf_membergroups", "smf_members",
    "smf_moderators", "smf_openid_assoc", "smf_package_servers", "smf_permissions", "smf_permission_profiles", "smf_personal_messages",
    "smf_picture_comments", "smf_pm_attachments", "smf_pm_recipients", "smf_pm_rules", "smf_profile_albums", "smf_profile_comments",
    "smf_profile_pictures", "smf_scheduled_tasks", "smf_sessions", "smf_settings", "smf_smileys", "smf_spiders", "smf_subscriptions", "smf_themes"
]
|
|
|
|
# Tables we want all the data from. Some legacy data that may be of interest is also here.
# Everything listed is considered safe to publish (no credentials, PMs, or logs).
DUMP_ALL_DATA = [
    # actual forum content
    "smf_polls", "smf_poll_choices", "smf_message_icons", "smf_attachments",

    # pre-wiki (orange glove)
    "comments", "dirs", "docs", "glitchdex", "glitchdex2", "groups", "old", "staff", "statsbydex",

    # felblog (old blogging system)
    "smf_felblog_categories", "smf_felblog_cmnt_log", "smf_felblog_comments", "smf_felblog_content", "smf_felblog_cont_log", "smf_felblog_manager",
    "smf_felblog_ratings", "smf_felblog_settings",

    # arcade system
    "smf_games", "smf_games_category", "smf_games_challenge", "smf_games_favorite", "smf_games_high", "smf_games_rate",
    "smf_games_settings", "smf_games_tournament", "smf_games_tournament_players", "smf_games_tournament_results", "smf_games_tournament_scores",

    # shop system
    "smf_shop_inventory", "smf_shop_items", "smf_shop_shops",

    # project management system (we had this?)
    "smf_projects", "smf_projects_settings", "smf_project_developer", "smf_project_permissions", "smf_project_profiles", "smf_project_timeline",
    "smf_project_trackers", "smf_project_versions", "smf_issues", "smf_issue_attachments", "smf_issue_category", "smf_issue_comments",
    "smf_issue_tags",

    # used for the IRC bridge
    "smf_slartibartfast"
]
|
|
|
|
# Special tables we need to filter: these are dumped with WHERE clauses (see the
# bottom of the script) rather than wholesale, so hidden categories/boards/threads
# can be excluded from the archive.
CATEGORIES = "smf_categories"
BOARDS = "smf_boards"
TOPICS = "smf_topics"
MESSAGES = "smf_messages"

# Dump filenames (each dump step is skipped if its file already exists).
STRUCTURE_DUMP = "structure.sql"
MISC_DUMP = "misc_data.sql"
CATEGORIES_DUMP = "categories.sql"
BOARDS_DUMP = "boards.sql"
TOPICS_DUMP = "threads.sql"

# Categories we are not interested in archiving.
# `id_cat` in (1, 2)
DO_NOT_ARCHIVE_CATEGORIES = [
    12,  # Epsilon: ?????
    6,   # Sigma: Higher Access
    8    # Omega: Garbage
]

# Boards we are not interested in archiving.
# `id_board` in (1, 2)
DO_NOT_ARCHIVE_BOARDS = [
    40,  # Exclusive Board
    65,  # Requests for Moderatorship
    66,  # Requests for Membership+
    67,  # Requests for Distinguished Membership
    23,  # M.A.S.K. HQ (Staff Board)
    22,  # Admins Only Board
    89,  # Test Board
    86,  # Omega Archives
    51, 37, 79, 26, 47, 44, 99, 93, 119, 96,  # NOTE(review): undocumented board ids — purpose unknown from here
    28,  # The Dumpster Out Back
    123  # ?????
]

# Regexes for sensitive information. These are applied to the raw SQL dump text,
# so both patterns match single-quoted values as they appear in INSERT statements.
EMAIL_REGEX = re.compile(r"'[^'\s]+@[^'\s]+'")   # any quoted value containing an '@'
IP_REGEX = re.compile(r"'\d+\.\d+\.\d+\.\d+'")   # quoted dotted-quad; IPv4 only, IPv6 is NOT scrubbed
|
|
|
|
class Database:
    """Thin wrapper around the ``mysqldump`` and ``mysql`` command-line clients.

    Holds connection credentials and builds/executes the external commands;
    the password is kept out of the printed command lines via format_command.
    """

    def __init__(self, host, database, username, password):
        self.host = host
        self.database = database
        self.username = username
        self.password = password

    def dump(self, tables, filename, *args):
        """Dump ``tables`` with mysqldump.

        If ``filename`` is given, write the dump there and return the filename;
        otherwise return the dump text captured from stdout. Extra ``args`` are
        passed straight through to mysqldump (e.g. ``--no-data``).
        """
        cmd = ["mysqldump", *args, *self.auth()]
        if filename:
            cmd.append("--result-file={}".format(filename))
        cmd.append(self.database)
        cmd.extend(tables)

        # Log the command with the password argument stripped.
        print(">> {}".format(format_command(cmd)))
        if not filename:
            return check_output(cmd).strip().decode()
        check_call(cmd)
        return filename

    def query(self, query):
        """Run ``query`` via the mysql client and return rows as dicts.

        Uses ``--xml`` output so the result can be parsed without a DB driver.
        """
        cmd = ["mysql", *self.auth(), "--xml", self.database, "-e", query]

        print(">> {}".format(format_command(cmd)))
        xml_output = check_output(cmd).strip()
        rows = []
        for row_element in ElementTree.fromstring(xml_output):
            rows.append(row_from_xml(row_element))
        return rows

    def auth(self):
        """Return the common authentication arguments for both clients."""
        return [
            "--user={}".format(self.username),
            "--password={}".format(self.password),
            "--host={}".format(self.host),
        ]
|
|
|
|
def format_command(command):
    """Render a command argv list for logging, omitting any --password argument."""
    safe_parts = [part for part in command if not part.startswith("--password")]
    return str(safe_parts)
|
|
|
|
def row_from_xml(element):
    """Convert one mysql ``--xml`` ``<row>`` element into a {column: text} dict.

    NULL columns come through as None (empty element text); duplicate column
    names keep the last occurrence, matching plain dict assignment.
    """
    return {field.attrib['name']: field.text for field in element}
|
|
|
|
parser = argparse.ArgumentParser(description="Forum scraper")
|
|
parser.add_argument(
|
|
"--host",
|
|
dest="host",
|
|
default="127.0.0.1",
|
|
help="Database host"
|
|
)
|
|
parser.add_argument(
|
|
"--db",
|
|
dest="db",
|
|
default="glitchcity",
|
|
help="Database name"
|
|
)
|
|
parser.add_argument(
|
|
"--username",
|
|
dest="username",
|
|
default="glitchcity",
|
|
help="Database username"
|
|
)
|
|
parser.add_argument(
|
|
"--password",
|
|
dest="password",
|
|
required=True,
|
|
help="Database password"
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
database = Database(args.host, args.db, args.username, args.password)
|
|
|
|
# Select which topics we DON'T want, based on the board ids we don't want
|
|
do_not_archive_thread_ids = [row['id_topic'] for row in database.query("SELECT id_topic FROM smf_topics WHERE id_board IN ({})".format(",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])))]
|
|
|
|
if not os.path.exists(STRUCTURE_DUMP):
|
|
database.dump(DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], STRUCTURE_DUMP, "--no-data")
|
|
|
|
if not os.path.exists(MISC_DUMP):
|
|
database.dump(DUMP_ALL_DATA, MISC_DUMP, "--no-create-info")
|
|
|
|
category_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_CATEGORIES])
|
|
board_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])
|
|
thread_filter = ",".join([str(id) for id in do_not_archive_thread_ids])
|
|
|
|
if not os.path.exists(CATEGORIES_DUMP):
|
|
database.dump([CATEGORIES], CATEGORIES_DUMP, "--where=NOT id_cat in ({})".format(category_filter), "--no-create-info")
|
|
|
|
if not os.path.exists(BOARDS_DUMP):
|
|
database.dump([BOARDS], BOARDS_DUMP, "--where=NOT id_board in ({})".format(board_filter), "--no-create-info")
|
|
|
|
with open(TOPICS_DUMP, "w", encoding="utf-8") as topics_dump:
|
|
dump_content = database.dump([TOPICS, MESSAGES], None, "--where=NOT id_topic in ({})".format(thread_filter), "--no-create-info")
|
|
dump_content = EMAIL_REGEX.sub("'*****@*****'", dump_content)
|
|
dump_content = IP_REGEX.sub("'***.***.***.***'", dump_content)
|
|
topics_dump.write(dump_content) |