#!/usr/bin/env python3
import argparse
import os
import re
from xml.etree import ElementTree
from subprocess import check_output, check_call

# Tables we don't want data from. These are usually populated with default data from an SMF install.
DUMP_STRUCTURE_ONLY = [
    "smf_admin_info_files", "smf_ads", "smf_ads_settings", "smf_approval_queue",
    "smf_awards", "smf_ban_groups", "smf_ban_items", "smf_bbcodes",
    "smf_board_permissions", "smf_buddies", "smf_calendar", "smf_calendar_holidays",
    "smf_cf_fields", "smf_cf_forms", "smf_collapsed_categories", "smf_custom_actions",
    "smf_custom_fields", "smf_down_cat", "smf_down_catperm", "smf_down_comment",
    "smf_down_creport", "smf_down_custom_field", "smf_down_custom_field_data",
    "smf_down_file", "smf_down_groupquota", "smf_down_rating", "smf_down_report",
    "smf_down_userquota", "smf_global_announcements", "smf_global_announcements_boards",
    "smf_groupmods", "smf_group_moderators", "smf_login", "smf_log_actions",
    "smf_log_activity", "smf_log_banned", "smf_log_boards", "smf_log_comments",
    "smf_log_digest", "smf_log_errors", "smf_log_floodcontrol", "smf_log_group_requests",
    "smf_log_issues", "smf_log_karma", "smf_log_mark_read", "smf_log_member_notices",
    "smf_log_notify", "smf_log_notify_projects", "smf_log_online", "smf_log_packages",
    "smf_log_polls", "smf_log_projects", "smf_log_project_mark_read", "smf_log_reported",
    "smf_log_reported_comments", "smf_log_scheduled_tasks", "smf_log_search_messages",
    "smf_log_search_results", "smf_log_search_subjects", "smf_log_search_topics",
    "smf_log_spider_hits", "smf_log_spider_stats", "smf_log_subscribed", "smf_log_topics",
    "smf_mail_queue", "smf_membergroups", "smf_members", "smf_moderators",
    "smf_openid_assoc", "smf_package_servers", "smf_permissions", "smf_permission_profiles",
    "smf_personal_messages", "smf_picture_comments", "smf_pm_attachments",
    "smf_pm_recipients", "smf_pm_rules", "smf_profile_albums", "smf_profile_comments",
    "smf_profile_pictures", "smf_scheduled_tasks", "smf_sessions", "smf_settings",
    "smf_smileys", "smf_spiders", "smf_subscriptions", "smf_themes"
]

# Tables we want all the data from. Some legacy data that may be of interest is also here.
DUMP_ALL_DATA = [
    # actual forum content
    "smf_polls", "smf_poll_choices", "smf_message_icons", "smf_attachments",

    # pre-wiki (orange glove)
    "comments", "dirs", "docs", "glitchdex", "glitchdex2", "groups", "old",
    "staff", "statsbydex",

    # felblog (old blogging system)
    "smf_felblog_categories", "smf_felblog_cmnt_log", "smf_felblog_comments",
    "smf_felblog_content", "smf_felblog_cont_log", "smf_felblog_manager",
    "smf_felblog_ratings", "smf_felblog_settings",

    # arcade system
    "smf_games", "smf_games_category", "smf_games_challenge", "smf_games_favorite",
    "smf_games_high", "smf_games_rate", "smf_games_settings", "smf_games_tournament",
    "smf_games_tournament_players", "smf_games_tournament_results",
    "smf_games_tournament_scores",

    # shop system
    "smf_shop_inventory", "smf_shop_items", "smf_shop_shops",

    # project management system (we had this?)
    "smf_projects", "smf_projects_settings", "smf_project_developer",
    "smf_project_permissions", "smf_project_profiles", "smf_project_timeline",
    "smf_project_trackers", "smf_project_versions", "smf_issues",
    "smf_issue_attachments", "smf_issue_category", "smf_issue_comments",
    "smf_issue_tags",

    # used for the IRC bridge
    "smf_slartibartfast"
]

# Special tables we need to filter.
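# (These four hold the actual forum content; they are dumped further below with
# --where filters so excluded categories, boards and threads are left out, and
# the topics/messages dump is additionally scrubbed of emails and IP addresses.)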
CATEGORIES = "smf_categories"
BOARDS = "smf_boards"
TOPICS = "smf_topics"
MESSAGES = "smf_messages"

# Dump filenames
STRUCTURE_DUMP = "structure.sql"
MISC_DUMP = "misc_data.sql"
CATEGORIES_DUMP = "categories.sql"
BOARDS_DUMP = "boards.sql"
TOPICS_DUMP = "threads.sql"

# Categories we are not interested in archiving.
# `id_cat` in (1, 2)
DO_NOT_ARCHIVE_CATEGORIES = [
    12,  # Epsilon: ?????
    6,   # Sigma: Higher Access
    8    # Omega: Garbage
]

# Boards we are not interested in archiving.
# `id_board` in (1, 2)
DO_NOT_ARCHIVE_BOARDS = [
    40,   # Exclusive Board
    65,   # Requests for Moderatorship
    66,   # Requests for Membership+
    67,   # Requests for Distinguished Membership
    23,   # M.A.S.K. HQ (Staff Board)
    22,   # Admins Only Board
    89,   # Test Board
    86,   # Omega Archives
    51, 37, 79, 26, 47, 44, 99, 93, 119, 96,
    28,   # The Dumpster Out Back
    123   # ?????
]

# Regexes for sensitive information
EMAIL_REGEX = re.compile(r"'[^']+@[^']+'")
IP_REGEX = re.compile(r"'\d+\.\d+\.\d+\.\d+'")


class Database():
    # Thin wrapper around the mysql/mysqldump command-line clients.

    def __init__(self, host, database, username, password):
        self.host = host
        self.database = database
        self.username = username
        self.password = password

    def dump(self, tables, filename, *args):
        # Dump `tables` with mysqldump, passing any extra flags through from *args.
        # With a filename the dump is written to disk; without one the SQL is
        # returned as a string so it can be post-processed.
        command = ["mysqldump"] + list(args) + self.auth()
        if filename:
            command.append("--result-file={}".format(filename))
        command.append(self.database)
        command = command + tables
        print(">> {}".format(format_command(command)))
        if filename:
            check_call(command)
            return filename
        else:
            return check_output(command).strip().decode()

    def query(self, query):
        # Run a query through the mysql client and parse its --xml output.
        command = ["mysql"] + self.auth() + [
            "--xml", self.database, "-e", query
        ]
        print(">> {}".format(format_command(command)))
        result = check_output(command).strip()
        return [row_from_xml(element) for element in ElementTree.fromstring(result)]

    def auth(self):
        return [
            "--user={}".format(self.username),
            "--password={}".format(self.password),
            "--host={}".format(self.host)
        ]


def format_command(command):
    # Echo a command with the --password argument stripped out.
    return "{}".format([item for item in command if not item.startswith("--password")])


def row_from_xml(element):
    # Turn a <row> element from mysql --xml output into a {column: value} dict.
    row = {}
    for child in element:
        row[child.attrib['name']] = child.text
    return row


parser = argparse.ArgumentParser(description="Forum scraper")
parser.add_argument("--host", dest="host", default="127.0.0.1", help="Database host")
parser.add_argument("--db", dest="db", default="glitchcity", help="Database name")
parser.add_argument("--username", dest="username", default="glitchcity", help="Database username")
parser.add_argument("--password", dest="password", required=True, help="Database password")
args = parser.parse_args()

database = Database(args.host, args.db, args.username, args.password)

# Select which topics we DON'T want, based on the board ids we don't want
do_not_archive_thread_ids = [
    row['id_topic'] for row in database.query(
        "SELECT id_topic FROM smf_topics WHERE id_board IN ({})".format(
            ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])
        )
    )
]

# Schema for every table we keep (no rows).
if not os.path.exists(STRUCTURE_DUMP):
    database.dump(
        DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES],
        STRUCTURE_DUMP,
        "--no-data"
    )

# Full data for the tables we archive wholesale.
if not os.path.exists(MISC_DUMP):
    database.dump(DUMP_ALL_DATA, MISC_DUMP, "--no-create-info")

category_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_CATEGORIES])
board_filter = ",".join([str(id) for id in DO_NOT_ARCHIVE_BOARDS])
thread_filter = ",".join([str(id) for id in do_not_archive_thread_ids])

if not os.path.exists(CATEGORIES_DUMP):
    database.dump(
        [CATEGORIES],
        CATEGORIES_DUMP,
        "--where=NOT id_cat in ({})".format(category_filter),
        "--no-create-info"
    )
if not os.path.exists(BOARDS_DUMP):
    database.dump(
        [BOARDS],
        BOARDS_DUMP,
        "--where=NOT id_board in ({})".format(board_filter),
        "--no-create-info"
    )

# Topics and messages are dumped to a string so email addresses and IP addresses
# can be scrubbed before the file is written.
with open(TOPICS_DUMP, "w", encoding="utf-8") as topics_dump:
    dump_content = database.dump(
        [TOPICS, MESSAGES],
        None,
        "--where=NOT id_topic in ({})".format(thread_filter),
        "--no-create-info"
    )
    dump_content = EMAIL_REGEX.sub("'*****@*****'", dump_content)
    dump_content = IP_REGEX.sub("'***.***.***.***'", dump_content)
    topics_dump.write(dump_content)
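# Example invocation (illustrative; the script name and credentials below are placeholders):
#
#   ./scrape.py --host 127.0.0.1 --db glitchcity --username glitchcity --password hunter2
#
# Each dump is skipped if its output file already exists, so delete structure.sql,
# misc_data.sql, categories.sql or boards.sql to regenerate them; threads.sql is
# rewritten on every run.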