From eac65f6bdafd53cb683fb50fd0dfa3e31adced42 Mon Sep 17 00:00:00 2001 From: Adrian Malacoda Date: Mon, 3 Aug 2020 05:02:43 -0500 Subject: [PATCH] Add dump_forum_data script: This connects to a GCL database and creates SQL dumps that can be distributed publicly. Specifically, it creates these scripts: - structure.sql: Database structure for all tables. - misc_data.sql: Misc. data that may be of interest, including legacy GCLF features such as the shop and the arcade, and The Orange Glove content. - categories.sql: Category data for public categories. - boards.sql: Board data for public boards. - threads.sql: Thread (topic) and message data. This is the meat of the forums. What's NOT included: - Personal/Private messages - Member accounts - Administrative stuff like error logs - Private categories and boards - Garbage category (Lab Omega) --- dump_forum_data | 167 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100755 dump_forum_data diff --git a/dump_forum_data b/dump_forum_data new file mode 100755 index 0000000..2cad504 --- /dev/null +++ b/dump_forum_data @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +import argparse + +from xml.etree import ElementTree +from subprocess import check_output, check_call + +# Tables we don't want data from. These are usually populated with default data from an SMF install. 
# Every entry is handed to mysqldump as a table name for the "--no-data"
# structure dump, so the list must contain real, non-empty table names only.
DUMP_STRUCTURE_ONLY = [
    "smf_admin_info_files", "smf_ads", "smf_ads_settings", "smf_approval_queue", "smf_awards", "smf_ban_groups", "smf_ban_items",
    "smf_bbcodes", "smf_board_permissions", "smf_buddies", "smf_calendar", "smf_calendar_holidays", "smf_cf_fields", "smf_cf_forms",
    "smf_collapsed_categories", "smf_custom_actions", "smf_custom_fields", "smf_down_cat", "smf_down_catperm", "smf_down_comment",
    "smf_down_creport", "smf_down_custom_field", "smf_down_custom_field_data", "smf_down_file", "smf_down_groupquota", "smf_down_rating",
    "smf_down_report", "smf_down_userquota", "smf_global_announcements", "smf_global_announcements_boards", "smf_groupmods",
    "smf_group_moderators", "smf_login", "smf_log_actions", "smf_log_activity", "smf_log_banned", "smf_log_boards", "smf_log_comments",
    "smf_log_digest", "smf_log_errors", "smf_log_floodcontrol", "smf_log_group_requests", "smf_log_issues", "smf_log_karma",
    "smf_log_mark_read", "smf_log_member_notices", "smf_log_notify", "smf_log_notify_projects", "smf_log_online", "smf_log_packages",
    "smf_log_polls", "smf_log_projects", "smf_log_project_mark_read", "smf_log_reported", "smf_log_reported_comments",
    "smf_log_scheduled_tasks", "smf_log_search_messages", "smf_log_search_results", "smf_log_search_subjects", "smf_log_search_topics",
    "smf_log_spider_hits", "smf_log_spider_stats", "smf_log_subscribed", "smf_log_topics", "smf_mail_queue", "smf_membergroups", "smf_members",
    "smf_moderators", "smf_openid_assoc", "smf_package_servers", "smf_permissions", "smf_permission_profiles", "smf_personal_messages",
    "smf_picture_comments", "smf_pm_attachments", "smf_pm_recipients", "smf_pm_rules", "smf_profile_albums", "smf_profile_comments",
    "smf_profile_pictures", "smf_scheduled_tasks", "smf_sessions", "smf_settings", "smf_smileys", "smf_spiders", "smf_subscriptions", "smf_themes"
]

# Tables we want all the data from. Some legacy data that may be of interest is also here.
DUMP_ALL_DATA = [
    # actual forum content
    "smf_polls", "smf_poll_choices", "smf_message_icons", "smf_attachments",

    # pre-wiki (orange glove)
    "comments", "dirs", "docs", "glitchdex", "glitchdex2", "groups", "old", "staff", "statsbydex",

    # felblog (old blogging system)
    "smf_felblog_categories", "smf_felblog_cmnt_log", "smf_felblog_comments", "smf_felblog_content", "smf_felblog_cont_log", "smf_felblog_manager",
    "smf_felblog_ratings", "smf_felblog_settings",

    # arcade system
    "smf_games", "smf_games_category", "smf_games_challenge", "smf_games_favorite", "smf_games_high", "smf_games_rate",
    "smf_games_settings", "smf_games_tournament", "smf_games_tournament_players", "smf_games_tournament_results", "smf_games_tournament_scores",

    # shop system
    "smf_shop_inventory", "smf_shop_items", "smf_shop_shops",

    # project management system (we had this?)
    "smf_projects", "smf_projects_settings", "smf_project_developer", "smf_project_permissions", "smf_project_profiles", "smf_project_timeline",
    "smf_project_trackers", "smf_project_versions", "smf_issues", "smf_issue_attachments", "smf_issue_category", "smf_issue_comments",
    "smf_issue_tags",

    # used for the IRC bridge
    "smf_slartibartfast"
]

# Special tables we need to filter.
CATEGORIES = "smf_categories"
BOARDS = "smf_boards"
TOPICS = "smf_topics"
MESSAGES = "smf_messages"

# Categories we are not interested in archiving.
# `id_cat` in (1, 2)
DO_NOT_ARCHIVE_CATEGORIES = [
    12,  # Epsilon: ?????
    6,   # Sigma: Higher Access
    8    # Omega: Garbage
]

# Boards we are not interested in archiving.
# `id_board` in (1, 2)
DO_NOT_ARCHIVE_BOARDS = [
    40,  # Exclusive Board
    65,  # Requests for Moderatorship
    66,  # Requests for Membership+
    67,  # Requests for Distinguished Membership
    23,  # M.A.S.K. HQ (Staff Board)
    22,  # Admins Only Board
    89,  # Test Board
    86,  # Omega Archives
    51, 37, 79, 26, 47, 44, 99, 93, 119, 96,
    28,  # The Dumpster Out Back
    123  # ?????
]


class Database:
    """Run the ``mysql`` and ``mysqldump`` command-line clients against one database."""

    def __init__(self, host, database, username, password):
        self.host = host
        self.database = database
        self.username = username
        self.password = password

    def dump(self, filename, tables, *args):
        """Dump ``tables`` into ``filename`` using ``mysqldump``.

        Any extra positional ``args`` are passed to ``mysqldump`` as options,
        ahead of the connection options. The command (minus the password
        option) is echoed to stdout before it runs.

        Raises ``subprocess.CalledProcessError`` if ``mysqldump`` exits with a
        non-zero status.
        """
        command = ["mysqldump"] + list(args) + self.auth() + [
            "--result-file={}".format(filename),
            self.database
        ] + list(tables)  # list() so any iterable of table names is accepted

        print(">> {}".format(format_command(command)))
        check_call(command)

    def query(self, query):
        """Run ``query`` through ``mysql --xml`` and return the rows as dicts.

        Each row maps column name to the column's text (see ``row_from_xml``).
        Raises ``subprocess.CalledProcessError`` if ``mysql`` exits with a
        non-zero status.
        """
        command = ["mysql"] + self.auth() + [
            "--xml",
            self.database,
            "-e", query
        ]

        print(">> {}".format(format_command(command)))
        return parse_rows(check_output(command))

    def auth(self):
        """Return the connection options shared by ``mysql`` and ``mysqldump``.

        NOTE(review): the password travels as a command-line argument; it is
        hidden from our own log output by ``format_command`` but may still be
        visible to other local users through the process list.
        """
        return [
            "--user={}".format(self.username),
            "--password={}".format(self.password),
            "--host={}".format(self.host)
        ]


def format_command(command):
    """Return ``command`` as a printable string with any ``--password...`` item removed."""
    return "{}".format([item for item in command if not item.startswith("--password")])


def row_from_xml(element):
    """Build a ``{name attribute: text}`` dict from the children of ``element``.

    A child without text content maps to ``None``.
    """
    return {child.attrib['name']: child.text for child in element}


def parse_rows(xml_output):
    """Turn the XML printed by ``mysql --xml`` into a list of row dicts.

    Empty (or whitespace-only) output yields an empty list instead of an XML
    parse error.
    """
    xml_output = xml_output.strip()
    if not xml_output:
        return []
    return [row_from_xml(element) for element in ElementTree.fromstring(xml_output)]


def where_not_in(column, ids):
    """Return the ``mysqldump`` option excluding rows whose ``column`` is in ``ids``.

    The result is a list so it can be splatted into ``Database.dump``. It is
    empty when ``ids`` is empty, because ``NOT column in ()`` is not valid SQL
    and there is nothing to exclude anyway.
    """
    id_list = ",".join(str(item) for item in ids)
    if not id_list:
        return []
    return ["--where=NOT {} in ({})".format(column, id_list)]


def parse_args(argv=None):
    """Parse the command-line options (``sys.argv`` when ``argv`` is ``None``)."""
    parser = argparse.ArgumentParser(description="Forum scraper")
    parser.add_argument(
        "--host",
        dest="host",
        default="127.0.0.1",
        help="Database host"
    )
    parser.add_argument(
        "--db",
        dest="db",
        default="glitchcity",
        help="Database name"
    )
    parser.add_argument(
        "--username",
        dest="username",
        default="glitchcity",
        help="Database username"
    )
    parser.add_argument(
        "--password",
        dest="password",
        required=True,
        help="Database password"
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Write structure.sql, misc_data.sql, categories.sql, boards.sql and threads.sql."""
    args = parse_args(argv)
    database = Database(args.host, args.db, args.username, args.password)

    # Select which topics we DON'T want, based on the board ids we don't want.
    # Skip the lookup entirely when no boards are excluded: "IN ()" is invalid SQL.
    do_not_archive_thread_ids = []
    if DO_NOT_ARCHIVE_BOARDS:
        excluded_topics = database.query("SELECT id_topic FROM {} WHERE id_board IN ({})".format(
            TOPICS, ",".join(str(board_id) for board_id in DO_NOT_ARCHIVE_BOARDS)
        ))
        do_not_archive_thread_ids = [row['id_topic'] for row in excluded_topics]

    database.dump("structure.sql", DUMP_STRUCTURE_ONLY + DUMP_ALL_DATA + [CATEGORIES, BOARDS, TOPICS, MESSAGES], "--no-data")
    database.dump("misc_data.sql", DUMP_ALL_DATA, "--no-create-info")

    database.dump("categories.sql", [CATEGORIES], *where_not_in("id_cat", DO_NOT_ARCHIVE_CATEGORIES), "--no-create-info")
    database.dump("boards.sql", [BOARDS], *where_not_in("id_board", DO_NOT_ARCHIVE_BOARDS), "--no-create-info")
    # NOTE(review): every excluded topic id ends up in a single command-line
    # argument; with a very large number of excluded topics this could exceed
    # the operating system's argument-length limit.
    database.dump("threads.sql", [TOPICS, MESSAGES], *where_not_in("id_topic", do_not_archive_thread_ids), "--no-create-info")


if __name__ == "__main__":
    main()