From 191e1ebd1345aeb6243a92a1c1ca9c5277286957 Mon Sep 17 00:00:00 2001
From: Adrian Malacoda <malacoda@monarch-pass.net>
Date: Thu, 30 Jul 2020 03:46:02 -0500
Subject: [PATCH] Add find_pages script to determine through the MediaWiki API
 which pages need to be looked at.

---
 find_pages | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100755 find_pages

diff --git a/find_pages b/find_pages
new file mode 100755
index 0000000..1cb0aec
--- /dev/null
+++ b/find_pages
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+from urllib.request import urlopen
+from datetime import date
+import json
+
+GCL_URL = "https://glitchcity.info/"  # wiki base URL; note that it ends with a slash
+API_ENDPOINT = "/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{}&prop=info&gcmlimit=100&format=json"  # {} = category title
+# Category titles (without the "Category:" prefix) whose member pages are fetched.
+CATEGORIES = ["Generation I glitches", "Generation II glitches", "Generation III glitches", "Generation IV glitches"]
+# Cutoff date: filter_page keeps pages whose 'touched' date is on or after this day.
+FILTER_MODIFIED_SINCE_THEN = date(2020, 3, 31)
+
+def get_pages_for_category(url, category):
+    """Return a dict of every page record in the category, following gcmcontinue paging."""
+    from urllib.parse import quote  # function-scope import: the file's import block is untouched
+    # rstrip("/") avoids "//w/api.php" when url ends with "/"; quote() escapes the title.
+    base_url = url.rstrip("/") + API_ENDPOINT.format(quote(category.replace(" ", "_")))
+    pages = {}
+    continue_param = None
+    while True:
+        api_url = base_url
+        if continue_param:
+            # The token is opaque server data, so escape it fully before appending.
+            api_url += "&gcmcontinue=" + quote(continue_param, safe="")
+        with urlopen(api_url) as result:
+            result_object = json.loads(result.read())
+        # Tolerate a response without "query"/"pages" (e.g. a category with no members).
+        pages.update(result_object.get("query", {}).get("pages", {}))
+        if "continue" not in result_object:
+            return pages
+        continue_param = result_object["continue"]["gcmcontinue"]
+
+def filter_page(page):
+    """Return True if the date part of page['touched'] is on or after the cutoff constant."""
+    return date(*map(int, page['touched'].partition("T")[0].split("-"))) >= FILTER_MODIFIED_SINCE_THEN
+
+all_pages = {}  # merged page records from every category; a repeated key is stored once
+for category in CATEGORIES:
+    category_pages = get_pages_for_category(GCL_URL, category)
+    print("--> Found {} total pages in category {}".format(len(category_pages), category))
+    all_pages.update(category_pages)
+# Report the merged total, then keep only the records that pass filter_page.
+print("----> {} total pages to consider".format(len(all_pages)))
+filtered_pages = list(filter(filter_page, all_pages.values()))
+print("----> {} filtered pages".format(len(filtered_pages)))
+# Emit the title of each surviving page, one per line.
+for page in filtered_pages:
+    print(page['title'])
\ No newline at end of file