#!/usr/bin/env python3
"""List wiki pages in selected categories that were touched recently.

For every category in CATEGORIES the script asks the MediaWiki API at
GCL_URL for the category's members, merges the results, and prints the
titles of the pages whose ``touched`` date is on or after
FILTER_MODIFIED_SINCE_THEN.

Provenance: introduced by commit 191e1ebd1345aeb6243a92a1c1ca9c5277286957
(Adrian Malacoda, Thu, 30 Jul 2020 03:46:02 -0500): "Add find_pages script
to determine through the MediaWiki API which pages need to be looked at."
"""
from urllib.parse import quote, urlencode
from urllib.request import urlopen
from datetime import date
import json

GCL_URL = "https://glitchcity.info/"
API_ENDPOINT = "/w/api.php?action=query&generator=categorymembers&gcmtitle=Category:{}&prop=info&gcmlimit=100&format=json"

CATEGORIES = ["Generation I glitches", "Generation II glitches", "Generation III glitches", "Generation IV glitches"]

FILTER_MODIFIED_SINCE_THEN = date(2020, 3, 31)


def build_api_url(url, category, continue_params=None):
    """Return the API URL that lists the members of *category*.

    Args:
        url: Base URL of the wiki; a trailing slash is tolerated.
        category: Category name without the ``Category:`` prefix. Spaces are
            turned into underscores and the result is percent-encoded.
        continue_params: Optional mapping taken from the ``continue`` object
            of a previous response; it is appended to the query string so
            the next batch of results is requested.

    Returns:
        The complete request URL as a string.
    """
    # API_ENDPOINT already starts with "/", so strip the base URL's trailing
    # slash to avoid producing "https://host//w/api.php".
    base = url.rstrip("/")
    # Percent-encode the title so characters such as "&" or "#" cannot break
    # out of the gcmtitle parameter.
    title = quote(category.replace(" ", "_"), safe="")
    api_url = base + API_ENDPOINT.format(title)
    if continue_params:
        api_url += "&" + urlencode(continue_params)
    return api_url


def get_pages_for_category(url, category):
    """Fetch every page of *category*, following API continuation.

    Args:
        url: Base URL of the wiki.
        category: Category name without the ``Category:`` prefix.

    Returns:
        A dict merging the ``query.pages`` objects of all responses. A
        response without a ``query`` object contributes no pages.

    Raises:
        urllib.error.URLError: If a request fails.
        json.JSONDecodeError: If a response body is not valid JSON.
    """
    pages = {}
    continue_params = None
    while True:
        with urlopen(build_api_url(url, category, continue_params)) as result:
            result_object = json.loads(result.read())

        # Presumably the API omits "query" when the generator yields nothing
        # (e.g. an empty category); treat that as "no pages", not a crash.
        pages.update(result_object.get('query', {}).get('pages', {}))

        if "continue" not in result_object:
            break

        # Send the whole continuation object back rather than picking out a
        # single key, so the loop does not depend on which keys are present.
        continue_params = result_object['continue']
    return pages


def filter_page(page):
    """Return True if *page* was touched on or after the cutoff date.

    Args:
        page: A page object from the API; its ``touched`` value is read as
            ``YYYY-MM-DD`` followed by ``T`` and a time part that is ignored.

    Returns:
        bool: Whether the touched date is >= FILTER_MODIFIED_SINCE_THEN.
    """
    day_part = page['touched'].partition("T")[0]
    year, month, day = (int(value) for value in day_part.split("-"))
    return date(year, month, day) >= FILTER_MODIFIED_SINCE_THEN


def main():
    """Collect pages from all categories and print the recently touched ones."""
    all_pages = {}
    for category in CATEGORIES:
        category_pages = get_pages_for_category(GCL_URL, category)
        print("--> Found {} total pages in category {}".format(len(category_pages), category))
        # Pages are keyed by page id, so a page that belongs to several
        # categories is only counted once.
        all_pages.update(category_pages)

    print("----> {} total pages to consider".format(len(all_pages)))
    filtered_pages = [page for page in all_pages.values() if filter_page(page)]
    print("----> {} filtered pages".format(len(filtered_pages)))

    for page in filtered_pages:
        print(page['title'])


if __name__ == "__main__":
    main()