# epilogue/wiki/find_pages

#!/usr/bin/env python3
import json
from datetime import date
from urllib.parse import quote, urlencode
from urllib.request import urlopen

# Base URL of the wiki that is queried (note the trailing slash).
GCL_URL = "https://glitchcity.info/"

# Query-string template appended to the base URL; "{}" receives the
# URL-encoded category title. Pieces are joined by implicit concatenation.
API_ENDPOINT = (
    "/w/api.php"
    "?action=query"
    "&generator=categorymembers"
    "&gcmtitle=Category:{}"
    "&prop=info"
    "&gcmlimit=100"
    "&format=json"
)

# Category titles (without the "Category:" prefix) whose members are listed.
CATEGORIES = [
    # Glitches
    "Generation I glitches",
    "Generation II glitches",
    "Generation III glitches",
    "Generation IV glitches",
    # Glitch Pokémon
    "Generation I glitch Pokémon",
    "Generation II glitch Pokémon",
    "Generation III glitch Pokémon",
    "Generation IV glitch Pokémon",
    # Glitch moves
    "Pokémon Red and Blue glitch moves",
    "Pokémon Yellow glitch moves",
    "Generation II glitch moves",
    # Glitch items
    "Generation I glitch items",
    "Generation II glitch items",
    "Generation IV glitch items",
    # Glitch Trainers
    "Generation I glitch Trainers",
    "Generation II glitch Trainers",
]

# Pages whose 'touched' date is on or after this day pass filter_page().
FILTER_MODIFIED_SINCE_THEN = date(year=2020, month=3, day=31)

def get_pages_for_category(url, category):
    """Fetch every member page of a wiki category via the query API.

    Requests are built from ``API_ENDPOINT`` and repeated, following the
    ``continue`` object of each response, until the API stops returning one.

    Args:
        url: Base URL of the wiki; a trailing slash is tolerated.
        category: Category title without the ``Category:`` prefix, e.g.
            ``"Generation I glitches"``.

    Returns:
        A dict with the merged contents of every response's
        ``query.pages`` object, keyed exactly as the API keyed them.
        Empty when no response carried any pages.

    Raises:
        urllib.error.URLError: If a request cannot be completed.
        ValueError: If a response body is not valid JSON.
    """
    # Percent-encode the full title (UTF-8) rather than special-casing
    # individual characters, so any category name yields a valid URL.
    encoded_category = quote(category.replace(" ", "_"))

    # Drop a trailing slash on the base URL: API_ENDPOINT already starts
    # with "/", and joining them verbatim would produce "//w/api.php".
    base_api_url = url.rstrip("/") + API_ENDPOINT.format(encoded_category)

    pages = {}
    continue_params = None
    while True:
        api_url = base_api_url
        if continue_params:
            # Echo back *all* continuation parameters, properly escaped;
            # the tokens may contain characters such as "|".
            api_url += "&" + urlencode(continue_params)

        with urlopen(api_url) as result:
            result_object = json.load(result)

        # A response may lack "query" entirely (e.g. nothing was generated),
        # so fall back to empty mappings instead of raising KeyError.
        pages.update(result_object.get('query', {}).get('pages', {}))

        continue_params = result_object.get('continue')
        if not continue_params:
            return pages

def filter_page(page, since=None):
    """Tell whether a page was touched on or after a cut-off date.

    Args:
        page: Page-info dict with a ``'touched'`` string whose part before
            the first ``"T"`` is a ``YYYY-MM-DD`` date.
        since: Earliest ``date`` (inclusive) that still passes the filter.
            Defaults to the module-level ``FILTER_MODIFIED_SINCE_THEN``.

    Returns:
        ``True`` if the date part of ``page['touched']`` is ``since`` or
        later, ``False`` otherwise.
    """
    if since is None:
        since = FILTER_MODIFIED_SINCE_THEN

    # Only the calendar date matters; the time-of-day after "T" is ignored.
    date_part = page['touched'].split("T")[0]
    touched = date(*[int(value) for value in date_part.split("-")])
    return touched >= since

# Collect the members of every configured category into one mapping; a page
# that appears in several categories is stored once (later entries overwrite).
all_pages = {}
for category_name in CATEGORIES:
    members = get_pages_for_category(GCL_URL, category_name)
    print("--> Found {} total pages in category {}".format(len(members), category_name))
    all_pages.update(members)

print("----> {} total pages to consider".format(len(all_pages)))

# Keep only the pages accepted by filter_page().
filtered_pages = list(filter(filter_page, all_pages.values()))
print("----> {} filtered pages".format(len(filtered_pages)))

# Emit one title per line.
for entry in filtered_pages:
    print(entry['title'])