epilogue/wiki/find_pages
Adrian Malacoda dc0191a04a Rudimentary support for building wiki archives. The content is dumped to html but the wikitext isn't parsed yet.
mwparserfromhell is used for parsing wikitext but it has no support for rendering to HTML so we'll have to build it manually.
2020-08-11 10:44:06 -05:00

54 lines
2.1 KiB
Python
Executable File

#!/usr/bin/env python3
import json
from datetime import date
from urllib.parse import quote, urlencode
from urllib.request import urlopen
# Base address of the wiki; passed as the `url` argument when querying.
GCL_URL = "https://glitchcity.info/"

# Path and query string appended to the base address for each request.
# The "{}" placeholder receives the (URL-ready) category title.
API_ENDPOINT = (
    "/w/api.php"
    "?action=query"
    "&generator=categorymembers"
    "&gcmtitle=Category:{}"
    "&prop=info"
    "&gcmlimit=100"
    "&format=json"
)

# Category titles (without the "Category:" prefix) whose members are listed.
CATEGORIES = [
    # glitches
    "Generation I glitches",
    "Generation II glitches",
    "Generation III glitches",
    "Generation IV glitches",
    # glitch Pokémon
    "Generation I glitch Pokémon",
    "Generation II glitch Pokémon",
    "Generation III glitch Pokémon",
    "Generation IV glitch Pokémon",
    # glitch moves
    "Pokémon Red and Blue glitch moves",
    "Pokémon Yellow glitch moves",
    "Generation II glitch moves",
    # glitch items
    "Generation I glitch items",
    "Generation II glitch items",
    "Generation IV glitch items",
    # glitch Trainers
    "Generation I glitch Trainers",
    "Generation II glitch Trainers",
]

# Cutoff used by filter_page: pages touched before this date are excluded.
FILTER_MODIFIED_SINCE_THEN = date(2020, 3, 31)
def get_pages_for_category(url, category):
    """Fetch every page of a wiki category through the MediaWiki query API.

    The request is repeated, feeding back the continuation values from each
    response, until a response arrives without a "continue" object.

    Args:
        url: Base URL of the wiki. A trailing slash is tolerated.
        category: Category title without the "Category:" prefix. Spaces are
            turned into underscores and the result is percent-encoded.

    Returns:
        A dict holding the merged entries of every response's
        ``query.pages`` object, keyed as in the API response. Empty when no
        response contained any pages.

    Raises:
        urllib.error.URLError: If a request fails.
        json.JSONDecodeError: If a response body is not valid JSON.
    """
    # Percent-encode as UTF-8 so that any non-ASCII or reserved character is
    # safe inside the query string, not only "é".
    category = quote(category.replace(" ", "_"), safe="")
    # API_ENDPOINT begins with "/", so strip trailing slashes from the base to
    # avoid requesting ".../" + "/w/api.php".
    base_api_url = url.rstrip("/") + API_ENDPOINT.format(category)

    pages = {}
    continue_params = {}
    while True:
        api_url = base_api_url
        if continue_params:
            # Send back the whole "continue" object, properly encoded; the
            # tokens may contain characters such as "|".
            api_url += "&" + urlencode(continue_params)
        with urlopen(api_url) as result:
            result_object = json.loads(result.read())
        # A response may lack "query"/"pages" (e.g. nothing matched), so fall
        # back to an empty mapping instead of raising KeyError.
        pages.update(result_object.get("query", {}).get("pages", {}))
        if "continue" not in result_object:
            break
        continue_params = result_object["continue"]
    return pages
def filter_page(page):
    """Tell whether a page was touched on or after the cutoff date.

    Only the calendar-date part of ``page['touched']`` (the text before the
    first "T") is used; it is compared with FILTER_MODIFIED_SINCE_THEN.

    Args:
        page: Mapping with a 'touched' entry whose date part is written as
            dash-separated year, month and day numbers.

    Returns:
        True when the touched date is not earlier than the cutoff.
    """
    date_text = page['touched'].split("T")[0]
    touched_on = date(*map(int, date_text.split("-")))
    return touched_on >= FILTER_MODIFIED_SINCE_THEN
# Collect the pages of every configured category into a single mapping;
# entries that share a key across categories are stored only once.
all_pages = {}
for category in CATEGORIES:
    category_pages = get_pages_for_category(GCL_URL, category)
    found_count = len(category_pages)
    print("--> Found {} total pages in category {}".format(found_count, category))
    all_pages.update(category_pages)

total_count = len(all_pages)
print("----> {} total pages to consider".format(total_count))

# Keep only the pages accepted by filter_page, then list their titles.
filtered_pages = list(filter(filter_page, all_pages.values()))
print("----> {} filtered pages".format(len(filtered_pages)))

for page in filtered_pages:
    title = page['title']
    print(title)