Rudimentary support for building wiki archives. The content is dumped to HTML, but the wikitext isn't rendered yet.

mwparserfromhell is used for parsing the wikitext, but it has no support for rendering to HTML, so we'll have to build that part ourselves.
This commit is contained in:
Adrian Kuschelyagi Malacoda 2020-08-11 10:44:06 -05:00
parent 3cb08e2d2f
commit dc0191a04a
8 changed files with 5726 additions and 4 deletions

View File

@ -8,8 +8,11 @@ This repository contains the tickets, scripts, and documentation for the end of
#### `deploy_archives` #### `deploy_archives`
Run this once the archives have been built to tar them up and scp them to the server. Run this once the archives have been built to tar them up and scp them to the server.
#### Wiki Data #### Wiki Data (`wiki` directory)
##### `find_data` ##### `wiki_pages`
Not a script, just a listing of all the pages in the wiki (as of the 27 July 2020 lockdown). Use this and Special:Export to create an XML dump of wiki pages and place it in the `wiki` directory.
##### `find_pages`
Run this locally (it uses the MediaWiki HTTP API). Finds all pages in categories related to Pokemon generations 1 - 4 that have been edited since 31 March 2020. Run this locally (it uses the MediaWiki HTTP API). Finds all pages in categories related to Pokemon generations 1 - 4 that have been edited since 31 March 2020.
#### Forum Data (`forum` directory) #### Forum Data (`forum` directory)

View File

@ -1,9 +1,11 @@
import os import os
from .forum import Forum from .forum import Forum
from .wiki import Wiki
from .archive_generator import ArchiveGenerator from .archive_generator import ArchiveGenerator
BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
FORUM_DATABASE = os.path.join(BASEDIR, "forum", "forum.sqlite") FORUM_DATABASE = os.path.join(BASEDIR, "forum", "forum.sqlite")
WIKI_DIRECTORY = os.path.join(BASEDIR, "wiki")
TEMPLATES_DIR = os.path.join(BASEDIR, "templates") TEMPLATES_DIR = os.path.join(BASEDIR, "templates")
STATIC_DIR = os.path.join(BASEDIR, "static") STATIC_DIR = os.path.join(BASEDIR, "static")
@ -13,5 +15,14 @@ WIKI_ARCHIVES = os.path.join(ARCHIVES_BASEDIR, "wiki")
def main(): def main():
forum = Forum(FORUM_DATABASE) forum = Forum(FORUM_DATABASE)
wiki = None
for entry in os.listdir(WIKI_DIRECTORY):
if entry.endswith(".xml"):
wiki = Wiki(os.path.join(WIKI_DIRECTORY, entry))
generator = ArchiveGenerator(TEMPLATES_DIR, STATIC_DIR) generator = ArchiveGenerator(TEMPLATES_DIR, STATIC_DIR)
generator.generate_forum(forum, FORUM_ARCHIVES) #generator.generate_forum(forum, FORUM_ARCHIVES)
if wiki:
generator.generate_wiki(wiki, WIKI_ARCHIVES)

View File

@ -6,6 +6,9 @@ from datetime import datetime
import chevron import chevron
import bbcode import bbcode
from .wiki import NAMESPACES as WIKI_NAMESPACES
import mwparserfromhell
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator") logger = logging.getLogger("ArchiveGenerator")
@ -29,6 +32,38 @@ class ArchiveGenerator():
self.template_dir = template_dir self.template_dir = template_dir
self.static_dir = static_dir self.static_dir = static_dir
def generate_wiki (self, wiki, out_dir):
logger.info("Archiving wiki to %s", out_dir)
try:
os.makedirs(out_dir)
except FileExistsError: pass
shutil.copyfile(os.path.join(self.static_dir, "style.css"), os.path.join(out_dir, "style.css"))
renderer = TemplateRenderer(self.template_dir, out_dir)
for page in wiki.get_pages():
if page.redirect:
continue
if page.namespace != WIKI_NAMESPACES['MAIN']:
continue
page_out = "{}.html".format(page.title).replace(" ", "_")
base = ""
if "/" in page_out:
base = "../" * page_out.count("/")
try:
os.makedirs(os.path.dirname(os.path.join(out_dir, page_out)))
except FileExistsError: pass
logger.info("Archiving page %s to %s", page.title, page_out)
renderer.render_template_to_file("page", page_out, {
"title": " - {}".format(page.title),
"page": page,
"base": base,
"text": mwparserfromhell.parse(page.get_latest().text)
})
def generate_forum (self, forum, out_dir): def generate_forum (self, forum, out_dir):
logger.info("Archiving forum to %s", out_dir) logger.info("Archiving forum to %s", out_dir)
try: try:

74
epilogue/wiki.py Normal file
View File

@ -0,0 +1,74 @@
from xml.etree import ElementTree
NAMESPACE = "{http://www.mediawiki.org/xml/export-0.10/}"
PAGE_TAG = "{}page".format(NAMESPACE)
ID_TAG = "{}id".format(NAMESPACE)
TITLE_TAG = "{}title".format(NAMESPACE)
REVISION_TAG = "{}revision".format(NAMESPACE)
NS_TAG = "{}ns".format(NAMESPACE)
REDIRECT_TAG = "{}redirect".format(NAMESPACE)
TEXT_TAG = "{}text".format(NAMESPACE)
FORMAT_TAG = "{}format".format(NAMESPACE)
MODEL_TAG = "{}model".format(NAMESPACE)
TIMESTAMP_TAG = "{}timestamp".format(NAMESPACE)
COMMENT_TAG = "{}comment".format(NAMESPACE)
CONTRIBUTOR_TAG = "{}contributor".format(NAMESPACE)
USERNAME_TAG = "{}username".format(NAMESPACE)
NAMESPACES = {
"MAIN": 0,
"TEMPLATE": 10
}
class Wiki():
def __init__ (self, xml_path):
self.xml_path = xml_path
def get_pages (self):
tree = ElementTree.parse(self.xml_path)
return (Page(element) for element in tree.getroot() if element.tag == PAGE_TAG)
class Page():
def __init__ (self, element):
self.redirect = None
self.revisions = []
for child in element:
if child.tag == ID_TAG:
self.id = child.text
elif child.tag == NS_TAG:
self.namespace = int(child.text)
elif child.tag == TITLE_TAG:
self.title = child.text
elif child.tag == REVISION_TAG:
self.revisions.append(Revision(child))
elif child.tag == REDIRECT_TAG:
self.redirect = child.attrib['title']
def get_latest (self):
return self.revisions[0]
class Revision():
def __init__ (self, element):
for child in element:
if child.tag == ID_TAG:
self.id = child.text
elif child.tag == TEXT_TAG:
self.text = child.text
elif child.tag == CONTRIBUTOR_TAG:
self.contributor = Contributor(child)
elif child.tag == TIMESTAMP_TAG:
self.timestamp = child.text
elif child.tag == MODEL_TAG:
self.model = child.text
elif child.tag == COMMENT_TAG:
self.comment = child.text
class Contributor():
def __init__ (self, element):
for child in element:
if child.tag == ID_TAG:
self.id = child.text
elif child.tag == USERNAME_TAG:
self.username = child.text

View File

@ -8,7 +8,7 @@ setup(
description='Tools for exporting and creating archives of Glitch City Labs data', description='Tools for exporting and creating archives of Glitch City Labs data',
author='Adrian Kuschelyagi Malacoda', author='Adrian Kuschelyagi Malacoda',
packages=['epilogue'], packages=['epilogue'],
install_requires=['pysqlite3 >= 0.4.3', 'chevron >= 0.13.1', 'bbcode >= 1.1.0'], install_requires=['pysqlite3 >= 0.4.3', 'chevron >= 0.13.1', 'bbcode >= 1.1.0', 'mwparserfromhell >= 0.5.4'],
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'epilogue = epilogue:main' 'epilogue = epilogue:main'

6
templates/page.mustache Normal file
View File

@ -0,0 +1,6 @@
{{>header}}
<h2>{{page.title}}</h2>
<article>
{{text}}
</article>
{{>footer}}

5593
wiki/wiki_pages Normal file

File diff suppressed because it is too large Load Diff