# epilogue/epilogue/wiki.py
#
# 229 lines
# 8.2 KiB
# Python

from xml.etree import ElementTree
import mwparserfromhell
from mwparserfromhell.nodes import Wikilink, Comment, ExternalLink, Heading, Tag, Template, Text
# XML namespace URI, wrapped in braces so it can be prepended directly to a
# local element name to form the qualified tags compared against below.
NAMESPACE = "{http://www.mediawiki.org/xml/export-0.10/}"

# Qualified tags of the page-level elements.
PAGE_TAG = NAMESPACE + "page"
ID_TAG = NAMESPACE + "id"
TITLE_TAG = NAMESPACE + "title"
REVISION_TAG = NAMESPACE + "revision"
NS_TAG = NAMESPACE + "ns"
REDIRECT_TAG = NAMESPACE + "redirect"

# Qualified tags of the revision-level elements.
TEXT_TAG = NAMESPACE + "text"
FORMAT_TAG = NAMESPACE + "format"
MODEL_TAG = NAMESPACE + "model"
TIMESTAMP_TAG = NAMESPACE + "timestamp"
COMMENT_TAG = NAMESPACE + "comment"
CONTRIBUTOR_TAG = NAMESPACE + "contributor"
USERNAME_TAG = NAMESPACE + "username"

# Symbolic names for namespace numbers (presumably the values found in the
# NS_TAG element of a page -- confirm against callers).
NAMESPACES = {"MAIN": 0, "TEMPLATE": 10}

# Link-title prefix -> URL pattern; "{}" receives the rest of the title.
INTERWIKI_NAMESPACES = {
    "bp:": "https://bulbapedia.bulbagarden.net/wiki/{}",
    "wikipedia:": "https://en.wikipedia.org/wiki/{}",
}

# Link-title prefixes that mark a link as an image.
FILE_NAMESPACES = ["File:", "Image:"]

# Prefix of a category assignment, and of a plain link to a category page
# (the same prefix with a leading colon).
CATEGORY_NAMESPACE = "Category:"
CATEGORY_LINK_NAMESPACE = ":" + CATEGORY_NAMESPACE
class Wiki():
    """Handle on an XML export; remembers the source and parses on demand."""

    def __init__(self, xml_path):
        # Nothing is read here; parsing happens in get_pages().
        self.xml_path = xml_path

    def get_pages(self):
        """Parse the export and return a generator of Page objects.

        The document is parsed as soon as this method is called. Only the
        direct children of the root whose tag equals PAGE_TAG are wrapped,
        one by one, as the returned generator is consumed.
        """
        root = ElementTree.parse(self.xml_path).getroot()
        return (Page(node) for node in root if node.tag == PAGE_TAG)
class Page():
    """One page element of the export, unpacked into plain attributes.

    Attributes:
        id: text of the ID_TAG child, or None when the page has none.
        namespace: int parsed from the NS_TAG child, or None when absent.
        title: text of the TITLE_TAG child, or None when absent.
        redirect: ``title`` attribute of the REDIRECT_TAG child, or None.
        revisions: list of Revision objects, in document order.
    """

    def __init__(self, element):
        # Give every attribute a value up front so a page that lacks an
        # element reads as None instead of raising AttributeError later.
        self.id = None
        self.namespace = None
        self.title = None
        self.redirect = None
        self.revisions = []
        for child in element:
            tag = child.tag
            if tag == ID_TAG:
                self.id = child.text
            elif tag == NS_TAG:
                self.namespace = int(child.text)
            elif tag == TITLE_TAG:
                self.title = child.text
            elif tag == REVISION_TAG:
                self.revisions.append(Revision(child))
            elif tag == REDIRECT_TAG:
                # .get() so a redirect element without a title attribute
                # leaves ``redirect`` as None instead of raising KeyError.
                self.redirect = child.attrib.get('title')

    def get_latest(self):
        """Return the revision with the greatest timestamp.

        Timestamps are compared as strings (assumes they are ISO 8601
        strings in a single timezone, which sort chronologically -- confirm
        for your dumps). Revisions without a timestamp sort first, and ties
        resolve to the earliest revision in document order, so a page whose
        revisions carry no timestamps still yields ``revisions[0]``.

        Raises:
            IndexError: if the page has no revisions.
        """
        if not self.revisions:
            raise IndexError("page has no revisions")
        return max(
            self.revisions,
            key=lambda revision: getattr(revision, "timestamp", None) or "",
        )
class Revision():
    """One revision element of a page.

    Attributes (each is None when the corresponding child is absent):
        id: text of the ID_TAG child.
        text: text of the TEXT_TAG child.
        contributor: Contributor built from the CONTRIBUTOR_TAG child.
        timestamp: text of the TIMESTAMP_TAG child.
        model: text of the MODEL_TAG child.
        comment: text of the COMMENT_TAG child.
    """

    def __init__(self, element):
        # Defaults first: not every revision carries every child element,
        # and an unset attribute would otherwise raise AttributeError at
        # the point of use, far from the parsing code.
        self.id = None
        self.text = None
        self.contributor = None
        self.timestamp = None
        self.model = None
        self.comment = None
        for child in element:
            tag = child.tag
            if tag == ID_TAG:
                self.id = child.text
            elif tag == TEXT_TAG:
                self.text = child.text
            elif tag == CONTRIBUTOR_TAG:
                self.contributor = Contributor(child)
            elif tag == TIMESTAMP_TAG:
                self.timestamp = child.text
            elif tag == MODEL_TAG:
                self.model = child.text
            elif tag == COMMENT_TAG:
                self.comment = child.text
class Contributor():
    """Author of a revision.

    Attributes (each is None when the corresponding child is absent):
        id: text of the ID_TAG child.
        username: text of the USERNAME_TAG child.
    """

    def __init__(self, element):
        # Defaults first so a contributor element without an id or a
        # username child does not leave the attribute undefined.
        self.id = None
        self.username = None
        for child in element:
            if child.tag == ID_TAG:
                self.id = child.text
            elif child.tag == USERNAME_TAG:
                self.username = child.text
class Renderer():
    """Expands templates in wikitext and renders the result to HTML.

    Attributes:
        templates: mapping of template name to a callable that is invoked
            as ``template(inclusion, *args, **kwargs)`` and returns the
            replacement for that inclusion.
        linker: object used to translate link titles.
    """

    def __init__(self, templates=None, linker=None):
        # A fresh dict per instance: a ``{}`` default in the signature is
        # created once and would be shared by every Renderer built
        # without an explicit mapping.
        self.templates = templates if templates is not None else {}
        self.linker = linker if linker else Linker()

    def render(self, wikitext, base="", *args, **kwargs):
        """Render wikitext to HTML and collect its categories.

        Args:
            wikitext: the wikitext to render.
            base: prefix for the URLs of internal links.
            *args, **kwargs: forwarded to every template callable.

        Returns:
            A ``(html, categories)`` tuple. Category links are removed from
            the body and listed under a "Categories" heading instead.
        """
        categories = []
        wikitext = self.transclude_templates(wikitext, *args, **kwargs)
        # filter_wikilinks() returns a list, so removing nodes from the
        # tree inside the loop cannot disturb the iteration.
        for link in wikitext.filter_wikilinks():
            if not link.title.startswith(CATEGORY_NAMESPACE):
                continue
            wikitext.remove(link)
            categories.append(link.title[len(CATEGORY_NAMESPACE):])
        # ``render`` here is the module-level function, not this method.
        rendered = [render(wikitext, base, self.linker)]
        if categories:
            rendered.append('<h2>Categories</h2><ul class="categories">')
            for category in categories:
                rendered.append('<li><a href="{}Category:{}">{}</a></li>'.format(
                    base,
                    self.linker.translate_page_title(category),
                    category
                ))
            rendered.append("</ul>")
        return ("".join(rendered), categories)

    def transclude_templates(self, wikitext, *args, **kwargs):
        """Parse ``wikitext`` and replace each template inclusion.

        Known templates are replaced by the result of calling them; unknown
        ones by a ``<span class='unknown-template'>`` placeholder. Returns
        the parsed (and modified) wikicode object.
        """
        wikitext = mwparserfromhell.parse(wikitext)
        for inclusion in wikitext.ifilter_templates():
            template = self._find_template(str(inclusion.name))
            if template:
                result = template(inclusion, *args, **kwargs)
            else:
                result = "<span class='unknown-template'>Template:{0}</span>".format(inclusion.name)
            try:
                wikitext.replace(inclusion, result)
            except ValueError:
                # The inclusion is no longer part of the tree (e.g. it was
                # nested in one that has already been replaced); skip it.
                pass
        return wikitext

    def _find_template(self, name):
        """Look up ``name`` in ``self.templates``; return None if unknown.

        Tries the name exactly as written and with its first character
        upper-cased, then repeats both attempts with surrounding whitespace
        removed so that ``{{ foo }}`` or a name followed by a line break
        still resolves. An empty name never matches.
        """
        for key in (name, name.strip()):
            if not key:
                continue
            for candidate in (key, key[0].upper() + key[1:]):
                template = self.templates.get(candidate)
                if template is not None:
                    return template
        return None
def render(wikitext, base="", linker=None):
    """Render parsed wikitext to an HTML string.

    Only the top-level nodes of ``wikitext`` are visited here; nested
    content is handled by recursive calls, each of which strips its own
    result and turns blank lines into ``<br /><br />``. Node types without
    a branch below (comments, for instance) produce no output.

    Args:
        wikitext: parsed wikicode exposing ``ifilter``, or None.
        base: prefix for the URLs of internal links and images.
        linker: title translator; a default Linker is created when falsy.

    Returns:
        The HTML string; empty when ``wikitext`` is None.
    """
    if wikitext is None:
        # Nothing to render (e.g. a tag without contents).
        return ""
    if not linker:
        linker = Linker()
    rendered = []
    for node in wikitext.ifilter(recursive=False):
        # node types:
        # https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.text
        if isinstance(node, Wikilink):
            image_name = linker.translate_image_title(node.title)
            if image_name:
                # NOTE(review): the link text used to be rendered and passed
                # as a third format argument, which the two-placeholder
                # string silently discarded. The dead computation is gone;
                # decide separately whether it belongs in an alt/caption.
                rendered.append('<img src="{}{}" />'.format(base, image_name))
            else:
                url = linker.translate_interwiki_title(node.title)
                if not url:
                    url = "{}{}".format(base, linker.translate_page_title(node.title))
                rendered.append('<a href="{}">{}</a>'.format(
                    url,
                    render(node.text if node.text else node.title, base, linker)
                ))
        elif isinstance(node, ExternalLink):
            # Pass base/linker down so nested links use the caller's linker
            # rather than a freshly built default one.
            rendered.append('<a href="{}">{}</a>'.format(
                node.url,
                render(node.title if node.title else node.url, base, linker)
            ))
        elif isinstance(node, Tag):
            tag_name = render(node.tag, base, linker)
            rendered.append("<{}>{}</{}>".format(
                tag_name,
                render(node.contents, base, linker),
                tag_name
            ))
        elif isinstance(node, Heading):
            rendered.append('<h{} id="{}">{}</h{}>'.format(
                node.level,
                reformat_page_title(node.title),
                render(node.title, base, linker),
                node.level
            ))
        elif isinstance(node, Text):
            rendered.append(node.value)
    return "".join(rendered).strip().replace("\n\n", "<br /><br />")
class Linker():
    """Translates wiki link titles into URLs, page names and image names."""

    def __init__(self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES):
        self.interwiki_namespaces = interwiki_namespaces
        self.file_namespaces = file_namespaces

    def translate_interwiki_title(self, page_title):
        """Return the external URL for an interwiki title, else None.

        The first configured prefix that ``page_title`` starts with wins;
        the remainder of the title is substituted into its URL pattern.
        """
        urls = (
            pattern.format(page_title[len(prefix):])
            for prefix, pattern in self.interwiki_namespaces.items()
            if page_title.startswith(prefix)
        )
        return next(urls, None)

    def translate_page_title(self, page_title):
        """Return the reformatted page name for an internal link title.

        A title starting with CATEGORY_LINK_NAMESPACE loses its first
        character before being reformatted.
        """
        is_category_link = page_title.startswith(CATEGORY_LINK_NAMESPACE)
        target = page_title[1:] if is_category_link else page_title
        return reformat_page_title(target)

    def translate_image_title(self, page_title):
        """Return the reformatted file name for an image title, else None.

        A title counts as an image when it starts with one of the
        configured file prefixes; the prefix itself is dropped.
        """
        names = (
            reformat_page_title(page_title[len(prefix):])
            for prefix in self.file_namespaces
            if page_title.startswith(prefix)
        )
        return next(names, None)
def reformat_page_title(page_title):
    """Upper-case the first character of a title and underscore its spaces.

    Spaces are replaced only after the first character. A falsy title
    (empty or None) gives the empty string.
    """
    if page_title:
        head, tail = page_title[0], page_title[1:]
        return "{first}{rest}".format(first=head.upper(), rest=tail.replace(' ', '_'))
    return ""
class Template():
    """A wiki template that can be expanded for a given inclusion.

    NOTE: this definition rebinds the module-level name ``Template``,
    shadowing the node class of the same name imported from
    ``mwparserfromhell.nodes``.
    """

    def __init__(self, wikicode):
        """Parse the template source and drop its ``<noinclude>`` tags."""
        self.wikicode = mwparserfromhell.parse(wikicode)
        # filter_tags() returns a list, so removing nodes from the tree
        # inside the loop cannot disturb the iteration.
        for tag in self.wikicode.filter_tags():
            if tag.tag == "noinclude":
                try:
                    self.wikicode.remove(tag)
                except ValueError:
                    # Already gone, e.g. nested in a removed <noinclude>.
                    pass

    def __call__(self, inclusion, *args, **kwargs):
        """Expand the template for one inclusion.

        Every ``{{{argument}}}`` is replaced by the value of the matching
        parameter of ``inclusion``; when the inclusion does not supply it,
        by the argument's default, or failing that by the argument's name.

        Args:
            inclusion: the template node found in the including page.
            *args, **kwargs: accepted for interface compatibility; unused.

        Returns:
            A new wikicode object; the stored template is left untouched.
        """
        # parse() returns a wikicode object unchanged, so parsing
        # self.wikicode directly would substitute into the shared template
        # and break every later expansion. Re-parse its text to get a
        # private copy instead.
        expanded = mwparserfromhell.parse(str(self.wikicode))
        for argument in expanded.filter_arguments():
            if inclusion.has(argument.name):
                # get() returns the parameter object; only its value belongs
                # in the output (the object itself renders as "name=value"
                # for named parameters).
                value = inclusion.get(argument.name).value
            elif argument.default:
                value = argument.default
            else:
                value = argument.name
            try:
                expanded.replace(argument, value)
            except ValueError:
                # The argument is no longer in the tree (e.g. it sat in the
                # default of an argument that has been replaced); skip it.
                pass
        return expanded