# Listing metadata: 229 lines, 8.2 KiB, Python.
from xml.etree import ElementTree
|
|
|
|
import mwparserfromhell
|
|
from mwparserfromhell.nodes import Wikilink, Comment, ExternalLink, Heading, Tag, Template, Text
|
|
|
|
# XML namespace of the export, written in ElementTree's "{uri}" prefix form so
# it can be glued directly in front of a local element name.
NAMESPACE = "{http://www.mediawiki.org/xml/export-0.10/}"

# Namespace-qualified element names, as they appear in Element.tag.
PAGE_TAG = NAMESPACE + "page"
ID_TAG = NAMESPACE + "id"
TITLE_TAG = NAMESPACE + "title"
REVISION_TAG = NAMESPACE + "revision"
NS_TAG = NAMESPACE + "ns"
REDIRECT_TAG = NAMESPACE + "redirect"

TEXT_TAG = NAMESPACE + "text"
FORMAT_TAG = NAMESPACE + "format"
MODEL_TAG = NAMESPACE + "model"
TIMESTAMP_TAG = NAMESPACE + "timestamp"
COMMENT_TAG = NAMESPACE + "comment"
CONTRIBUTOR_TAG = NAMESPACE + "contributor"

USERNAME_TAG = NAMESPACE + "username"
|
|
|
|
# Symbolic names for namespace numbers (presumably the values found in a
# page's <ns> element -- confirm against callers).
NAMESPACES = {"MAIN": 0, "TEMPLATE": 10}

# Link-title prefixes that point at other wikis. Each value is a URL pattern
# whose "{}" is filled with the part of the title after the prefix.
INTERWIKI_NAMESPACES = {
    "bp:": "https://bulbapedia.bulbagarden.net/wiki/{}",
    "wikipedia:": "https://en.wikipedia.org/wiki/{}",
}

# Title prefixes that mark a wikilink as a file/image reference.
FILE_NAMESPACES = ["File:", "Image:"]

# Prefix of category links, and the colon-escaped variant of the same prefix.
CATEGORY_NAMESPACE = "Category:"
CATEGORY_LINK_NAMESPACE = ":" + CATEGORY_NAMESPACE
|
|
|
|
class Wiki:
    """Handle on a MediaWiki XML export stored at a given path."""

    def __init__(self, xml_path):
        """Remember where the export lives; nothing is read yet.

        Args:
            xml_path: Location handed unchanged to ElementTree.parse().
        """
        self.xml_path = xml_path

    def get_pages(self):
        """Parse the export and return its pages.

        The whole document is parsed as soon as this method is called; the
        Page wrappers themselves are created lazily while iterating.

        Returns:
            A generator yielding one Page per child of the document root
            whose tag equals PAGE_TAG.
        """
        root = ElementTree.parse(self.xml_path).getroot()
        return (Page(node) for node in root if node.tag == PAGE_TAG)
|
|
|
|
class Page:
    """One page element of the export.

    Attributes:
        id: Text of the page's id child, or None when the child is absent.
        namespace: Integer value of the ns child, or None when absent.
        title: Text of the title child, or None when absent.
        redirect: The 'title' attribute of the redirect child, or None.
        revisions: Revision objects in the order they appear in the element.
    """

    def __init__(self, element):
        """Populate the page from the direct children of *element*.

        Args:
            element: An ElementTree element whose children are inspected.

        Raises:
            ValueError: If the ns child does not hold an integer.
            KeyError: If a redirect child lacks a 'title' attribute.
        """
        # Every attribute gets a default so that a page missing one of the
        # optional children can still be inspected without AttributeError.
        self.id = None
        self.namespace = None
        self.title = None
        self.redirect = None
        self.revisions = []

        for child in element:
            tag = child.tag
            if tag == ID_TAG:
                self.id = child.text
            elif tag == NS_TAG:
                self.namespace = int(child.text)
            elif tag == TITLE_TAG:
                self.title = child.text
            elif tag == REVISION_TAG:
                self.revisions.append(Revision(child))
            elif tag == REDIRECT_TAG:
                self.redirect = child.attrib['title']

    def get_latest(self):
        """Return the most recent revision of the page.

        Revisions are compared by their timestamp string (ISO 8601 strings
        sort chronologically). A revision without a timestamp sorts first,
        and ties resolve to the earliest entry in the list, so a page whose
        revisions carry no timestamps still yields its first revision.

        Raises:
            IndexError: If the page has no revisions.
        """
        if not self.revisions:
            raise IndexError("page has no revisions")

        # A history export may list several revisions, and the first one in
        # the file is not necessarily the newest.
        return max(
            self.revisions,
            key=lambda revision: getattr(revision, "timestamp", None) or "",
        )
|
|
|
|
class Revision:
    """One revision element of a page.

    Attributes:
        id: Text of the id child, or None when absent.
        text: Text of the text child, or None when absent or empty.
        contributor: Contributor built from the contributor child, or None.
        timestamp: Text of the timestamp child, or None when absent.
        model: Text of the model child, or None when absent.
        comment: Text of the comment child, or None when absent.
    """

    def __init__(self, element):
        """Populate the revision from the direct children of *element*.

        Args:
            element: An ElementTree element whose children are inspected.
        """
        # Defaults first: not every child is guaranteed to be present, and
        # readers of these attributes should get None rather than an
        # AttributeError.
        self.id = None
        self.text = None
        self.contributor = None
        self.timestamp = None
        self.model = None
        self.comment = None

        for child in element:
            tag = child.tag
            if tag == ID_TAG:
                self.id = child.text
            elif tag == TEXT_TAG:
                self.text = child.text
            elif tag == CONTRIBUTOR_TAG:
                self.contributor = Contributor(child)
            elif tag == TIMESTAMP_TAG:
                self.timestamp = child.text
            elif tag == MODEL_TAG:
                self.model = child.text
            elif tag == COMMENT_TAG:
                self.comment = child.text
|
|
|
|
class Contributor:
    """Author information attached to a revision.

    Attributes:
        id: Text of the id child, or None when absent.
        username: Text of the username child, or None when absent.
        ip: Text of the ip child, or None when absent. The export schema uses
            this child instead of username/id for anonymous contributors.
    """

    def __init__(self, element):
        """Populate the contributor from the direct children of *element*.

        Args:
            element: An ElementTree element whose children are inspected.
        """
        # Defaults keep attribute access safe when a child is missing, which
        # is the normal case for anonymous edits (no username, no id).
        self.id = None
        self.username = None
        self.ip = None

        for child in element:
            tag = child.tag
            if tag == ID_TAG:
                self.id = child.text
            elif tag == USERNAME_TAG:
                self.username = child.text
            elif tag == "{}ip".format(NAMESPACE):
                self.ip = child.text
|
|
|
|
class Renderer:
    """Expands templates in wikitext and renders the result to HTML.

    Attributes:
        templates: Mapping of template name to a callable invoked as
            ``template(inclusion, *args, **kwargs)``.
        linker: Object used to translate link titles into URLs/paths.
    """

    def __init__(self, templates=None, linker=None):
        """Create a renderer.

        Args:
            templates: Mapping of template name to callable. Defaults to a
                fresh empty dict per instance (a shared ``{}`` default would
                leak registrations between renderers).
            linker: Link translator; a default Linker is created when omitted.
        """
        self.templates = {} if templates is None else templates
        self.linker = linker if linker else Linker()

    def render(self, wikitext, base="", *args, **kwargs):
        """Render *wikitext* to HTML and collect its categories.

        Templates are transcluded first; category links are then removed from
        the text and listed in a trailing "Categories" section instead.

        Args:
            wikitext: Wikitext to render (anything mwparserfromhell.parse
                accepts; a Wikicode object is modified in place).
            base: Prefix put in front of internal link targets.
            *args: Forwarded to every template callable.
            **kwargs: Forwarded to every template callable.

        Returns:
            A ``(html, categories)`` tuple, where *categories* is the list of
            category names with the "Category:" prefix stripped.
        """
        wikitext = self.transclude_templates(wikitext, *args, **kwargs)

        # Parse out categories. The links are collected into a list up front
        # so that removing them does not disturb the traversal.
        categories = []
        for link in wikitext.filter_wikilinks():
            if not link.title.startswith(CATEGORY_NAMESPACE):
                continue

            wikitext.remove(link)
            categories.append(link.title[len(CATEGORY_NAMESPACE):])

        # `render` here is the module-level function, not this method.
        rendered = [render(wikitext, base, self.linker)]
        if categories:
            rendered.append('<h2>Categories</h2><ul class="categories">')
            for category in categories:
                rendered.append('<li><a href="{}Category:{}">{}</a></li>'.format(
                    base,
                    self.linker.translate_page_title(category),
                    category
                ))
            rendered.append("</ul>")

        return ("".join(rendered), categories)

    def transclude_templates(self, wikitext, *args, **kwargs):
        """Replace every template inclusion in *wikitext* by its expansion.

        Known templates are called with the inclusion node plus the extra
        arguments; unknown ones are replaced by a marker span. Expansions are
        not themselves re-scanned for templates.

        Args:
            wikitext: Wikitext to process (parsed with mwparserfromhell).
            *args: Forwarded to the template callable.
            **kwargs: Forwarded to the template callable.

        Returns:
            The parsed Wikicode with inclusions replaced.
        """
        wikitext = mwparserfromhell.parse(wikitext)

        # Snapshot the inclusions first: the tree is edited inside the loop.
        for inclusion in wikitext.filter_templates():
            template = self._find_template(str(inclusion.name))
            if template:
                result = template(inclusion, *args, **kwargs)
            else:
                result = "<span class='unknown-template'>Template:{0}</span>".format(inclusion.name)

            try:
                wikitext.replace(inclusion, result)
            except ValueError:
                # Best effort: an inclusion nested inside one that was already
                # replaced is no longer part of the tree.
                pass

        return wikitext

    def _find_template(self, name):
        """Return the template registered for *name*, or None.

        Tries the name as written, then with its first letter upper-cased,
        then the same two forms with surrounding whitespace removed (an
        inclusion spread over several lines ends its name with a newline).
        An empty name simply finds nothing.
        """
        missing = object()
        stripped = name.strip()
        candidates = (
            name,
            name[:1].upper() + name[1:],
            stripped,
            stripped[:1].upper() + stripped[1:],
        )
        for key in candidates:
            found = self.templates.get(key, missing)
            if found is not missing:
                return found
        return None
|
|
|
|
def render(wikitext, base="", linker=None):
    """Render parsed wikitext to an HTML string.

    Only the top-level nodes of *wikitext* are visited; nested content is
    rendered through recursive calls. Node types other than wikilinks,
    external links, tags, headings and text are dropped from the output.

    Args:
        wikitext: A parsed Wikicode object, or None (rendered as "").
        base: Prefix put in front of internal link and image targets.
        linker: Link translator; a default Linker is created when omitted.

    Returns:
        The HTML, stripped of surrounding whitespace, with every blank line
        ("\\n\\n") turned into "<br /><br />".
    """
    # Some nodes have no content at all (e.g. a self-closing tag in some
    # mwparserfromhell versions); treat that as empty rather than crashing.
    if wikitext is None:
        return ""

    if not linker:
        linker = Linker()

    rendered = []
    for node in wikitext.ifilter(False):
        # node types:
        # https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.text
        if isinstance(node, Wikilink):
            image_name = linker.translate_image_title(node.title)
            if image_name:
                # The link text is deliberately not rendered: the image
                # markup has no slot for it.
                rendered.append('<img src="{}{}" />'.format(base, image_name))
            else:
                url = linker.translate_interwiki_title(node.title)
                if not url:
                    url = "{}{}".format(base, linker.translate_page_title(node.title))

                rendered.append('<a href="{}">{}</a>'.format(
                    url,
                    render(node.text if node.text else node.title, base, linker)
                ))
        elif isinstance(node, ExternalLink):
            # Pass base/linker along so links nested in the label resolve
            # the same way as everywhere else.
            rendered.append('<a href="{}">{}</a>'.format(
                node.url,
                render(node.title if node.title else node.url, base, linker)
            ))
        elif isinstance(node, Tag):
            tag_name = render(node.tag, base, linker)
            if node.self_closing:
                # "<br></br>" would be read by browsers as two breaks.
                rendered.append("<{} />".format(tag_name))
            else:
                rendered.append("<{}>{}</{}>".format(
                    tag_name,
                    render(node.contents, base, linker),
                    tag_name
                ))
        elif isinstance(node, Heading):
            # The raw title keeps the spaces around it ("== Foo =="); strip
            # them so the anchor id matches the displayed heading.
            rendered.append('<h{} id="{}">{}</h{}>'.format(
                node.level,
                reformat_page_title(str(node.title).strip()),
                render(node.title, base, linker),
                node.level
            ))
        elif isinstance(node, Text):
            rendered.append(node.value)

    return "".join(rendered).strip().replace("\n\n", "<br /><br />")
|
|
|
|
class Linker:
    """Turns wikilink titles into URLs, page paths and image paths."""

    def __init__(self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES):
        """Store the prefix tables used by the translate_* methods.

        Args:
            file_namespaces: Iterable of title prefixes that denote files.
            interwiki_namespaces: Mapping of title prefix to a URL pattern
                containing one "{}" placeholder.
        """
        self.interwiki_namespaces = interwiki_namespaces
        self.file_namespaces = file_namespaces

    def translate_interwiki_title(self, page_title):
        """Return the external URL for an interwiki title.

        The first prefix (in mapping order) that *page_title* starts with
        wins; the remainder of the title is inserted into its URL pattern.
        Returns None when no prefix matches.
        """
        urls = (
            pattern.format(page_title[len(prefix):])
            for prefix, pattern in self.interwiki_namespaces.items()
            if page_title.startswith(prefix)
        )
        return next(urls, None)

    def translate_page_title(self, page_title):
        """Return the link form of an internal page title.

        A leading colon is dropped from ":Category:..." titles before the
        title is passed through reformat_page_title().
        """
        is_category_link = page_title.startswith(CATEGORY_LINK_NAMESPACE)
        target = page_title[1:] if is_category_link else page_title
        return reformat_page_title(target)

    def translate_image_title(self, page_title):
        """Return the reformatted file name for a file title.

        The first matching file prefix is cut off and the rest is passed
        through reformat_page_title(). Returns None when *page_title* does
        not start with any file prefix.
        """
        names = (
            reformat_page_title(page_title[len(prefix):])
            for prefix in self.file_namespaces
            if page_title.startswith(prefix)
        )
        return next(names, None)
|
|
|
|
def reformat_page_title(page_title):
    """Normalise a page title for use in a link.

    The first character is upper-cased; in the remainder every space becomes
    an underscore. A falsy title (None, "") yields the empty string.
    """
    if page_title:
        head = page_title[0]
        tail = page_title[1:]
        return "{}{}".format(head.upper(), tail.replace(' ', '_'))

    return ""
|
|
|
|
# NOTE: this class reuses the name `Template` that the file also imports from
# mwparserfromhell.nodes; from here on the name refers to this class.
class Template:
    """A wiki template that can be expanded for a concrete inclusion.

    Attributes:
        wikicode: Parsed template source with <noinclude> sections removed.
    """

    def __init__(self, wikicode):
        """Parse the template source and drop its <noinclude> sections.

        Args:
            wikicode: Template source (anything mwparserfromhell.parse
                accepts).
        """
        self.wikicode = mwparserfromhell.parse(wikicode)

        # Collect the tags before editing so the traversal is not affected.
        for tag in self.wikicode.filter_tags():
            if tag.tag == "noinclude":
                try:
                    self.wikicode.remove(tag)
                except ValueError:
                    # Nested inside a <noinclude> that is already gone.
                    pass

    def __call__(self, inclusion, *args, **kwargs):
        """Expand the template for one inclusion.

        Each ``{{{argument}}}`` is replaced by the value of the matching
        parameter of *inclusion*; failing that by the argument's default;
        failing that by the argument's name.

        Args:
            inclusion: The template node found in the including page; must
                offer ``has(name)`` and ``get(name)``.
            *args: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A fresh Wikicode object holding the expansion.
        """
        # parse() hands a Wikicode argument back unchanged, so go through
        # str() to obtain an independent copy; otherwise the substitutions
        # below would permanently alter the stored template.
        parsed_wikicode = mwparserfromhell.parse(str(self.wikicode))

        for argument in parsed_wikicode.filter_arguments():
            if inclusion.has(argument.name):
                # get() returns the whole parameter ("key=value" for named
                # parameters); only its value belongs in the output.
                value = str(inclusion.get(argument.name).value)
            elif argument.default is not None:
                # An explicitly empty default ("{{{x|}}}") expands to nothing.
                value = argument.default
            else:
                value = argument.name

            try:
                parsed_wikicode.replace(argument, value)
            except ValueError:
                # Best effort: an argument nested inside one that was already
                # replaced is no longer part of the tree.
                pass

        return parsed_wikicode
|