improvements to template substitution, begin parsing out and dumping category links

This commit is contained in:
Adrian Kuschelyagi Malacoda 2020-08-23 04:48:38 -05:00
parent 43a36ba730
commit df25b09eb7
2 changed files with 115 additions and 60 deletions

View File

@ -2,11 +2,13 @@ import os
import logging import logging
import shutil import shutil
from traceback import print_exc
import chevron import chevron
import bbcode import bbcode
import html import html
from .wiki import Template, Renderer, NAMESPACES as WIKI_NAMESPACES from .wiki import Template, Renderer, reformat_page_title, NAMESPACES as WIKI_NAMESPACES
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator") logger = logging.getLogger("ArchiveGenerator")
@ -48,6 +50,7 @@ class ArchiveGenerator():
"target": "Main_Page.html" "target": "Main_Page.html"
}) })
categories = {}
templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']]) templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']])
wikitext_renderer = Renderer(templates) wikitext_renderer = Renderer(templates)
for page in wiki.get_pages(): for page in wiki.get_pages():
@ -66,18 +69,46 @@ class ArchiveGenerator():
if page.redirect: if page.redirect:
logger.info("Archiving redirect page (%s -> %s) to %s", page.title, page.redirect, page_out) logger.info("Archiving redirect page (%s -> %s) to %s", page.title, page.redirect, page_out)
renderer.render_template_to_file("redirect", page_out, { renderer.render_template_to_file("redirect", page_out, {
"target": "{}{}{}.html".format(base, page.redirect[0].upper(), page.redirect[1:].replace(" ", "_")) "target": f"{base}{reformat_page_title(page.redirect)}.html"
}) })
else: else:
logger.info("Archiving page %s to %s", page.title, page_out) logger.info("Archiving page %s to %s", page.title, page_out)
(rendered, page_categories) = wikitext_renderer.render(page.get_latest().text, page=page)
for category in page_categories:
if not category in categories:
categories[category] = []
categories[category].append({
"url": page_out,
"title": page.title
})
renderer.render_template_to_file("page", page_out, { renderer.render_template_to_file("page", page_out, {
"title": " - {}".format(page.title), "title": " - {}".format(page.title),
"page": page, "page": page,
"base": base, "base": base,
"text": wikitext_renderer.render(page.get_latest().text) "text": rendered
}) })
except Exception as e: except Exception as e:
logger.error("Error encountered when archiving %s: %s", page.title, e) logger.error("Error encountered when archiving %s: %s", page.title, e)
print_exc()
if isinstance(e, ValueError):
raise e
for category, pages in categories.items():
category_out = f"Category:{reformat_page_title(category)}.html"
logger.info("Archiving category %s to %s", category, category_out)
try:
renderer.render_template_to_file("category", category_out, {
"title": f" - {category}",
"category": category,
"pages": pages
})
except Exception as e:
logger.error("Error encountered when archiving %s: %s", category, e)
print_exc()
def generate_forum (self, forum, out_dir): def generate_forum (self, forum, out_dir):
logger.info("Archiving forum to %s", out_dir) logger.info("Archiving forum to %s", out_dir)

View File

@ -88,76 +88,97 @@ class Renderer():
def __init__ (self, templates=None):
    """Create a renderer.

    templates -- optional mapping of template name to a callable template;
                 it is stored as-is on ``self.templates``. When omitted,
                 each renderer gets its own fresh empty dict.
    """
    # A ``{}`` default would be one dict object shared by every Renderer
    # built without arguments; build a new one per instance instead.
    self.templates = {} if templates is None else templates
def render (self, wikitext, *args, **kwargs):
    """Render wikitext to HTML and collect the categories it declares.

    Templates are transcluded first (extra positional/keyword arguments are
    forwarded to ``transclude_templates``). Wikilinks whose title starts
    with CATEGORY_NAMESPACE are then removed from the text and listed in a
    "Categories" block appended after the rendered body.

    Returns a ``(html, categories)`` tuple, where ``categories`` is the
    list of category names with the namespace prefix cut off.
    """
    wikitext = self.transclude_templates(wikitext, *args, **kwargs)

    # parse out categories
    categories = []
    # filter_wikilinks() returns a list. Removing nodes while walking the
    # lazy ifilter_wikilinks() generator can skip the link that follows a
    # removed one when both sit in the same nested node.
    for link in wikitext.filter_wikilinks():
        title = str(link.title)
        if not title.startswith(CATEGORY_NAMESPACE):
            continue

        wikitext.remove(link)
        categories.append(title[len(CATEGORY_NAMESPACE):])

    rendered = [render(wikitext)]
    if categories:
        rendered.append('<h2>Categories</h2><ul class="categories">')
        rendered.extend(
            '<li><a href="Category:{}.html">{}</a></li>'.format(reformat_page_title(category), category)
            for category in categories
        )
        rendered.append("</ul>")

    return ("".join(rendered), categories)
def transclude_templates (self, wikitext, *args, **kwargs):
    """Replace every template inclusion in wikitext with its expansion.

    wikitext is parsed with mwparserfromhell. Each inclusion is looked up
    in ``self.templates`` by name, then by the name with its first letter
    upper-cased. A known template is called with the inclusion plus the
    forwarded ``*args``/``**kwargs``; an unknown one becomes an
    "unknown-template" placeholder span.

    Returns the parsed wikicode with the replacements applied.
    """
    wikitext = mwparserfromhell.parse(wikitext)
    for inclusion in wikitext.ifilter_templates():
        # Inclusions are often written with the name on its own line
        # ("{{Name\n|a=b}}"), so drop surrounding whitespace before lookup.
        template_key = str(inclusion.name).strip()
        template = self.templates.get(template_key)
        if template is None and template_key:
            # guard on template_key: indexing an empty name would raise
            template = self.templates.get(template_key[0].upper() + template_key[1:])

        if template:
            result = template(inclusion, *args, **kwargs)
        else:
            result = "<span class='unknown-template'>Template:{0}</span>".format(inclusion.name)

        # NOTE: the expansion is inserted as-is; it is not passed through
        # transclude_templates again.
        try:
            wikitext.replace(inclusion, result)
        except ValueError:
            # Deliberate best-effort: replace() cannot find the inclusion
            # (presumably because an enclosing node was already replaced),
            # so there is nothing left to substitute.
            continue

    return wikitext
def render (wikitext):
    """Turn parsed wikicode into an HTML string.

    Only the top-level nodes are visited (``ifilter(False)``); nested content
    is handled by recursive calls. Wikilinks, external links, tags, headings
    and plain text are rendered; every other node type produces no output.
    The joined result is stripped and blank lines become ``<br /><br />``.
    """
    pieces = []
    emit = pieces.append
    for node in wikitext.ifilter(False):
        # node types:
        # https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.text
        kind = type(node)
        if kind is Text:
            emit(node.value)
        elif kind is Heading:
            level = node.level
            emit("<h{}>{}</h{}>".format(level, render(node.title), level))
        elif kind is Tag:
            emit("<{}>{}</{}>".format(
                render(node.tag),
                render(node.contents),
                render(node.tag)
            ))
        elif kind is ExternalLink:
            label = node.title if node.title else node.url
            emit('<a href="{}">{}</a>'.format(node.url, render(label)))
        elif kind is Wikilink:
            image_name = translate_image_title(node.title)
            if image_name:
                # The link text is still rendered here, although the format
                # string has a single placeholder and so never emits it.
                emit('<img src="{}" />'.format(
                    image_name,
                    render(mwparserfromhell.parse(node.text))
                ))
            else:
                label = node.text if node.text else node.title
                emit('<a href="{}">{}</a>'.format(
                    translate_page_title(node.title),
                    render(label)
                ))

    html_text = "".join(pieces)
    return html_text.strip().replace("\n\n", "<br /><br />")
def translate_page_title (page_title):
    """Return the link target for a wiki page title.

    A title starting with one of the INTERWIKI_NAMESPACES prefixes is mapped
    through that prefix's URL pattern, filled with the rest of the title.
    Any other title becomes a local ``<reformatted title>.html`` file name.
    """
    interwiki_target = next(
        (
            pattern.format(page_title[len(prefix):])
            for prefix, pattern in INTERWIKI_NAMESPACES.items()
            if page_title.startswith(prefix)
        ),
        None,
    )
    if interwiki_target is not None:
        return interwiki_target

    local_name = reformat_page_title(page_title)
    return "{}.html".format(local_name)
def translate_image_title (page_title):
    """Return the reformatted file name for a file-namespace title.

    The first FILE_NAMESPACES prefix that page_title starts with is cut off
    and the remainder is passed through reformat_page_title. Returns None
    when the title is in none of those namespaces.
    """
    prefix = next(
        (candidate for candidate in FILE_NAMESPACES if page_title.startswith(candidate)),
        None,
    )
    if prefix is None:
        return None

    bare_name = page_title[len(prefix):]
    return reformat_page_title(bare_name)
def reformat_page_title (page_title):
    """Normalise a page title for use in a file name or URL.

    The first character is upper-cased and every space after it becomes an
    underscore. An empty title yields an empty string.
    """
    title = str(page_title)
    # Slicing instead of indexing: title[0] raises IndexError on an empty
    # title, title[:1] is simply "".
    return title[:1].upper() + title[1:].replace(" ", "_")
class Template(): class Template():
def __init__ (self, wikicode): def __init__ (self, wikicode):
@ -166,11 +187,14 @@ class Template():
if tag.tag == "noinclude": if tag.tag == "noinclude":
self.wikicode.remove(tag) self.wikicode.remove(tag)
def __call__ (self, inclusion, *args, **kwargs):
    """Expand this template for one inclusion and return the wikicode.

    Every ``{{{argument}}}`` in the template body is replaced by the value
    the inclusion supplies for it, else by the argument's declared default,
    else by the bare argument name.

    inclusion -- the template node from the including page; it must offer
                 ``has(name)`` and ``get(name)``.
    *args, **kwargs -- accepted so callers can forward extra context; they
                 are not used here.
    """
    # Re-parse from text: mwparserfromhell.parse() returns a Wikicode
    # argument unchanged, so substituting into it directly would rewrite
    # the stored template and leak this inclusion's values into later ones.
    parsed_wikicode = mwparserfromhell.parse(str(self.wikicode))
    # filter_arguments() returns a list, so the replacements below cannot
    # disturb the traversal.
    for argument in parsed_wikicode.filter_arguments():
        if inclusion.has(argument.name):
            # get() returns the whole parameter ("name=value" when named);
            # only its value belongs in the output.
            value = inclusion.get(argument.name).value
        elif argument.default is not None:
            # "is not None" keeps an explicitly empty default ({{{x|}}})
            value = argument.default
        else:
            value = argument.name

        try:
            parsed_wikicode.replace(argument, value)
        except ValueError:
            # Deliberate best-effort: replace() cannot find the argument
            # (presumably it sat inside a node that was already replaced).
            continue

    return parsed_wikicode