Improved handling of categories and refactored the Linker logic into a class. Also use ./ as the default base path so that Category: links work correctly.
This commit is contained in:
parent
77b160a35b
commit
2aa1a7cf47
@ -9,7 +9,7 @@ import chevron
|
|||||||
import bbcode
|
import bbcode
|
||||||
import html
|
import html
|
||||||
|
|
||||||
from .wiki import Template, Renderer, reformat_page_title, translate_page_title, NAMESPACES as WIKI_NAMESPACES
|
from .wiki import Template, Renderer, Linker, reformat_page_title, NAMESPACES as WIKI_NAMESPACES
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger("ArchiveGenerator")
|
logger = logging.getLogger("ArchiveGenerator")
|
||||||
@ -61,14 +61,15 @@ class ArchiveGenerator():
|
|||||||
|
|
||||||
categories = {}
|
categories = {}
|
||||||
templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']])
|
templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']])
|
||||||
wikitext_renderer = Renderer(templates, DEXES)
|
linker = Linker(directory_names=DEXES)
|
||||||
|
wikitext_renderer = Renderer(templates, linker)
|
||||||
for page in wiki.get_pages():
|
for page in wiki.get_pages():
|
||||||
try:
|
try:
|
||||||
if page.namespace != WIKI_NAMESPACES['MAIN']:
|
if page.namespace != WIKI_NAMESPACES['MAIN']:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
page_out = translate_page_title(page.title, DEXES)
|
page_out = linker.translate_page_title(page.title)
|
||||||
base = ""
|
base = "./"
|
||||||
if "/" in page_out:
|
if "/" in page_out:
|
||||||
base = "../" * page_out.count("/")
|
base = "../" * page_out.count("/")
|
||||||
try:
|
try:
|
||||||
@ -78,7 +79,7 @@ class ArchiveGenerator():
|
|||||||
if page.redirect:
|
if page.redirect:
|
||||||
logger.info("Archiving redirect page (%s -> %s) to %s", page.title, page.redirect, page_out)
|
logger.info("Archiving redirect page (%s -> %s) to %s", page.title, page.redirect, page_out)
|
||||||
renderer.render_template_to_file("redirect", page_out, {
|
renderer.render_template_to_file("redirect", page_out, {
|
||||||
"target": f"{base}{translate_page_title(page.redirect, DEXES)}"
|
"target": f"{base}{linker.translate_page_title(page.redirect)}"
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
logger.info("Archiving page %s to %s", page.title, page_out)
|
logger.info("Archiving page %s to %s", page.title, page_out)
|
||||||
|
@ -32,6 +32,7 @@ INTERWIKI_NAMESPACES = {
|
|||||||
|
|
||||||
FILE_NAMESPACES = ["File:", "Image:"]
|
FILE_NAMESPACES = ["File:", "Image:"]
|
||||||
CATEGORY_NAMESPACE = "Category:"
|
CATEGORY_NAMESPACE = "Category:"
|
||||||
|
CATEGORY_LINK_NAMESPACE = f":{CATEGORY_NAMESPACE}"
|
||||||
|
|
||||||
class Wiki():
|
class Wiki():
|
||||||
def __init__ (self, xml_path):
|
def __init__ (self, xml_path):
|
||||||
@ -85,9 +86,9 @@ class Contributor():
|
|||||||
self.username = child.text
|
self.username = child.text
|
||||||
|
|
||||||
class Renderer():
|
class Renderer():
|
||||||
def __init__ (self, templates={}, directory_names=[]):
|
def __init__ (self, templates={}, linker=None):
|
||||||
self.templates = templates
|
self.templates = templates
|
||||||
self.directory_names = directory_names
|
self.linker = linker if linker else Linker()
|
||||||
|
|
||||||
def render (self, wikitext, base="", *args, **kwargs):
|
def render (self, wikitext, base="", *args, **kwargs):
|
||||||
categories = []
|
categories = []
|
||||||
@ -101,7 +102,7 @@ class Renderer():
|
|||||||
wikitext.remove(link)
|
wikitext.remove(link)
|
||||||
categories.append(link.title[len(CATEGORY_NAMESPACE):])
|
categories.append(link.title[len(CATEGORY_NAMESPACE):])
|
||||||
|
|
||||||
rendered = [render(wikitext, base, self.directory_names)]
|
rendered = [render(wikitext, base, self.linker)]
|
||||||
if categories:
|
if categories:
|
||||||
rendered.append('<h2>Categories</h2><ul class="categories">')
|
rendered.append('<h2>Categories</h2><ul class="categories">')
|
||||||
for category in categories:
|
for category in categories:
|
||||||
@ -130,28 +131,32 @@ class Renderer():
|
|||||||
except ValueError: pass
|
except ValueError: pass
|
||||||
return wikitext
|
return wikitext
|
||||||
|
|
||||||
def render (wikitext, base="", directory_names=[]):
|
def render (wikitext, base="", linker=None):
|
||||||
rendered = []
|
rendered = []
|
||||||
|
|
||||||
|
if not linker:
|
||||||
|
linker = Linker()
|
||||||
|
|
||||||
for node in wikitext.ifilter(False):
|
for node in wikitext.ifilter(False):
|
||||||
# node types:
|
# node types:
|
||||||
# https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.text
|
# https://mwparserfromhell.readthedocs.io/en/latest/api/mwparserfromhell.nodes.html#module-mwparserfromhell.nodes.text
|
||||||
node_type = type(node)
|
node_type = type(node)
|
||||||
if node_type is Wikilink:
|
if node_type is Wikilink:
|
||||||
image_name = translate_image_title(node.title)
|
image_name = linker.translate_image_title(node.title)
|
||||||
if image_name:
|
if image_name:
|
||||||
rendered.append('<img src="{}{}" />'.format(
|
rendered.append('<img src="{}{}" />'.format(
|
||||||
base,
|
base,
|
||||||
image_name,
|
image_name,
|
||||||
render(mwparserfromhell.parse(node.text), base, directory_names)
|
render(mwparserfromhell.parse(node.text), base, linker)
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
url = translate_interwiki_title(node.title)
|
url = linker.translate_interwiki_title(node.title)
|
||||||
if not url:
|
if not url:
|
||||||
url = f"{base}{translate_page_title(node.title, directory_names)}"
|
url = f"{base}{linker.translate_page_title(node.title)}"
|
||||||
|
|
||||||
rendered.append('<a href="{}">{}</a>'.format(
|
rendered.append('<a href="{}">{}</a>'.format(
|
||||||
url,
|
url,
|
||||||
render(node.text if node.text else node.title, base, directory_names)
|
render(node.text if node.text else node.title, base, linker)
|
||||||
))
|
))
|
||||||
elif node_type is ExternalLink:
|
elif node_type is ExternalLink:
|
||||||
rendered.append('<a href="{}">{}</a>'.format(
|
rendered.append('<a href="{}">{}</a>'.format(
|
||||||
@ -161,13 +166,13 @@ def render (wikitext, base="", directory_names=[]):
|
|||||||
elif node_type is Tag:
|
elif node_type is Tag:
|
||||||
rendered.append("<{}>{}</{}>".format(
|
rendered.append("<{}>{}</{}>".format(
|
||||||
render(node.tag),
|
render(node.tag),
|
||||||
render(node.contents, base, directory_names),
|
render(node.contents, base, linker),
|
||||||
render(node.tag)
|
render(node.tag)
|
||||||
))
|
))
|
||||||
elif node_type is Heading:
|
elif node_type is Heading:
|
||||||
rendered.append("<h{}>{}</h{}>".format(
|
rendered.append("<h{}>{}</h{}>".format(
|
||||||
node.level,
|
node.level,
|
||||||
render(node.title, base, directory_names),
|
render(node.title, base, linker),
|
||||||
node.level
|
node.level
|
||||||
))
|
))
|
||||||
elif node_type is Text:
|
elif node_type is Text:
|
||||||
@ -175,31 +180,40 @@ def render (wikitext, base="", directory_names=[]):
|
|||||||
|
|
||||||
return "".join(rendered).strip().replace("\n\n", "<br /><br />")
|
return "".join(rendered).strip().replace("\n\n", "<br /><br />")
|
||||||
|
|
||||||
def translate_interwiki_title (page_title):
|
class Linker():
|
||||||
for namespace, url in INTERWIKI_NAMESPACES.items():
|
def __init__ (self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES, directory_names=[]):
|
||||||
if page_title.startswith(namespace):
|
self.file_namespaces = file_namespaces
|
||||||
return url.format(page_title[len(namespace):])
|
self.interwiki_namespaces = interwiki_namespaces
|
||||||
|
self.directory_names = directory_names
|
||||||
|
|
||||||
def translate_page_title (page_title, directory_names=[]):
|
def translate_interwiki_title (self, page_title):
|
||||||
directory_name = ""
|
for namespace, url in self.interwiki_namespaces.items():
|
||||||
for name in directory_names:
|
if page_title.startswith(namespace):
|
||||||
if page_title.startswith(f"{name}/"):
|
return url.format(page_title[len(namespace):])
|
||||||
directory_name = name
|
|
||||||
page_title = page_title[len(directory_name) + 1:]
|
|
||||||
break
|
|
||||||
|
|
||||||
return f"{reformat_page_title(directory_name)}{'/' if directory_name else ''}{reformat_page_title(page_title)}.html"
|
def translate_page_title (self, page_title):
|
||||||
|
if page_title.startswith(CATEGORY_LINK_NAMESPACE):
|
||||||
|
page_title = page_title[1:]
|
||||||
|
|
||||||
|
directory_name = ""
|
||||||
|
for name in self.directory_names:
|
||||||
|
if page_title.startswith(f"{name}/"):
|
||||||
|
directory_name = name
|
||||||
|
page_title = page_title[len(directory_name) + 1:]
|
||||||
|
break
|
||||||
|
|
||||||
def translate_image_title (page_title):
|
return f"{reformat_page_title(directory_name)}{'/' if directory_name else ''}{reformat_page_title(page_title)}.html"
|
||||||
for namespace in FILE_NAMESPACES:
|
|
||||||
if page_title.startswith(namespace):
|
def translate_image_title (self, page_title):
|
||||||
return reformat_page_title(page_title[len(namespace):])
|
for namespace in self.file_namespaces:
|
||||||
|
if page_title.startswith(namespace):
|
||||||
|
return reformat_page_title(page_title[len(namespace):])
|
||||||
|
|
||||||
def reformat_page_title (page_title):
|
def reformat_page_title (page_title):
|
||||||
if not page_title:
|
if not page_title:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return "{}{}".format(page_title[0].upper(), page_title[1:].replace(" ", "_").replace("/", "%2F"))
|
return f"{page_title[0].upper()}{page_title[1:].replace(' ', '_').replace('/', '%2F')}"
|
||||||
|
|
||||||
class Template():
|
class Template():
|
||||||
def __init__ (self, wikicode):
|
def __init__ (self, wikicode):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user