Compare commits

..

6 Commits

8 changed files with 87 additions and 51 deletions

View File

@ -1,6 +1,7 @@
import os
import logging
import shutil
import math
from itertools import chain
from traceback import print_exc
@ -9,7 +10,8 @@ import chevron
import bbcode
import html
from .wiki import Template, Renderer, Linker, reformat_page_title, NAMESPACES as WIKI_NAMESPACES
from .forum import DEFAULT_POSTS_PER_PAGE
from .wiki import Template, Renderer, Linker, NAMESPACES as WIKI_NAMESPACES
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator")
@ -22,6 +24,30 @@ DEX_TYPES = [
]
DEXES = list(chain.from_iterable([[f"{dex_type}{language}" for dex_type in DEX_TYPES] for language in DEX_LANGUAGES]))
class ArchiveLinker(Linker):
    """Linker that turns wiki page titles into file names for the archive.

    A title that starts with one of ``directory_names`` followed by ``/``
    keeps that prefix as a real directory component. In the remainder of
    the title every key of ``replacements`` is substituted by its value,
    and ``.html`` is appended.
    """

    def __init__ (self, directory_names=None):
        """Set up the linker.

        directory_names -- title prefixes that are kept as directories in
        the generated link. Defaults to no directories.
        """
        super().__init__()
        # Build the fallback list per instance; a literal ``[]`` default
        # would be one list object shared by every ArchiveLinker.
        self.directory_names = [] if directory_names is None else directory_names
        self.replacements = {
            "/": "+",
            #":": ""
        }

    def translate_page_title (self, page_title):
        """Return the archive-relative ``.html`` file name for ``page_title``.

        The title is first passed through the base class translation, then
        split into an optional directory prefix and the page part.
        """
        page_title = super().translate_page_title(page_title)
        directory_name = ""
        for name in self.directory_names:
            prefix = f"{name}/"
            if page_title.startswith(prefix):
                # Only the first matching directory is honoured.
                directory_name = name
                page_title = page_title[len(prefix):]
                break
        # Applied after the prefix is cut off so the directory separator
        # itself is not rewritten.
        for key, value in self.replacements.items():
            page_title = page_title.replace(key, value)
        separator = "/" if directory_name else ""
        return f"{directory_name}{separator}{page_title}.html"
def prepare_thread (thread):
    """Replace HTML entities in ``thread.subject`` with their characters.

    The thread object is modified in place and also returned.
    """
    unescaped_subject = html.unescape(thread.subject)
    thread.subject = unescaped_subject
    return thread
@ -61,7 +87,7 @@ class ArchiveGenerator():
categories = {}
templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']])
linker = Linker(directory_names=DEXES)
linker = ArchiveLinker(directory_names=DEXES)
wikitext_renderer = Renderer(templates, linker)
for page in wiki.get_pages():
try:
@ -107,7 +133,7 @@ class ArchiveGenerator():
raise e
for category, pages in categories.items():
category_out = f"Category:{reformat_page_title(category)}.html"
category_out = f"Category:{linker.translate_page_title(category)}"
logger.info("Archiving category %s to %s", category, category_out)
try:
@ -166,6 +192,8 @@ class ArchiveGenerator():
"target": "page-0.html"
})
total_pages = math.ceil((thread.num_replies + 1) / DEFAULT_POSTS_PER_PAGE)
page_links = [{"label": page + 1, "link": f"page-{page}.html"} for page in range(total_pages)]
page = 0
while True:
posts = [prepare_post(post) for post in forum.get_posts_in_thread(thread, page)]
@ -180,6 +208,7 @@ class ArchiveGenerator():
"thread": thread,
"page": page,
"next": page + 1,
"page_links": page_links,
"prev": page - 1,
"posts": posts
})

View File

@ -23,6 +23,9 @@ GET_POSTS = """
LIMIT ? OFFSET ?
""".format(PREFIX)
DEFAULT_POSTS_PER_PAGE = 15
DEFAULT_THREADS_PER_PAGE = 2000
def fix_encoding (string):
    """Re-interpret ``string`` by encoding it as Latin-1 and decoding as UTF-8.

    Characters that cannot be encoded as Latin-1 and byte sequences that
    are not valid UTF-8 are dropped silently.
    """
    raw_bytes = string.encode("latin1", errors="ignore")
    return raw_bytes.decode("utf-8", errors="ignore")
@ -50,7 +53,7 @@ class Forum():
cursor.execute(GET_BOARDS)
return [Board(board) for board in cursor.fetchall()]
def get_threads_in_board (self, board, page=0, per_page=2000):
def get_threads_in_board (self, board, page=0, per_page=DEFAULT_THREADS_PER_PAGE):
try:
board = board.id
except ValueError: pass
@ -58,7 +61,7 @@ class Forum():
cursor.execute(GET_THREADS, (board, per_page, page * per_page))
return [Thread(thread) for thread in cursor.fetchall()]
def get_posts_in_thread (self, thread, page=0, per_page=15):
def get_posts_in_thread (self, thread, page=0, per_page=DEFAULT_POSTS_PER_PAGE):
try:
thread = thread.id
except ValueError: pass
@ -89,6 +92,7 @@ class Thread():
self.datetime = datetime.fromtimestamp(row['poster_time'])
self.subject = fix_encoding(row['subject'])
self.poster_name = fix_encoding(row['poster_name'])
self.num_replies = row['num_replies']
class Post():
def __init__ (self, row):

View File

@ -1,39 +1,33 @@
import argparse
from flask import Flask, redirect
from .archive_generator import ArchiveLinker, DEXES
from flask import Flask, redirect, request
app = Flask(__name__)
def is_wiki_directory_name (name):
    """Tell whether ``name`` contains the substring ``Dex`` (case-sensitive)."""
    contains_dex = "Dex" in name
    return contains_dex
def escape_wiki_page_name (page_name):
    """Convert a wiki page name into the escaped form used in archive URLs.

    The first character is upper-cased, spaces after the first character
    become underscores and one trailing ``/`` is removed. If a ``/``
    remains, the part before the first one is the prefix: the separator
    after it stays ``/`` when ``is_wiki_directory_name(prefix)`` is true
    and becomes ``%2F`` otherwise. Every later ``/`` becomes ``%2F``.

    An empty ``page_name`` yields an empty string.
    """
    # Guard first: indexing ``page_name[0]`` would raise IndexError on "".
    if not page_name:
        return ""
    page_name = page_name[0].upper() + page_name[1:].replace(" ", "_")
    if page_name.endswith("/"):
        page_name = page_name[:-1]
    if "/" in page_name:
        prefix, suffix = page_name.split("/", 1)
        suffix = suffix.replace("/", "%2F")
        separator = "/" if is_wiki_directory_name(prefix) else "%2F"
        page_name = prefix + separator + suffix
    return page_name
## Wiki redirector
@app.route("/wiki/<path:path>")
def redirect_wiki (path):
return redirect(make_wiki_url(path))
def make_wiki_url (path):
url = app.args.wiki_archive
if path.endswith("/"):
path = path[:-1]
if not url.endswith("/"):
url = url + "/"
return url + escape_wiki_page_name(path) + ".html"
return url + app.wiki_linker.translate_page_title(path)
## Forum redirector
@app.route('/forums/<path:path>')
def redirect_forums (path):
pass
return redirect(make_forum_url(path))
@app.route("/wiki/<path:path>")
def redirect_wiki (path):
return redirect(make_wiki_url(path))
def make_forum_url (request):
    """Return the string form of ``request``."""
    url = str(request)
    return url
def main ():
parser = argparse.ArgumentParser()
@ -41,4 +35,5 @@ def main ():
parser.add_argument("--forums-archive", help="URL to forums archive")
app.args = parser.parse_args()
app.wiki_linker = ArchiveLinker(directory_names=DEXES)
app.run()

View File

@ -106,9 +106,9 @@ class Renderer():
if categories:
rendered.append('<h2>Categories</h2><ul class="categories">')
for category in categories:
rendered.append('<li><a href="{}Category:{}.html">{}</a></li>'.format(
rendered.append('<li><a href="{}Category:{}">{}</a></li>'.format(
base,
reformat_page_title(category),
self.linker.translate_page_title(category),
category
))
rendered.append("</ul>")
@ -181,10 +181,9 @@ def render (wikitext, base="", linker=None):
return "".join(rendered).strip().replace("\n\n", "<br /><br />")
class Linker():
def __init__ (self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES, directory_names=[]):
def __init__ (self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES):
self.file_namespaces = file_namespaces
self.interwiki_namespaces = interwiki_namespaces
self.directory_names = directory_names
def translate_interwiki_title (self, page_title):
for namespace, url in self.interwiki_namespaces.items():
@ -194,15 +193,8 @@ class Linker():
def translate_page_title (self, page_title):
if page_title.startswith(CATEGORY_LINK_NAMESPACE):
page_title = page_title[1:]
directory_name = ""
for name in self.directory_names:
if page_title.startswith(f"{name}/"):
directory_name = name
page_title = page_title[len(directory_name) + 1:]
break
return f"{reformat_page_title(directory_name)}{'/' if directory_name else ''}{reformat_page_title(page_title)}.html"
return reformat_page_title(page_title)
def translate_image_title (self, page_title):
for namespace in self.file_namespaces:
@ -213,7 +205,7 @@ def reformat_page_title (page_title):
if not page_title:
return ""
return f"{page_title[0].upper()}{page_title[1:].replace(' ', '_').replace('/', '%2F')}"
return f"{page_title[0].upper()}{page_title[1:].replace(' ', '_')}"
class Template():
def __init__ (self, wikicode):

View File

@ -68,6 +68,7 @@ TOPICS_DUMP = "threads.sql"
# Categories we are not interested in archiving.
# `id_cat` in (1, 2)
DO_NOT_ARCHIVE_CATEGORIES = [
7, # Links
12, # Epsilon: ?????
6, # Sigma: Higher Access
8 # Omega: Garbage
@ -76,17 +77,21 @@ DO_NOT_ARCHIVE_CATEGORIES = [
# Boards we are not interested in archiving.
# `id_board` in (1, 2)
DO_NOT_ARCHIVE_BOARDS = [
40, # Exclusive Board
65, # Requests for Moderatorship
66, # Requests for Membership+
67, # Requests for Distinguished Membership
23, # M.A.S.K. HQ (Staff Board)
22, # Admins Only Board
89, # Test Board
86, # Omega Archives
51, 37, 79, 26, 47, 44, 99, 93, 119, 96,
28, # The Dumpster Out Back
123 # ?????
24, 94, 118, 121 # Links
40, # Exclusive Board
65, # Requests for Moderatorship
66, # Requests for Membership+
67, # Requests for Distinguished Membership
23, # M.A.S.K. HQ (Staff Board)
22, # Admins Only Board
89, # Test Board
86, # Omega Archives
51, 37, 79, 26, 47, 44, 45, 99, 93, 119, 96,
62, # Submit-A-Glitch Archives
3, 4, 5, 57, 58, 59, 38, 54, 63, 64,
68, 69, 70, 81, 82, 83,
28, # The Dumpster Out Back
123 # ?????
]
# Regexes for sensitive information

View File

@ -11,7 +11,11 @@ ul.boards { margin-left: 0; padding-left: 0; }
.label { font-weight: bold }
article { border-top: 1px solid black; }
section { margin-top: 15px; margin-bottom: 15px; }
.next { float: right; }
.pagination { margin-bottom: 10px; }
.pagination ul { list-style-type: none; margin-left: 0; padding-left: 0; display: inline; }
.pagination li { display: inline; }
.page { padding-top: 15px; }
.page table { width: 100%; }

View File

@ -1,4 +1,9 @@
<div class="pagination">
<a class="prev" href="page-{{prev}}.html">Previous Page</a>
<ul>
{{#page_links}}
<li><a href="{{link}}">{{label}}</a></li>
{{/page_links}}
</ul>
<a class="next" href="page-{{next}}.html">Next Page</a>
</div>

View File

@ -3,14 +3,16 @@
<table id="threads">
<tr>
<th>Title</th>
<th>Poster</th>
<th>Poster</th>
<th>Date</th>
<th>Replies</th>
</tr>
{{#threads}}
<tr>
<td class="thread-subject"><a href="thread-{{id}}">{{subject}}</a></td>
<td class="thread-poster">{{poster_name}}</td>
<td class="thread-date">{{datetime}}</td>
<td class="replies">{{num_replies}}</td>
</tr>
{{/threads}}
</table>