Compare commits

...

20 Commits

SHA1 Message Date
eff22ff325 Bolden names of successor websites. 2020-09-20 05:48:15 -05:00
6ab90d5fff Add link to GCW on the archive index page. 2020-09-20 05:47:35 -05:00
d98354046b Prefix image paths with images subdirectory (which does not yet exist) 2020-09-20 05:45:49 -05:00
f1ae73b737 Correctly parse out fragment from html archive link and append it in the correct place. 2020-09-20 05:40:57 -05:00
d062ca6787 Add anchors to headings. 2020-09-20 05:31:55 -05:00
a9adf51453 Add GCW link to archived wiki pages, for the live version of said archived article. 2020-09-20 05:28:37 -05:00
7157757d43 redirector: handle case where thread id can have an .html extension, because of course it can... 2020-09-03 04:12:54 -05:00
5859ee0408 Add notice to the top of archived pages that this is an archive. 2020-09-03 03:34:22 -05:00
b33ea016d5 convert all f-strings to old-style format strings for python 3.5 compatibility (since that is what's available on the server) 2020-09-01 02:46:37 -05:00
ab29250b74 add "added submissions" boards from Submit-A-Glitch to exclusion filter 2020-09-01 02:25:32 -05:00
167a03be3c fix syntax error 2020-09-01 01:31:39 -05:00
f65361e06b Implement forums redirector and default archive urls (since it's unlikely these will be changed). Also add redirectors for the index/main pages since those don't get handled by default. 2020-08-30 16:51:55 -05:00
c37cf4fc44 Implement thread index for mapping thread ids back to board ids, for use with the redirector.
The archive domain (archives.glitchcity.info) will host this file, and the redirector will pull and unpack it when it starts up (the file format is sketched below, after the commit list).
2020-08-30 16:50:21 -05:00
ef3f3dd60c Add pagination to forums archives. 2020-08-28 14:19:09 -05:00
0e3f1274cc Exclude links and submit-a-glitch archives from the archive. 2020-08-28 02:29:30 -05:00
1b7e3ce08b Since the wiki linker code is modularized, the redirector can just import it 2020-08-27 02:24:24 -05:00
646b840be4 Use '+' as the substitution for '/' rather than '%2F', since nginx seems to want to normalize the %2F back into /, which would defeat the purpose. 2020-08-27 02:11:56 -05:00
a382e6d4fd Extract more archive generator specific functionality from linker (e.g. the .html suffix, / -> %2F replacement) 2020-08-27 02:02:43 -05:00
ade44491d4 Extract archive generator specific linker functionality into ArchiveLinker subclass. 2020-08-27 01:52:17 -05:00
2e73ecd59f Begin work on redirector webapp. The logic for redirecting wiki pages -seems- consistent with how the archives are generated, but nginx is normalizing the %2Fs into slashes and is therefore unable to access any files with that escape sequence (e.g. Gold%2FSilver). Might need to find another character to escape / with.
'+' might work; it's semantically acceptable (e.g. "Gold+Silver" in place of "Gold/Silver"), although this character is sometimes interpreted as equivalent to a space. Regardless, nginx seems happy with it, so we might go with it.

May also need to test on a web host e.g. my old tripod account to see if assumptions hold up there too.
2020-08-24 00:24:45 -05:00
16 changed files with 244 additions and 44 deletions
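
The thread index introduced in commit c37cf4fc44 is a gzipped JSON file mapping each thread id to its parent board id; the archive domain hosts it and the redirector downloads and unpacks it at startup. A minimal sketch of the format, mirroring generate_thread_index and read_thread_index in the diffs below (thread 9114 / board 2 come from the closure-announcement link; the other entry is made up):

import gzip
import json

# Thread id -> parent board id. Thread 9114 lives in board 2 (the closure
# announcement thread); the second entry is a made-up example.
thread_index = {"9114": {"parent": 2}, "1234": {"parent": 3}}

# Write it the way ArchiveGenerator.generate_thread_index does: JSON, then gzip.
with gzip.open("thread_index.json.gz", "w") as out:
    out.write(json.dumps(thread_index).encode())

# Read it back the way the redirector's read_thread_index does (the redirector
# fetches the file over HTTP from the archive domain instead of opening it locally).
with open("thread_index.json.gz", "rb") as gzipped_in:
    index = json.loads(gzip.decompress(gzipped_in.read()).decode())

print(index["9114"]["parent"])  # -> 2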

View File

@@ -1,6 +1,9 @@
import os
import logging
import shutil
import math
import json
import gzip
from itertools import chain
from traceback import print_exc
@@ -9,7 +12,8 @@ import chevron
import bbcode
import html
from .wiki import Template, Renderer, Linker, reformat_page_title, NAMESPACES as WIKI_NAMESPACES
from .forum import DEFAULT_POSTS_PER_PAGE
from .wiki import Template, Renderer, Linker, NAMESPACES as WIKI_NAMESPACES
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator")
@@ -20,7 +24,47 @@ DEX_TYPES = [
"MDIGlitchDex", "MetascriptDex", "TMHMDex", "StatDex", "PosterDex", "TypeDex", "UnownDex", "DollDex", "DefaultNameDex",
"BattleTypeDe", "BadgeDescriptionDex", "FacingDex"
]
DEXES = list(chain.from_iterable([[f"{dex_type}{language}" for dex_type in DEX_TYPES] for language in DEX_LANGUAGES]))
DEXES = list(chain.from_iterable([["{}{}".format(dex_type, language) for dex_type in DEX_TYPES] for language in DEX_LANGUAGES]))
FORUM_THREAD_INDEX = "thread_index.json.gz"
IMAGE_DIRECTORY = "images"
class ArchiveLinker(Linker):
def __init__ (self, directory_names=[]):
super().__init__()
self.directory_names = directory_names
self.image_directory = IMAGE_DIRECTORY
self.replacements = {
"/": "+",
#":": ""
}
def translate_page_title (self, page_title):
page_title = super().translate_page_title(page_title)
fragment = ""
if "#" in page_title:
fragment = page_title[page_title.find("#"):]
page_title = page_title[:-len(fragment)]
directory_name = ""
for name in self.directory_names:
if page_title.startswith("{}/".format(name)):
directory_name = name
page_title = page_title[len(directory_name) + 1:]
break
for key, value in self.replacements.items():
page_title = page_title.replace(key, value)
return "{}{}{}.html{}".format(directory_name, '/' if directory_name else '', page_title, fragment)
def translate_image_title (self, page_title):
image_title = super().translate_image_title(page_title)
if not image_title:
return
return "{}/{}".format(self.image_directory, image_title)
def prepare_thread (thread):
thread.subject = html.unescape(thread.subject)
@@ -61,7 +105,7 @@ class ArchiveGenerator():
categories = {}
templates = dict([(page.title.split(":")[1], Template(page.get_latest().text)) for page in wiki.get_pages() if page.namespace == WIKI_NAMESPACES['TEMPLATE']])
linker = Linker(directory_names=DEXES)
linker = ArchiveLinker(directory_names=DEXES)
wikitext_renderer = Renderer(templates, linker)
for page in wiki.get_pages():
try:
@@ -79,7 +123,7 @@ class ArchiveGenerator():
if page.redirect:
logger.info("Archiving redirect page (%s -> %s) to %s", page.title, page.redirect, page_out)
renderer.render_template_to_file("redirect", page_out, {
"target": f"{base}{linker.translate_page_title(page.redirect)}"
"target": "{}{}".format(base, linker.translate_page_title(page.redirect))
})
else:
logger.info("Archiving page %s to %s", page.title, page_out)
@@ -96,6 +140,7 @@ class ArchiveGenerator():
renderer.render_template_to_file("page", page_out, {
"title": " - {}".format(page.title),
"pagename": page.title,
"page": page,
"base": base,
"text": rendered
@@ -107,12 +152,13 @@ class ArchiveGenerator():
raise e
for category, pages in categories.items():
category_out = f"Category:{reformat_page_title(category)}.html"
category_out = "Category:{}".format(linker.translate_page_title(category))
logger.info("Archiving category %s to %s", category, category_out)
try:
renderer.render_template_to_file("category", category_out, {
"title": f" - {category}",
"title": " - {}".format(category),
"pagename": "Category:{}".format(category),
"category": category,
"pages": pages
})
@@ -132,10 +178,15 @@ class ArchiveGenerator():
"categories": forum.get_board_tree()
})
threads = []
for board in forum.get_boards():
self.generate_forum_board(forum, board, out_dir)
forum_threads = forum.get_threads_in_board(board)
threads = threads + forum_threads
self.generate_forum_board(forum, board, forum_threads, out_dir)
def generate_forum_board (self, forum, board, out_dir):
self.generate_thread_index(threads, os.path.join(out_dir, FORUM_THREAD_INDEX))
def generate_forum_board (self, forum, board, threads, out_dir):
board_out_dir = os.path.join(out_dir, "board-{}".format(board.id))
logger.info("Archiving board %s to %s", board.name, board_out_dir)
try:
@@ -143,7 +194,7 @@ class ArchiveGenerator():
except FileExistsError: pass
renderer = TemplateRenderer(self.template_dir, board_out_dir)
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
threads = [prepare_thread(thread) for thread in threads]
renderer.render_template_to_file("threads", "index.html", {
"title": " - {}".format(board.name),
"base": "../",
@@ -166,6 +217,8 @@ class ArchiveGenerator():
"target": "page-0.html"
})
total_pages = math.ceil((thread.num_replies + 1) / DEFAULT_POSTS_PER_PAGE)
page_links = [{"label": page + 1, "link": "page-{}.html".format(page)} for page in range(total_pages)]
page = 0
while True:
posts = [prepare_post(post) for post in forum.get_posts_in_thread(thread, page)]
@@ -180,11 +233,19 @@ class ArchiveGenerator():
"thread": thread,
"page": page,
"next": page + 1,
"page_links": page_links,
"prev": page - 1,
"posts": posts
})
page = page + 1
def generate_thread_index (self,threads, out_path):
# with open(out_path, "wb") as out:
# pickle.dump({thread.id: {"parent": thread.parent} for thread in threads}, out, protocol=4)
threads = {thread.id: {"parent": thread.parent} for thread in threads}
with gzip.open(out_path, "w") as out:
out.write(json.dumps(threads).encode())
class TemplateRenderer():
def __init__ (self, template_dir, out_dir):
self.template_dir = template_dir
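
A rough sketch of what the new ArchiveLinker does with page titles, assuming the epilogue package from this repository (and its dependencies) is importable. The titles are illustrative (the Gold/Silver case comes from commit 2e73ecd59f), and the expected outputs are best-effort readings of the code above rather than captured output:

from epilogue.archive_generator import ArchiveLinker, DEXES

linker = ArchiveLinker(directory_names=DEXES)

# '/' in a title becomes '+' and '.html' is appended, giving a file name that
# nginx can serve without any percent-escapes.
print(linker.translate_page_title("Gold/Silver"))
# e.g. Gold+Silver.html

# A '#fragment' is split off first and re-attached after the '.html' suffix.
print(linker.translate_page_title("Gold/Silver#Trivia"))
# e.g. Gold+Silver.html#Trivia

# Image titles get routed into the images/ subdirectory.
print(linker.translate_image_title("File:Example.png"))
# e.g. images/Example.png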

View File

@@ -23,6 +23,9 @@ GET_POSTS = """
LIMIT ? OFFSET ?
""".format(PREFIX)
DEFAULT_POSTS_PER_PAGE = 15
DEFAULT_THREADS_PER_PAGE = 2000
def fix_encoding (string):
return string.encode("latin1", errors="ignore").decode(errors="ignore")
@@ -50,7 +53,7 @@ class Forum():
cursor.execute(GET_BOARDS)
return [Board(board) for board in cursor.fetchall()]
def get_threads_in_board (self, board, page=0, per_page=2000):
def get_threads_in_board (self, board, page=0, per_page=DEFAULT_THREADS_PER_PAGE):
try:
board = board.id
except ValueError: pass
@@ -58,7 +61,7 @@ class Forum():
cursor.execute(GET_THREADS, (board, per_page, page * per_page))
return [Thread(thread) for thread in cursor.fetchall()]
def get_posts_in_thread (self, thread, page=0, per_page=15):
def get_posts_in_thread (self, thread, page=0, per_page=DEFAULT_POSTS_PER_PAGE):
try:
thread = thread.id
except ValueError: pass
@@ -89,6 +92,7 @@ class Thread():
self.datetime = datetime.fromtimestamp(row['poster_time'])
self.subject = fix_encoding(row['subject'])
self.poster_name = fix_encoding(row['poster_name'])
self.num_replies = row['num_replies']
class Post():
def __init__ (self, row):
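
The new num_replies field, together with DEFAULT_POSTS_PER_PAGE, is what the archive generator above uses to work out how many page-N.html files a thread needs. A worked example with a made-up reply count (assuming, as the +1 suggests, that the opening post is not counted in num_replies):

import math

DEFAULT_POSTS_PER_PAGE = 15  # value defined in the forum module above

num_replies = 31  # hypothetical thread: 31 replies plus the opening post = 32 posts
total_pages = math.ceil((num_replies + 1) / DEFAULT_POSTS_PER_PAGE)
print(total_pages)  # -> 3, i.e. page-0.html, page-1.html and page-2.html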

epilogue/redirector.py (new file, 98 lines)
View File

@@ -0,0 +1,98 @@
import argparse
import gzip
import urllib.request
import json
from .archive_generator import ArchiveLinker, DEXES, FORUM_THREAD_INDEX
from flask import Flask, redirect, request
app = Flask(__name__)
DEFAULT_ARCHIVES_DOMAIN = "https://archives.glitchcity.info/"
DEFAULT_FORUMS_ARCHIVE = "{}forums".format(DEFAULT_ARCHIVES_DOMAIN)
DEFAULT_WIKI_ARCHIVE = "{}wiki".format(DEFAULT_ARCHIVES_DOMAIN)
## Wiki redirector
@app.route("/wiki/")
def redirect_wiki_main ():
return redirect_wiki("Main Page")
@app.route("/wiki/<path:path>")
def redirect_wiki (path):
return redirect(make_wiki_url(path))
def make_wiki_url (path):
if path.endswith("/"):
path = path[:-1]
return app.args.wiki_archive + app.wiki_linker.translate_page_title(path)
## Forum redirector
@app.route('/forums/')
def redirect_forums_index ():
return redirect_forums("")
@app.route('/forums/<path:path>')
def redirect_forums (path):
return redirect(make_forum_url(request))
def make_forum_url (request):
thread_id = request.args.get("topic", None)
board_id = request.args.get("board", None)
post_id = None
if thread_id:
thread_id = strip_extension(thread_id)
if "." in thread_id:
(thread_id, post_id) = thread_id.split(".")
post_id = post_id[len("msg"):]
if not board_id:
board_id = app.thread_index[thread_id]['parent']
try:
if "." in board_id:
board_id = board_id.split(".")[0]
except TypeError: pass
url = app.args.forums_archive
if board_id:
url = url + "board-{}".format(board_id)
if thread_id:
url = url + "/thread-{}".format(thread_id)
if not url.endswith("/"):
url = url + "/"
return url
def strip_extension (item):
for extension in [".html"]:
if item.endswith(extension):
item = item[:-len(extension)]
return item
def read_thread_index (forums_archive):
with urllib.request.urlopen("{}{}".format(forums_archive, FORUM_THREAD_INDEX)) as gzipped_in:
data = gzipped_in.read()
return json.loads(gzip.decompress(data).decode())
def main ():
parser = argparse.ArgumentParser()
parser.add_argument("--wiki-archive", help="URL to wiki archive", default=DEFAULT_WIKI_ARCHIVE)
parser.add_argument("--forums-archive", help="URL to forums archive", default=DEFAULT_FORUMS_ARCHIVE)
args = parser.parse_args()
if not args.wiki_archive.endswith("/"):
args.wiki_archive = args.wiki_archive + "/"
if not args.forums_archive.endswith("/"):
args.forums_archive = args.forums_archive + "/"
app.args = args
app.thread_index = read_thread_index(args.forums_archive)
app.wiki_linker = ArchiveLinker(directory_names=DEXES)
app.run()
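
A sketch of how the redirector routes behave once the state that main() normally sets up (the parsed archive URLs, the downloaded thread index and the wiki linker) is filled in by hand. It assumes the epilogue package and Flask are installed; thread 9114 / board 2 are taken from the archive links above, everything else is illustrative:

from argparse import Namespace

from epilogue.archive_generator import ArchiveLinker, DEXES
from epilogue.redirector import app

# Stand-ins for what main() does with command-line arguments and the
# downloaded thread index.
app.args = Namespace(
    wiki_archive="https://archives.glitchcity.info/wiki/",
    forums_archive="https://archives.glitchcity.info/forums/",
)
app.thread_index = {"9114": {"parent": 2}}  # normally fetched by read_thread_index()
app.wiki_linker = ArchiveLinker(directory_names=DEXES)

client = app.test_client()

# The wiki index redirects to the archived Main Page.
print(client.get("/wiki/").headers["Location"])
# e.g. https://archives.glitchcity.info/wiki/Main_Page.html

# An old SMF URL with only a topic id: the board is looked up in the thread index.
print(client.get("/forums/index.php?topic=9114.0").headers["Location"])
# e.g. https://archives.glitchcity.info/forums/board-2/thread-9114/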

View File

@@ -32,7 +32,7 @@ INTERWIKI_NAMESPACES = {
FILE_NAMESPACES = ["File:", "Image:"]
CATEGORY_NAMESPACE = "Category:"
CATEGORY_LINK_NAMESPACE = f":{CATEGORY_NAMESPACE}"
CATEGORY_LINK_NAMESPACE = ":{}".format(CATEGORY_NAMESPACE)
class Wiki():
def __init__ (self, xml_path):
@@ -106,9 +106,9 @@ class Renderer():
if categories:
rendered.append('<h2>Categories</h2><ul class="categories">')
for category in categories:
rendered.append('<li><a href="{}Category:{}.html">{}</a></li>'.format(
rendered.append('<li><a href="{}Category:{}">{}</a></li>'.format(
base,
reformat_page_title(category),
self.linker.translate_page_title(category),
category
))
rendered.append("</ul>")
@@ -152,7 +152,7 @@ def render (wikitext, base="", linker=None):
else:
url = linker.translate_interwiki_title(node.title)
if not url:
url = f"{base}{linker.translate_page_title(node.title)}"
url = "{}{}".format(base, linker.translate_page_title(node.title))
rendered.append('<a href="{}">{}</a>'.format(
url,
@@ -170,8 +170,9 @@ def render (wikitext, base="", linker=None):
render(node.tag)
))
elif node_type is Heading:
rendered.append("<h{}>{}</h{}>".format(
rendered.append('<h{} id="{}">{}</h{}>'.format(
node.level,
reformat_page_title(node.title),
render(node.title, base, linker),
node.level
))
@@ -181,10 +182,9 @@ def render (wikitext, base="", linker=None):
return "".join(rendered).strip().replace("\n\n", "<br /><br />")
class Linker():
def __init__ (self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES, directory_names=[]):
def __init__ (self, file_namespaces=FILE_NAMESPACES, interwiki_namespaces=INTERWIKI_NAMESPACES):
self.file_namespaces = file_namespaces
self.interwiki_namespaces = interwiki_namespaces
self.directory_names = directory_names
def translate_interwiki_title (self, page_title):
for namespace, url in self.interwiki_namespaces.items():
@@ -195,14 +195,7 @@ class Linker():
if page_title.startswith(CATEGORY_LINK_NAMESPACE):
page_title = page_title[1:]
directory_name = ""
for name in self.directory_names:
if page_title.startswith(f"{name}/"):
directory_name = name
page_title = page_title[len(directory_name) + 1:]
break
return f"{reformat_page_title(directory_name)}{'/' if directory_name else ''}{reformat_page_title(page_title)}.html"
return reformat_page_title(page_title)
def translate_image_title (self, page_title):
for namespace in self.file_namespaces:
@@ -213,7 +206,7 @@ def reformat_page_title (page_title):
if not page_title:
return ""
return f"{page_title[0].upper()}{page_title[1:].replace(' ', '_').replace('/', '%2F')}"
return "{}{}".format(page_title[0].upper(), page_title[1:].replace(' ', '_'))
class Template():
def __init__ (self, wikicode):
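
The heading ids added by the renderer and the fragment handling in ArchiveLinker are meant to line up, so "Page#Section" links keep working as plain .html anchors in the static archive. A small sketch, assuming the package is importable; the titles are arbitrary examples and the outputs are expected values, not captured ones:

from epilogue.wiki import reformat_page_title
from epilogue.archive_generator import ArchiveLinker, DEXES

# Heading ids: first letter uppercased, spaces replaced with underscores.
print(reformat_page_title("arbitrary code execution"))
# -> Arbitrary_code_execution

# A wiki link with a fragment resolves to <page>.html<fragment>, so the
# fragment targets the heading id generated above.
linker = ArchiveLinker(directory_names=DEXES)
print(linker.translate_page_title("Glitch City#History"))
# e.g. Glitch_City.html#History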

View File

@@ -68,6 +68,7 @@ TOPICS_DUMP = "threads.sql"
# Categories we are not interested in archiving.
# `id_cat` in (1, 2)
DO_NOT_ARCHIVE_CATEGORIES = [
7, # Links
12, # Epsilon: ?????
6, # Sigma: Higher Access
8 # Omega: Garbage
@@ -76,6 +77,7 @@ DO_NOT_ARCHIVE_CATEGORIES = [
# Boards we are not interested in archiving.
# `id_board` in (1, 2)
DO_NOT_ARCHIVE_BOARDS = [
24, 94, 118, 121, # Links
40, # Exclusive Board
65, # Requests for Moderatorship
66, # Requests for Membership+
@@ -84,7 +86,10 @@ DO_NOT_ARCHIVE_BOARDS = [
22, # Admins Only Board
89, # Test Board
86, # Omega Archives
51, 37, 79, 26, 47, 44, 99, 93, 119, 96,
51, 37, 79, 26, 47, 44, 45, 99, 93, 119, 96,
62, 60, 80, 84, # Submit-A-Glitch Archives
3, 4, 5, 57, 58, 59, 38, 54, 63, 64,
68, 69, 70, 81, 82, 83,
28, # The Dumpster Out Back
123 # ?????
]

View File

@@ -8,10 +8,11 @@ setup(
description='Tools for exporting and creating archives of Glitch City Labs data',
author='Adrian Kuschelyagi Malacoda',
packages=['epilogue'],
install_requires=['pysqlite3 >= 0.4.3', 'chevron >= 0.13.1', 'bbcode >= 1.1.0', 'mwparserfromhell >= 0.5.4'],
install_requires=['pysqlite3 >= 0.4.3', 'chevron >= 0.13.1', 'bbcode >= 1.1.0', 'mwparserfromhell >= 0.5.4', 'flask >= 1.1.2'],
entry_points={
'console_scripts': [
'epilogue = epilogue:main'
'epilogue = epilogue:main',
'gclredirector = epilogue.redirector:main'
]
}
)

View File

@@ -11,7 +11,11 @@ ul.boards { margin-left: 0; padding-left: 0; }
.label { font-weight: bold }
article { border-top: 1px solid black; }
section { margin-top: 15px; margin-bottom: 15px; }
.next { float: right; }
.pagination { margin-bottom: 10px; }
.pagination ul { list-style-type: none; margin-left: 0; padding-left: 0; display: inline; }
.pagination li { display: inline; }
.page { padding-top: 15px; }
.page table { width: 100%; }

View File

@@ -1,4 +1,5 @@
{{>header}}
{{>forums_notice}}
{{#categories}}
<h2 class="category-name">{{name}}</h2>
{{>child_boards}}

View File

@@ -0,0 +1,9 @@
{{>header}}
{{>wiki_notice}}
<h2>{{pagename}}</h2>
<ul>
{{#pages}}
<li><a href="{{url}}">{{title}}</a></li>
{{/pages}}
</ul>
{{>footer}}

View File

@@ -1,5 +1,8 @@
{{>header}}
Welcome to the <b>Glitch City Laboratories Archives</b>.
<p>Glitch City Laboratories was a Pok&eacute;mon glitch website that existed from March 2006 to September 2020 (<a href="forums/board-2/thread-9114/page-0.html">announcement of closure</a>). This is an <b>archive</b> of content from the website prior to its closure.</p>
<p>Further development and discussion is happening at <b><a href="https://discord.com/invite/EA7jxJ6">Glitch City Research Institute</a></b>, the successor community.</p>
<p>The <b><a href="https://glitchcity.wiki/">Glitch City Wiki</a></b> is the continuation of the Glitch City Laboratories wiki.</p>
<h2>Archives</h2>
<ul>
<li><a href="forums">Forums</a> (<a href="forums.tar.gz">.tar.gz</a>) (<a href="forums.sql.gz">.sql.gz</a>) (<a href="forums.sqlite.gz">.sqlite.gz</a>)</li>

View File

@@ -1,4 +1,5 @@
{{>header}}
{{>wiki_notice}}
<h2>{{page.title}}</h2>
<article class="page">
{{{text}}}

View File

@@ -0,0 +1,5 @@
<div class="notice">
<p>Glitch City Laboratories closed on 1 September 2020 (<a href="{{base}}board-2/thread-9114/page-0.html">announcement</a>). This is an <b>archived</b> copy of a thread from Glitch City Laboratories Forums.</p>
<p>You can join <a href="https://discord.com/invite/EA7jxJ6">Glitch City Research Institute</a> to ask questions or discuss current developments.</p>
<p>You may also download the archive of this forum in <a href="{{base}}../forums.tar.gz">.tar.gz</a>, <a href="{{base}}../forums.sql.gz">.sql.gz</a>, or <a href="{{base}}../forums.sqlite.gz">.sqlite.gz</a> formats.</p>
</div>

View File

@@ -1,4 +1,9 @@
<div class="pagination">
<a class="prev" href="page-{{prev}}.html">Previous Page</a>
<ul>
{{#page_links}}
<li><a href="{{link}}">{{label}}</a></li>
{{/page_links}}
</ul>
<a class="next" href="page-{{next}}.html">Next Page</a>
</div>
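
A minimal chevron sketch of what the {{#page_links}} section above expands to, using an inlined copy of the partial and a hand-built three-page context; the exact whitespace of the output may differ slightly:

import chevron

# Inlined copy of the list section from the pagination partial above.
template = """<ul>
{{#page_links}}
<li><a href="{{link}}">{{label}}</a></li>
{{/page_links}}
</ul>"""

context = {
    "page_links": [
        {"label": page + 1, "link": "page-{}.html".format(page)}
        for page in range(3)
    ]
}

print(chevron.render(template, context))
# <ul>
# <li><a href="page-0.html">1</a></li>
# <li><a href="page-1.html">2</a></li>
# <li><a href="page-2.html">3</a></li>
# </ul>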

View File

@@ -0,0 +1,6 @@
<div class="notice">
<p>Glitch City Laboratories closed on 1 September 2020 (<a href="{{base}}../forums/board-2/thread-9114/page-0.html">announcement</a>). This is an <b>archived</b> copy of an article from Glitch City Laboratories wiki.</p>
<p><b>A live version of this article is available at the <a href="https://glitchcity.wiki/">Glitch City Wiki</a> <a href="https://glitchcity.wiki/{{pagename}}">here</a>.</b></p>
<p>You can join <a href="https://discord.com/invite/EA7jxJ6">Glitch City Research Institute</a> to ask questions or discuss current developments.</p>
<p>You may also download the archive of the wiki in <a href="{{base}}../wiki.tar.gz">.tar.gz</a> or <a href="{{base}}../wiki.xml.gz">.xml.gz</a> formats.</p>
</div>

View File

@@ -1,4 +1,5 @@
{{>header}}
{{>forums_notice}}
<h2><a href="../">{{board.name}}</a></h2>
<h3>{{thread.subject}} - Page {{next}}</h3>
{{>pagination}}

View File

@@ -1,16 +1,19 @@
{{>header}}
{{>forums_notice}}
<h2>{{board.name}}</h2>
<table id="threads">
<tr>
<th>Title</th>
<th>Poster</th>
<th>Date</th>
<th>Replies</th>
</tr>
{{#threads}}
<tr>
<td class="thread-subject"><a href="thread-{{id}}">{{subject}}</a></td>
<td class="thread-poster">{{poster_name}}</td>
<td class="thread-date">{{datetime}}</td>
<td class="replies">{{num_replies}}</td>
</tr>
{{/threads}}
</table>