Fix broken UTF-8 encoding and unescape HTML entities.

This commit is contained in:
Adrian Kuschelyagi Malacoda 2020-08-16 18:52:05 -05:00
parent 4045473e65
commit 23f4789599
2 changed files with 57 additions and 25 deletions

View File

@ -1,10 +1,10 @@
import os
import logging
import shutil
from datetime import datetime
import chevron
import bbcode
import html
from .wiki import NAMESPACES as WIKI_NAMESPACES
import mwparserfromhell
@ -12,19 +12,14 @@ import mwparserfromhell
# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
# Logger used by this module, registered under the fixed name "ArchiveGenerator".
logger = logging.getLogger("ArchiveGenerator")
def format_datetime(timestamp):
    """Convert a POSIX timestamp into a ``datetime`` object.

    The conversion is delegated to ``datetime.fromtimestamp`` with no
    timezone argument, so the result is a naive value in local time.
    """
    converted = datetime.fromtimestamp(timestamp)
    return converted
def prepare_thread(thread):
    """Prepare a thread object for template rendering.

    HTML entities in the subject (for example ``&amp;`` or ``&quot;``) are
    decoded in place with ``html.unescape``.

    The object is expected to expose a ``subject`` attribute, as the
    ``Thread`` and ``Post`` classes returned by ``Forum`` do.  Those classes
    already populate ``datetime`` in their constructors, so no timestamp
    conversion happens here; the earlier ``dict(thread)`` copy is gone
    because a dict does not support the attribute access used below.

    Args:
        thread: Object with a string ``subject`` attribute.

    Returns:
        The same object that was passed in, after modification.
    """
    thread.subject = html.unescape(thread.subject)
    return thread
def prepare_post(post):
    """Prepare a post object for template rendering.

    The subject is handled by ``prepare_thread``.  The body is then run
    through ``bbcode.Parser().format`` and the result is passed through
    ``html.unescape``.  That unescape step already turns ``&lt;br /&gt;``
    back into ``<br />``, so no separate replacement is needed.

    Args:
        post: Object with string ``subject`` and ``body`` attributes, such
            as the ``Post`` objects returned by ``Forum``.

    Returns:
        The same object that was passed in, after modification.
    """
    post = prepare_thread(post)
    parser = bbcode.Parser()
    # NOTE(review): unescaping the full rendered output also reverses any
    # escaping the parser applied to user-supplied text -- confirm that is
    # acceptable for the generated archive.
    post.body = html.unescape(parser.format(post.body))
    return post
class ArchiveGenerator():
@ -80,8 +75,8 @@ class ArchiveGenerator():
self.generate_forum_board(forum, board, out_dir)
def generate_forum_board (self, forum, board, out_dir):
board_out_dir = os.path.join(out_dir, "board-{}".format(board['id_board']))
logger.info("Archiving board %s to %s", board['name'], board_out_dir)
board_out_dir = os.path.join(out_dir, "board-{}".format(board.id))
logger.info("Archiving board %s to %s", board.name, board_out_dir)
try:
os.makedirs(board_out_dir)
except FileExistsError: pass
@ -89,7 +84,7 @@ class ArchiveGenerator():
renderer = TemplateRenderer(self.template_dir, board_out_dir)
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
renderer.render_template_to_file("threads", "index.html", {
"title": " - {}".format(board['name']),
"title": " - {}".format(board.name),
"base": "../",
"board": board,
"threads": threads
@ -99,8 +94,8 @@ class ArchiveGenerator():
self.generate_forum_thread(forum, board, thread, board_out_dir)
def generate_forum_thread (self, forum, board, thread, out_dir):
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread['id_topic']))
logger.info("Archiving thread %s to %s", thread['subject'], thread_out_dir)
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread.id))
logger.info("Archiving thread %s to %s", thread.subject, thread_out_dir)
try:
os.makedirs(thread_out_dir)
except FileExistsError: pass
@ -114,9 +109,9 @@ class ArchiveGenerator():
if len(posts) < 1:
break
logger.info("Archiving page %s of thread %s", page, thread['subject'])
logger.info("Archiving page %s of thread %s", page, thread.subject)
renderer.render_template_to_file("posts", "page-{}.html".format(page), {
"title": " - {} - Page {}".format(thread['subject'], page + 1),
"title": " - {} - Page {}".format(thread.subject, page + 1),
"base": "../../",
"board": board,
"thread": thread,

View File

@ -1,4 +1,5 @@
import sqlite3
from datetime import datetime
PREFIX = "smf_"
GET_BOARDS = """
@ -22,42 +23,78 @@ GET_POSTS = """
LIMIT ? OFFSET ?
""".format(PREFIX)
def fix_encoding(string):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1.

    The text is re-encoded as Latin-1 to recover the underlying bytes, which
    are then decoded as UTF-8.  Both steps use ``errors="ignore"``, so the
    repair is best-effort and lossy: characters that cannot be encoded as
    Latin-1 and byte sequences that are not valid UTF-8 are silently dropped.

    Args:
        string: Text to repair, or ``None`` (e.g. a SQL NULL column).

    Returns:
        The repaired text.  ``None`` is returned unchanged instead of
        raising ``AttributeError``.
    """
    if string is None:
        return None
    raw_bytes = string.encode("latin1", errors="ignore")
    # UTF-8 is already the default codec for bytes.decode; spelled out here
    # so the intent of the round trip is visible.
    return raw_bytes.decode("utf-8", errors="ignore")
class Forum():
    """Read-only access to a forum stored in a SQLite database.

    Query results are wrapped in the ``Category``, ``Board``, ``Thread`` and
    ``Post`` classes defined in this module, so callers work with attributes
    rather than raw rows.
    """

    def __init__(self, db_path):
        """Open the database at ``db_path``.

        ``sqlite3.Row`` is installed as the row factory so that columns can
        be read by name.
        """
        self.connection = sqlite3.connect(db_path)
        self.connection.row_factory = sqlite3.Row

    def get_board_tree(self):
        """Return all categories with their board hierarchy attached.

        Each category's ``children`` is set to the boards in that category
        whose ``child_level`` is 0.  Each board's ``children`` is set to the
        boards whose ``parent_board`` equals its ``id``.

        Returns:
            The list of categories produced by ``get_categories``.
        """
        categories = self.get_categories()
        boards = self.get_boards()
        for category in categories:
            category.children = [
                child for child in boards
                if child.category == category.id and child.child_level == 0
            ]
        for board in boards:
            board.children = [
                child for child in boards if child.parent_board == board.id
            ]
        return categories

    def get_categories(self):
        """Run ``GET_CATEGORIES`` and return one ``Category`` per row."""
        cursor = self.connection.cursor()
        cursor.execute(GET_CATEGORIES)
        return [Category(category) for category in cursor.fetchall()]

    def get_boards(self):
        """Run ``GET_BOARDS`` and return one ``Board`` per row."""
        cursor = self.connection.cursor()
        cursor.execute(GET_BOARDS)
        return [Board(board) for board in cursor.fetchall()]

    def get_threads_in_board(self, board, page=0, per_page=2000):
        """Return one page of threads for a board.

        Args:
            board: Object with an ``id`` attribute, or a bare board id.
            page: Zero-based page index.
            per_page: Number of rows per page.

        Returns:
            A list of ``Thread`` objects built from the ``GET_THREADS`` rows.
        """
        # A bare id has no ``id`` attribute; reading it raises AttributeError,
        # which the previous ``except ValueError`` could never catch.
        board_id = getattr(board, "id", board)
        cursor = self.connection.cursor()
        cursor.execute(GET_THREADS, (board_id, per_page, page * per_page))
        return [Thread(thread) for thread in cursor.fetchall()]

    def get_posts_in_thread(self, thread, page=0, per_page=15):
        """Return one page of posts for a thread.

        Args:
            thread: Object with an ``id`` attribute, or a bare thread id.
            page: Zero-based page index.
            per_page: Number of rows per page.

        Returns:
            A list of ``Post`` objects built from the ``GET_POSTS`` rows.
        """
        # Same id resolution as in get_threads_in_board.
        thread_id = getattr(thread, "id", thread)
        cursor = self.connection.cursor()
        cursor.execute(GET_POSTS, (thread_id, per_page, page * per_page))
        return [Post(post) for post in cursor.fetchall()]
class Category():
    """A forum category built from one database row.

    Reads the ``id_cat`` and ``name`` columns; the name is passed through
    ``fix_encoding``.  ``children`` starts as an empty list.
    """

    def __init__(self, row):
        cat_id = row['id_cat']
        cat_name = fix_encoding(row['name'])
        self.id, self.name, self.children = cat_id, cat_name, []
class Board():
    """A forum board built from one database row.

    ``id``, ``category``, ``parent_board`` and ``child_level`` are copied
    from the ``id_board``, ``id_cat``, ``id_parent`` and ``child_level``
    columns.  ``name`` and ``description`` are passed through
    ``fix_encoding``.  ``children`` starts as an empty list.
    """

    def __init__(self, row):
        # Columns copied verbatim, read in declaration order.
        self.id, self.category, self.parent_board, self.child_level = (
            row[column]
            for column in ('id_board', 'id_cat', 'id_parent', 'child_level')
        )
        # Text columns go through the encoding repair one after another.
        self.name, self.description = (
            fix_encoding(row[column]) for column in ('name', 'description')
        )
        self.children = []
class Thread():
    """A forum thread built from one database row.

    ``id`` and ``parent`` come from the ``id_topic`` and ``id_board``
    columns, ``datetime`` is ``poster_time`` converted with
    ``datetime.fromtimestamp``, and ``subject`` / ``poster_name`` are passed
    through ``fix_encoding``.
    """

    def __init__(self, row):
        topic_id = row['id_topic']
        board_id = row['id_board']
        posted_at = datetime.fromtimestamp(row['poster_time'])
        self.id = topic_id
        self.parent = board_id
        self.datetime = posted_at
        # Text columns are read and repaired one after another.
        self.subject, self.poster_name = (
            fix_encoding(row[column]) for column in ('subject', 'poster_name')
        )
class Post():
    """A forum post built from one database row.

    ``id`` and ``parent`` come from the ``id_msg`` and ``id_topic`` columns,
    ``datetime`` is ``poster_time`` converted with
    ``datetime.fromtimestamp``, and ``subject``, ``body`` and
    ``poster_name`` are passed through ``fix_encoding``.
    """

    def __init__(self, row):
        message_id = row['id_msg']
        topic_id = row['id_topic']
        posted_at = datetime.fromtimestamp(row['poster_time'])
        self.id = message_id
        self.parent = topic_id
        self.datetime = posted_at
        # Text columns are read and repaired one after another.
        self.subject, self.body, self.poster_name = (
            fix_encoding(row[column])
            for column in ('subject', 'body', 'poster_name')
        )