Fix broken UTF-8 encoding and unescape HTML entities.

This commit is contained in:
Adrian Kuschelyagi Malacoda 2020-08-16 18:52:05 -05:00
parent 4045473e65
commit 23f4789599
2 changed files with 57 additions and 25 deletions

View File

@ -1,10 +1,10 @@
import os import os
import logging import logging
import shutil import shutil
from datetime import datetime
import chevron import chevron
import bbcode import bbcode
import html
from .wiki import NAMESPACES as WIKI_NAMESPACES from .wiki import NAMESPACES as WIKI_NAMESPACES
import mwparserfromhell import mwparserfromhell
@ -12,19 +12,14 @@ import mwparserfromhell
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator") logger = logging.getLogger("ArchiveGenerator")
def format_datetime(timestamp):
    """Convert a POSIX timestamp into a ``datetime`` object.

    The conversion is delegated to ``datetime.fromtimestamp`` without a
    ``tz`` argument, so the result is a naive datetime expressed in the
    local time zone of the machine running the archiver.
    """
    converted = datetime.fromtimestamp(timestamp)
    return converted
def prepare_thread(thread):
    """Make a thread-like object ready for template rendering.

    HTML character references in ``thread.subject`` (``&amp;``, ``&quot;``,
    ``&#039;`` ...) are decoded once with ``html.unescape``.

    The object is modified in place and the same object is returned, so
    callers can use this inside a list comprehension.
    """
    thread.subject = html.unescape(thread.subject)
    return thread
def prepare_post(post):
    """Make a post-like object ready for template rendering.

    The subject is handled by ``prepare_thread``.  The body is rendered from
    BBCode to HTML with a fresh ``bbcode.Parser`` and the rendered markup is
    then passed through ``html.unescape`` once.

    The object is modified in place and the same object is returned.
    """
    post = prepare_thread(post)
    parser = bbcode.Parser()
    # NOTE(review): unescaping the rendered output presumably undoes the
    # parser's escaping of entities and "<br />" already present in the
    # stored body -- confirm against the bbcode parser's escaping rules.
    post.body = html.unescape(parser.format(post.body))
    return post
class ArchiveGenerator(): class ArchiveGenerator():
@ -80,8 +75,8 @@ class ArchiveGenerator():
self.generate_forum_board(forum, board, out_dir) self.generate_forum_board(forum, board, out_dir)
def generate_forum_board (self, forum, board, out_dir): def generate_forum_board (self, forum, board, out_dir):
board_out_dir = os.path.join(out_dir, "board-{}".format(board['id_board'])) board_out_dir = os.path.join(out_dir, "board-{}".format(board.id))
logger.info("Archiving board %s to %s", board['name'], board_out_dir) logger.info("Archiving board %s to %s", board.name, board_out_dir)
try: try:
os.makedirs(board_out_dir) os.makedirs(board_out_dir)
except FileExistsError: pass except FileExistsError: pass
@ -89,7 +84,7 @@ class ArchiveGenerator():
renderer = TemplateRenderer(self.template_dir, board_out_dir) renderer = TemplateRenderer(self.template_dir, board_out_dir)
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)] threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
renderer.render_template_to_file("threads", "index.html", { renderer.render_template_to_file("threads", "index.html", {
"title": " - {}".format(board['name']), "title": " - {}".format(board.name),
"base": "../", "base": "../",
"board": board, "board": board,
"threads": threads "threads": threads
@ -99,8 +94,8 @@ class ArchiveGenerator():
self.generate_forum_thread(forum, board, thread, board_out_dir) self.generate_forum_thread(forum, board, thread, board_out_dir)
def generate_forum_thread (self, forum, board, thread, out_dir): def generate_forum_thread (self, forum, board, thread, out_dir):
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread['id_topic'])) thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread.id))
logger.info("Archiving thread %s to %s", thread['subject'], thread_out_dir) logger.info("Archiving thread %s to %s", thread.subject, thread_out_dir)
try: try:
os.makedirs(thread_out_dir) os.makedirs(thread_out_dir)
except FileExistsError: pass except FileExistsError: pass
@ -114,9 +109,9 @@ class ArchiveGenerator():
if len(posts) < 1: if len(posts) < 1:
break break
logger.info("Archiving page %s of thread %s", page, thread['subject']) logger.info("Archiving page %s of thread %s", page, thread.subject)
renderer.render_template_to_file("posts", "page-{}.html".format(page), { renderer.render_template_to_file("posts", "page-{}.html".format(page), {
"title": " - {} - Page {}".format(thread['subject'], page + 1), "title": " - {} - Page {}".format(thread.subject, page + 1),
"base": "../../", "base": "../../",
"board": board, "board": board,
"thread": thread, "thread": thread,

View File

@ -1,4 +1,5 @@
import sqlite3 import sqlite3
from datetime import datetime
PREFIX = "smf_" PREFIX = "smf_"
GET_BOARDS = """ GET_BOARDS = """
@ -22,42 +23,78 @@ GET_POSTS = """
LIMIT ? OFFSET ? LIMIT ? OFFSET ?
""".format(PREFIX) """.format(PREFIX)
def fix_encoding(string):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The string is re-encoded as Latin-1 to recover the original bytes and
    those bytes are decoded as UTF-8, e.g. ``"cafÃ©"`` becomes ``"café"``.

    If the round trip is not possible -- the text contains characters outside
    Latin-1, or the recovered bytes are not valid UTF-8 -- the text was not
    mojibake in the first place and is returned unchanged.  Strict error
    handling is used on purpose: with ``errors="ignore"`` such text would
    silently lose characters (``"café"`` would come back as ``"caf"``).

    Empty strings and ``None`` are returned as-is.
    """
    if not string:
        return string
    try:
        return string.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError.
        return string
class Forum():
    """Read access to forum data stored in an SQLite database.

    Rows returned by the module's ``GET_*`` queries are wrapped in the
    ``Category``, ``Board``, ``Thread`` and ``Post`` classes.
    """

    def __init__(self, db_path):
        """Open the SQLite database at ``db_path``.

        ``sqlite3.Row`` is installed as the row factory so columns can be
        read by name.
        """
        self.connection = sqlite3.connect(db_path)
        self.connection.row_factory = sqlite3.Row

    @staticmethod
    def _resolve_id(item):
        """Return ``item.id`` when ``item`` has an ``id`` attribute, else ``item``.

        Lets the lookup methods accept either a wrapper object or a raw id.
        An attribute lookup on a plain integer raises ``AttributeError``, so
        guarding it with ``except ValueError`` never worked.
        """
        return getattr(item, "id", item)

    def get_board_tree(self):
        """Return all categories with their ``children`` lists populated.

        Each category receives the boards that belong to it and have
        ``child_level == 0``; each board receives the boards whose
        ``parent_board`` equals its id.
        """
        categories = self.get_categories()
        boards = self.get_boards()
        for category in categories:
            category.children = [
                child for child in boards
                if child.category == category.id and child.child_level == 0
            ]
        for board in boards:
            board.children = [
                child for child in boards
                if child.parent_board == board.id
            ]
        return categories

    def get_categories(self):
        """Return every row of ``GET_CATEGORIES`` as a ``Category``."""
        cursor = self.connection.cursor()
        cursor.execute(GET_CATEGORIES)
        return [Category(category) for category in cursor.fetchall()]

    def get_boards(self):
        """Return every row of ``GET_BOARDS`` as a ``Board``."""
        cursor = self.connection.cursor()
        cursor.execute(GET_BOARDS)
        return [Board(board) for board in cursor.fetchall()]

    def get_threads_in_board(self, board, page=0, per_page=2000):
        """Return one page of threads of a board as ``Thread`` objects.

        ``board`` may be a board object or a raw board id.  ``page`` is
        zero-based; ``per_page`` and ``page * per_page`` are bound to the
        remaining parameters of ``GET_THREADS``.
        """
        board = self._resolve_id(board)
        cursor = self.connection.cursor()
        cursor.execute(GET_THREADS, (board, per_page, page * per_page))
        return [Thread(thread) for thread in cursor.fetchall()]

    def get_posts_in_thread(self, thread, page=0, per_page=15):
        """Return one page of posts of a thread as ``Post`` objects.

        ``thread`` may be a thread object or a raw thread id.  ``page`` is
        zero-based; ``per_page`` and ``page * per_page`` are bound to the
        ``LIMIT`` and ``OFFSET`` parameters of ``GET_POSTS``.
        """
        thread = self._resolve_id(thread)
        cursor = self.connection.cursor()
        cursor.execute(GET_POSTS, (thread, per_page, page * per_page))
        return [Post(post) for post in cursor.fetchall()]
class Category:
    """A forum category built from a database row.

    Reads the ``id_cat`` and ``name`` columns; the name is passed through
    ``fix_encoding``.  ``children`` starts empty and is filled in by
    ``Forum.get_board_tree``.
    """

    def __init__(self, row):
        category_id = row['id_cat']
        display_name = fix_encoding(row['name'])

        self.id = category_id
        self.name = display_name
        self.children = list()
class Board:
    """A forum board built from a database row.

    Numeric columns (``id_board``, ``id_cat``, ``id_parent``,
    ``child_level``) are copied as-is; ``name`` and ``description`` are
    passed through ``fix_encoding``.  ``children`` starts empty and is
    filled in by ``Forum.get_board_tree``.
    """

    def __init__(self, row):
        # Identity and position in the board hierarchy.
        self.id, self.category = row['id_board'], row['id_cat']
        self.parent_board, self.child_level = row['id_parent'], row['child_level']

        # Human-readable text columns.
        self.name, self.description = (
            fix_encoding(row[column]) for column in ('name', 'description')
        )

        self.children = list()
class Thread:
    """A forum thread built from a database row.

    ``id_topic`` and ``id_board`` are copied as ``id`` and ``parent``.
    ``poster_time`` is converted with ``datetime.fromtimestamp`` (naive,
    local time).  ``subject`` and ``poster_name`` are passed through
    ``fix_encoding``.
    """

    def __init__(self, row):
        self.id, self.parent = row['id_topic'], row['id_board']

        posted_at = row['poster_time']
        self.datetime = datetime.fromtimestamp(posted_at)

        self.subject, self.poster_name = (
            fix_encoding(row[column]) for column in ('subject', 'poster_name')
        )
class Post:
    """A single forum post built from a database row.

    ``id_msg`` and ``id_topic`` are copied as ``id`` and ``parent``.
    ``poster_time`` is converted with ``datetime.fromtimestamp`` (naive,
    local time).  ``subject``, ``body`` and ``poster_name`` are passed
    through ``fix_encoding``.
    """

    def __init__(self, row):
        self.id, self.parent = row['id_msg'], row['id_topic']

        posted_at = row['poster_time']
        self.datetime = datetime.fromtimestamp(posted_at)

        self.subject, self.body, self.poster_name = (
            fix_encoding(row[column])
            for column in ('subject', 'body', 'poster_name')
        )