Fix broken UTF-8 encoding and unescape HTML entities.
This commit is contained in:
parent
4045473e65
commit
23f4789599
@ -1,10 +1,10 @@
|
||||
import os
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
|
||||
import chevron
|
||||
import bbcode
|
||||
import html
|
||||
|
||||
from .wiki import NAMESPACES as WIKI_NAMESPACES
|
||||
import mwparserfromhell
|
||||
@ -12,19 +12,14 @@ import mwparserfromhell
|
||||
# Configure the root logger at import time so INFO-level progress messages
# emitted through `logger` below are actually shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ArchiveGenerator")
|
||||
|
||||
def format_datetime(timestamp):
    """Convert a POSIX timestamp into a ``datetime`` object.

    Args:
        timestamp: Seconds since the epoch (int or float).

    Returns:
        The result of ``datetime.fromtimestamp(timestamp)``; no timezone is
        passed, so the value is a naive datetime in the local timezone.
    """
    return datetime.fromtimestamp(timestamp)
|
||||
|
||||
def prepare_thread(thread):
    """Make a thread-like object ready for template rendering.

    The object's ``subject`` attribute is replaced, in place, by the same
    text with HTML character references (``&amp;``, ``&quot;``, ...) decoded.

    Args:
        thread: Any object exposing a string ``subject`` attribute.

    Returns:
        The same object that was passed in, after mutation.
    """
    # Attribute access only: the earlier dict-copy / item-assignment steps
    # cannot be combined with ``thread.subject`` (a dict has no such
    # attribute), so they are dropped.
    thread.subject = html.unescape(thread.subject)
    return thread
|
||||
|
||||
def prepare_post(post):
    """Make a post-like object ready for template rendering.

    The subject is handled by ``prepare_thread``; the ``body`` attribute is
    run through a BBCode parser and the resulting markup then has its HTML
    character references decoded.

    Args:
        post: Any object exposing string ``subject`` and ``body`` attributes.

    Returns:
        The same object that was passed in, after mutation.
    """
    post = prepare_thread(post)

    parser = bbcode.Parser()
    # Unescaping the formatter output supersedes the former
    # item-assignment + ``.replace(...)`` step, which could not coexist with
    # attribute access on the same object.
    post.body = html.unescape(parser.format(post.body))
    return post
|
||||
|
||||
class ArchiveGenerator():
|
||||
@ -80,8 +75,8 @@ class ArchiveGenerator():
|
||||
self.generate_forum_board(forum, board, out_dir)
|
||||
|
||||
def generate_forum_board (self, forum, board, out_dir):
|
||||
board_out_dir = os.path.join(out_dir, "board-{}".format(board['id_board']))
|
||||
logger.info("Archiving board %s to %s", board['name'], board_out_dir)
|
||||
board_out_dir = os.path.join(out_dir, "board-{}".format(board.id))
|
||||
logger.info("Archiving board %s to %s", board.name, board_out_dir)
|
||||
try:
|
||||
os.makedirs(board_out_dir)
|
||||
except FileExistsError: pass
|
||||
@ -89,7 +84,7 @@ class ArchiveGenerator():
|
||||
renderer = TemplateRenderer(self.template_dir, board_out_dir)
|
||||
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
|
||||
renderer.render_template_to_file("threads", "index.html", {
|
||||
"title": " - {}".format(board['name']),
|
||||
"title": " - {}".format(board.name),
|
||||
"base": "../",
|
||||
"board": board,
|
||||
"threads": threads
|
||||
@ -99,8 +94,8 @@ class ArchiveGenerator():
|
||||
self.generate_forum_thread(forum, board, thread, board_out_dir)
|
||||
|
||||
def generate_forum_thread (self, forum, board, thread, out_dir):
|
||||
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread['id_topic']))
|
||||
logger.info("Archiving thread %s to %s", thread['subject'], thread_out_dir)
|
||||
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread.id))
|
||||
logger.info("Archiving thread %s to %s", thread.subject, thread_out_dir)
|
||||
try:
|
||||
os.makedirs(thread_out_dir)
|
||||
except FileExistsError: pass
|
||||
@ -114,9 +109,9 @@ class ArchiveGenerator():
|
||||
if len(posts) < 1:
|
||||
break
|
||||
|
||||
logger.info("Archiving page %s of thread %s", page, thread['subject'])
|
||||
logger.info("Archiving page %s of thread %s", page, thread.subject)
|
||||
renderer.render_template_to_file("posts", "page-{}.html".format(page), {
|
||||
"title": " - {} - Page {}".format(thread['subject'], page + 1),
|
||||
"title": " - {} - Page {}".format(thread.subject, page + 1),
|
||||
"base": "../../",
|
||||
"board": board,
|
||||
"thread": thread,
|
||||
|
@ -1,4 +1,5 @@
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
PREFIX = "smf_"
|
||||
GET_BOARDS = """
|
||||
@ -22,42 +23,78 @@ GET_POSTS = """
|
||||
LIMIT ? OFFSET ?
|
||||
""".format(PREFIX)
|
||||
|
||||
def fix_encoding(string):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1.

    The text is re-encoded as Latin-1 to recover the original bytes, which
    are then decoded as UTF-8. The round trip is strict: if the text contains
    characters outside Latin-1, or the recovered bytes are not valid UTF-8,
    the input is assumed to be correct already and is returned unchanged.
    (Ignoring errors instead would silently delete such characters, e.g. a
    correctly stored "é" would become an empty string.)

    Args:
        string: The text to repair. Non-string values (such as ``None`` from
            a NULL column) are passed through untouched.

    Returns:
        The repaired text, or the original value when no repair applies.
    """
    if not isinstance(string, str):
        return string
    try:
        return string.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Covers both UnicodeEncodeError and UnicodeDecodeError.
        return string
|
||||
|
||||
class Forum():
    """Read-only access to a forum stored in an SQLite database.

    Query results are wrapped in the model classes defined in this module
    (``Category``, ``Board``, ``Thread``, ``Post``).
    """

    def __init__(self, db_path):
        """Open the database at ``db_path`` with name-addressable rows."""
        self.connection = sqlite3.connect(db_path)
        self.connection.row_factory = sqlite3.Row

    def get_board_tree(self):
        """Return all categories with their board hierarchy attached.

        Each category's ``children`` holds the boards of that category whose
        ``child_level`` is 0; each board's ``children`` holds the boards
        whose ``parent_board`` equals its ``id``.
        """
        categories = self.get_categories()
        boards = self.get_boards()
        for category in categories:
            category.children = [
                child for child in boards
                if child.category == category.id and child.child_level == 0
            ]
        for board in boards:
            board.children = [
                child for child in boards if child.parent_board == board.id
            ]
        return categories

    def get_categories(self):
        """Return every row of the ``GET_CATEGORIES`` query as a ``Category``."""
        cursor = self.connection.cursor()
        cursor.execute(GET_CATEGORIES)
        return [Category(category) for category in cursor.fetchall()]

    def get_boards(self):
        """Return every row of the ``GET_BOARDS`` query as a ``Board``."""
        cursor = self.connection.cursor()
        cursor.execute(GET_BOARDS)
        return [Board(board) for board in cursor.fetchall()]

    def get_threads_in_board(self, board, page=0, per_page=2000):
        """Return one page of threads belonging to a board.

        Args:
            board: A ``Board`` (anything with an ``id`` attribute) or a raw
                board id.
            page: Zero-based page index.
            per_page: Maximum number of threads per page.

        Returns:
            A list of ``Thread`` objects.
        """
        # A plain id has no ``.id``; that raises AttributeError, which the
        # former ``except ValueError`` would never have caught.
        board = getattr(board, "id", board)
        cursor = self.connection.cursor()
        cursor.execute(GET_THREADS, (board, per_page, page * per_page))
        return [Thread(thread) for thread in cursor.fetchall()]

    def get_posts_in_thread(self, thread, page=0, per_page=15):
        """Return one page of posts belonging to a thread.

        Args:
            thread: A ``Thread`` (anything with an ``id`` attribute) or a raw
                thread id.
            page: Zero-based page index.
            per_page: Maximum number of posts per page.

        Returns:
            A list of ``Post`` objects.
        """
        thread = getattr(thread, "id", thread)
        cursor = self.connection.cursor()
        cursor.execute(GET_POSTS, (thread, per_page, page * per_page))
        return [Post(post) for post in cursor.fetchall()]
|
||||
|
||||
class Category():
    """A forum category built from one database row."""

    def __init__(self, row):
        """Populate the category from a mapping-like ``row``.

        Args:
            row: Object supporting ``row[key]`` for ``id_cat`` and ``name``.
        """
        self.id = row['id_cat']
        self.name = fix_encoding(row['name'])
        # Filled in later by the code that assembles the board tree.
        self.children = []
|
||||
|
||||
class Board():
    """A forum board built from one database row."""

    def __init__(self, row):
        """Populate the board from a mapping-like ``row``.

        Args:
            row: Object supporting ``row[key]`` for ``id_board``, ``id_cat``,
                ``id_parent``, ``child_level``, ``name`` and ``description``.
        """
        self.id = row['id_board']
        self.category = row['id_cat']
        self.parent_board = row['id_parent']
        self.child_level = row['child_level']
        self.name = fix_encoding(row['name'])
        self.description = fix_encoding(row['description'])
        # Filled in later by the code that assembles the board tree.
        self.children = []
|
||||
|
||||
class Thread():
    """A forum thread (topic) built from one database row."""

    def __init__(self, row):
        """Populate the thread from a mapping-like ``row``.

        Args:
            row: Object supporting ``row[key]`` for ``id_topic``,
                ``id_board``, ``poster_time``, ``subject`` and
                ``poster_name``.
        """
        self.id = row['id_topic']
        self.parent = row['id_board']
        # ``poster_time`` is passed to ``datetime.fromtimestamp`` with no
        # timezone, so the result is a naive local datetime.
        self.datetime = datetime.fromtimestamp(row['poster_time'])
        self.subject = fix_encoding(row['subject'])
        self.poster_name = fix_encoding(row['poster_name'])
|
||||
|
||||
class Post():
    """A single forum post (message) built from one database row."""

    def __init__(self, row):
        """Populate the post from a mapping-like ``row``.

        Args:
            row: Object supporting ``row[key]`` for ``id_msg``, ``id_topic``,
                ``poster_time``, ``subject``, ``body`` and ``poster_name``.
        """
        self.id = row['id_msg']
        self.parent = row['id_topic']
        # ``poster_time`` is passed to ``datetime.fromtimestamp`` with no
        # timezone, so the result is a naive local datetime.
        self.datetime = datetime.fromtimestamp(row['poster_time'])
        self.subject = fix_encoding(row['subject'])
        self.body = fix_encoding(row['body'])
        self.poster_name = fix_encoding(row['poster_name'])
|
Loading…
x
Reference in New Issue
Block a user