Fix broken UTF-8 encoding and unescape HTML entities.
This commit is contained in:
parent
4045473e65
commit
23f4789599
@ -1,10 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import chevron
|
import chevron
|
||||||
import bbcode
|
import bbcode
|
||||||
|
import html
|
||||||
|
|
||||||
from .wiki import NAMESPACES as WIKI_NAMESPACES
|
from .wiki import NAMESPACES as WIKI_NAMESPACES
|
||||||
import mwparserfromhell
|
import mwparserfromhell
|
||||||
@ -12,19 +12,14 @@ import mwparserfromhell
|
|||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger("ArchiveGenerator")
|
logger = logging.getLogger("ArchiveGenerator")
|
||||||
|
|
||||||
def format_datetime (timestamp):
    """Convert ``timestamp`` with ``datetime.fromtimestamp`` and return the result."""
    converted = datetime.fromtimestamp(timestamp)
    return converted
|
|
||||||
|
|
||||||
def prepare_thread (thread):
    """Unescape HTML entities in ``thread.subject`` and return the same object.

    The object is modified in place; the return value is the argument itself.
    """
    unescaped_subject = html.unescape(thread.subject)
    thread.subject = unescaped_subject
    return thread
|
||||||
|
|
||||||
def prepare_post (post):
    """Prepare a post for rendering and return it.

    The post first goes through ``prepare_thread``. Its ``body`` is then
    formatted by a fresh ``bbcode.Parser`` and the formatted text is passed
    through ``html.unescape`` before being stored back on ``body``.
    """
    prepared = prepare_thread(post)
    rendered_body = bbcode.Parser().format(prepared.body)
    prepared.body = html.unescape(rendered_body)
    return prepared
|
||||||
|
|
||||||
class ArchiveGenerator():
|
class ArchiveGenerator():
|
||||||
@ -80,8 +75,8 @@ class ArchiveGenerator():
|
|||||||
self.generate_forum_board(forum, board, out_dir)
|
self.generate_forum_board(forum, board, out_dir)
|
||||||
|
|
||||||
def generate_forum_board (self, forum, board, out_dir):
|
def generate_forum_board (self, forum, board, out_dir):
|
||||||
board_out_dir = os.path.join(out_dir, "board-{}".format(board['id_board']))
|
board_out_dir = os.path.join(out_dir, "board-{}".format(board.id))
|
||||||
logger.info("Archiving board %s to %s", board['name'], board_out_dir)
|
logger.info("Archiving board %s to %s", board.name, board_out_dir)
|
||||||
try:
|
try:
|
||||||
os.makedirs(board_out_dir)
|
os.makedirs(board_out_dir)
|
||||||
except FileExistsError: pass
|
except FileExistsError: pass
|
||||||
@ -89,7 +84,7 @@ class ArchiveGenerator():
|
|||||||
renderer = TemplateRenderer(self.template_dir, board_out_dir)
|
renderer = TemplateRenderer(self.template_dir, board_out_dir)
|
||||||
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
|
threads = [prepare_thread(thread) for thread in forum.get_threads_in_board(board)]
|
||||||
renderer.render_template_to_file("threads", "index.html", {
|
renderer.render_template_to_file("threads", "index.html", {
|
||||||
"title": " - {}".format(board['name']),
|
"title": " - {}".format(board.name),
|
||||||
"base": "../",
|
"base": "../",
|
||||||
"board": board,
|
"board": board,
|
||||||
"threads": threads
|
"threads": threads
|
||||||
@ -99,8 +94,8 @@ class ArchiveGenerator():
|
|||||||
self.generate_forum_thread(forum, board, thread, board_out_dir)
|
self.generate_forum_thread(forum, board, thread, board_out_dir)
|
||||||
|
|
||||||
def generate_forum_thread (self, forum, board, thread, out_dir):
|
def generate_forum_thread (self, forum, board, thread, out_dir):
|
||||||
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread['id_topic']))
|
thread_out_dir = os.path.join(out_dir, "thread-{}".format(thread.id))
|
||||||
logger.info("Archiving thread %s to %s", thread['subject'], thread_out_dir)
|
logger.info("Archiving thread %s to %s", thread.subject, thread_out_dir)
|
||||||
try:
|
try:
|
||||||
os.makedirs(thread_out_dir)
|
os.makedirs(thread_out_dir)
|
||||||
except FileExistsError: pass
|
except FileExistsError: pass
|
||||||
@ -114,9 +109,9 @@ class ArchiveGenerator():
|
|||||||
if len(posts) < 1:
|
if len(posts) < 1:
|
||||||
break
|
break
|
||||||
|
|
||||||
logger.info("Archiving page %s of thread %s", page, thread['subject'])
|
logger.info("Archiving page %s of thread %s", page, thread.subject)
|
||||||
renderer.render_template_to_file("posts", "page-{}.html".format(page), {
|
renderer.render_template_to_file("posts", "page-{}.html".format(page), {
|
||||||
"title": " - {} - Page {}".format(thread['subject'], page + 1),
|
"title": " - {} - Page {}".format(thread.subject, page + 1),
|
||||||
"base": "../../",
|
"base": "../../",
|
||||||
"board": board,
|
"board": board,
|
||||||
"thread": thread,
|
"thread": thread,
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
PREFIX = "smf_"
|
PREFIX = "smf_"
|
||||||
GET_BOARDS = """
|
GET_BOARDS = """
|
||||||
@ -22,42 +23,78 @@ GET_POSTS = """
|
|||||||
LIMIT ? OFFSET ?
|
LIMIT ? OFFSET ?
|
||||||
""".format(PREFIX)
|
""".format(PREFIX)
|
||||||
|
|
||||||
|
def fix_encoding (string):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1.

    The string is re-encoded as Latin-1 and the resulting bytes are decoded
    as UTF-8. Both steps are strict: if either fails, the input is not
    mojibake of this kind and is returned unchanged, rather than having the
    offending characters silently dropped.

    Empty or ``None`` values are returned as-is.
    """
    if not string:
        return string
    try:
        return string.encode("latin1").decode("utf-8")
    except UnicodeError:
        # Either the text contains characters outside Latin-1, or its
        # Latin-1 bytes are not valid UTF-8; leave it untouched.
        return string
|
||||||
|
|
||||||
class Forum():
    """Query helper around the forum's SQLite database.

    Rows returned by the module's queries are wrapped in ``Category``,
    ``Board``, ``Thread`` and ``Post`` objects.
    """

    def __init__ (self, db_path):
        """Open the database at ``db_path`` and use ``sqlite3.Row`` as row factory."""
        self.connection = sqlite3.connect(db_path)
        self.connection.row_factory = sqlite3.Row

    @staticmethod
    def _entity_id (entity):
        """Return ``entity.id`` when present, otherwise ``entity`` itself.

        This lets callers pass either a wrapped object or a raw id.
        """
        return getattr(entity, "id", entity)

    def _fetch_all (self, query, parameters=()):
        """Execute ``query`` with ``parameters`` and return every resulting row."""
        cursor = self.connection.cursor()
        cursor.execute(query, parameters)
        return cursor.fetchall()

    def get_board_tree (self):
        """Return all categories with their ``children`` lists filled in.

        A category's children are the boards in that category whose
        ``child_level`` is 0; a board's children are the boards whose
        ``parent_board`` equals its id.
        """
        categories = self.get_categories()
        boards = self.get_boards()
        for category in categories:
            category.children = [
                child for child in boards
                if child.category == category.id and child.child_level == 0
            ]
        for board in boards:
            board.children = [child for child in boards if child.parent_board == board.id]
        return categories

    def get_categories (self):
        """Return every row of ``GET_CATEGORIES`` wrapped in ``Category``."""
        return [Category(row) for row in self._fetch_all(GET_CATEGORIES)]

    def get_boards (self):
        """Return every row of ``GET_BOARDS`` wrapped in ``Board``."""
        return [Board(row) for row in self._fetch_all(GET_BOARDS)]

    def get_threads_in_board (self, board, page=0, per_page=2000):
        """Return one page of threads for ``board`` (an object with ``id`` or a raw id).

        ``page`` is zero-based; the query offset is ``page * per_page``.
        """
        parameters = (self._entity_id(board), per_page, page * per_page)
        return [Thread(row) for row in self._fetch_all(GET_THREADS, parameters)]

    def get_posts_in_thread (self, thread, page=0, per_page=15):
        """Return one page of posts for ``thread`` (an object with ``id`` or a raw id).

        ``page`` is zero-based; the query offset is ``page * per_page``.
        """
        parameters = (self._entity_id(thread), per_page, page * per_page)
        return [Post(row) for row in self._fetch_all(GET_POSTS, parameters)]
|
||||||
|
|
||||||
|
class Category():
    """A forum category built from one database row."""

    def __init__ (self, row):
        """Read ``id_cat`` and ``name`` from ``row``; start with no children."""
        category_id = row['id_cat']
        display_name = fix_encoding(row['name'])
        self.id = category_id
        self.name = display_name
        # Starts empty; Forum.get_board_tree assigns the real list.
        self.children = []
|
||||||
|
|
||||||
|
class Board():
    """A forum board built from one database row."""

    def __init__ (self, row):
        """Copy the board columns from ``row``.

        ``name`` and ``description`` are passed through ``fix_encoding``;
        the remaining columns are stored unchanged.
        """
        # Identity and position in the board hierarchy.
        self.id = row['id_board']
        self.category = row['id_cat']
        self.parent_board = row['id_parent']
        self.child_level = row['child_level']

        # Text columns.
        raw_name = row['name']
        self.name = fix_encoding(raw_name)
        raw_description = row['description']
        self.description = fix_encoding(raw_description)

        # Starts empty; Forum.get_board_tree assigns the real list.
        self.children = []
|
||||||
|
|
||||||
|
class Thread():
    """A forum thread built from one database row."""

    def __init__ (self, row):
        """Copy the thread columns from ``row``.

        ``poster_time`` is converted with ``datetime.fromtimestamp``;
        ``subject`` and ``poster_name`` are passed through ``fix_encoding``.
        """
        self.id = row['id_topic']
        self.parent = row['id_board']

        posted_at = datetime.fromtimestamp(row['poster_time'])
        self.datetime = posted_at

        self.subject = fix_encoding(row['subject'])
        self.poster_name = fix_encoding(row['poster_name'])
|
||||||
|
|
||||||
|
class Post():
    """A forum post built from one database row."""

    def __init__ (self, row):
        """Copy the post columns from ``row``.

        ``poster_time`` is converted with ``datetime.fromtimestamp``;
        ``subject``, ``body`` and ``poster_name`` are passed through
        ``fix_encoding``.
        """
        self.id = row['id_msg']
        self.parent = row['id_topic']

        posted_at = datetime.fromtimestamp(row['poster_time'])
        self.datetime = posted_at

        self.subject = fix_encoding(row['subject'])
        self.body = fix_encoding(row['body'])
        self.poster_name = fix_encoding(row['poster_name'])
|
Loading…
x
Reference in New Issue
Block a user