style and convention fixes to make pylint happy

Adrian Malacoda 2016-12-16 00:29:59 -06:00
parent 1db0d315b8
commit 05c766011f
9 changed files with 148 additions and 68 deletions
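
The changes below are the usual fixes for pylint's default checks: module and function docstrings, UPPER_CASE names for module-level constants, conventional import ordering, and no space between a function name and its parentheses. After this commit, a run along these lines (the top-level package name is assumed here) should come back substantially cleaner:

    pylint great_escape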

View File

@@ -1,3 +1,6 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
+
 import argparse
 from urllib.parse import urlparse
@@ -5,11 +8,29 @@ from . import scrapers, outputters
 from .util import sanitize_title

 def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )

     args = parser.parse_args()
     source = vars(args)['in']
@@ -22,7 +43,7 @@ def main ():
     print("Guessed scraper: {}".format(scraper.__name__))

     scraped = scraper.scrape(source)
-    print(scraped.title)
+
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
     print("Outputting to: {}, using {} outputter".format(out, outformat))

View File

@@ -1,10 +1,20 @@
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+
+# pylint: disable=too-few-public-methods, too-many-arguments
+
 class Forum(object):
+    """Forum represents an entire web forum."""
     def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []

 class Post(object):
+    """Post represents a singular post in a thread."""
     def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
@@ -12,11 +22,13 @@ class Post (object):
         self.timestamp = timestamp

 class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
     def __init__(self, title=None):
         self.title = title
         self.children = []

 class User(object):
+    """User represents an individual user of a forum."""
     def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
@@ -25,12 +37,15 @@ class User (object):
         self.avatar = avatar

 class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []

 class Board(object):
+    """Board represents a board which contains threads."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
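
Taken together, these classes form a tree: a Forum holds Categories, a Category holds Boards, a Board holds Threads, and a Thread holds Posts, with the child lists named children (Board's is initialized just past the visible hunk; the scraper code later in this commit appends to it). A hand-assembled sketch, all values invented:

    forum = Forum(title="Example Forum")
    category = Category(title="General", description="Anything goes")
    board = Board(title="Introductions", description="Say hello")
    thread = Thread(title="Welcome!")

    thread.children.append(Post(
        title="Welcome!",
        body="First post.",
        author=User(name="admin"),
        timestamp=1481870999.0,  # invented epoch value
    ))
    board.children.append(thread)
    category.children.append(board)
    forum.categories.append(category)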

View File

@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
+
 from . import json, pickle

-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]

 def get_outputter(name):
+    """Get the outputter with the specified name."""
-    for outputter in outputters:
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
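
The lookup keys on each module's dotted __name__, so get_outputter("json") matches the module whose name ends in ".json". Note that it returns None for an unknown name rather than raising the way get_scraper does below. Usage, with the forum object assumed to come from a scraper:

    outputter = get_outputter("json")
    outputter.output(forum, "my-forum")  # writes the tree sketched under the next file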

View File

@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""
 import json
 import os

+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
 def output(data, destination):
+    """Output the given object to the specified folder."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):
         output_board(data, destination)
@@ -13,17 +16,23 @@ def output (data, destination):
         output_thread(data, destination)

 def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)
     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))
     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )

 def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:
@@ -36,5 +45,6 @@ def output_board (data, destination):
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))

 def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
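
The net effect of these three functions is a directory tree that mirrors the forum hierarchy, roughly as sketched here (all names pass through sanitize_title; thread files get no extension):

    <out>/
        index.json                  # {"title": <forum title>}
        <category-title>/
            <board-title>/
                index.json          # board metadata
                threads/
                    <thread-title>  # one JSON document per thread, serialized via default=vars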

View File

@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+
+The output of this outputter can be read with the pickle scraper."""
 import pickle

 def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
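
Because pickle.dump serializes the whole object graph, this outputter doubles as a lossless checkpoint format. A sketch of writing one (the top-level package name and alias are assumed):

    from great_escape.outputters import pickle as pickle_outputter  # import path assumed

    pickle_outputter.output(forum, "forum.pickle")  # forum: a previously scraped Forum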

View File

@@ -1,16 +1,21 @@
+"""Scrapers accept inputs located somewhere (at a URL or local file)
+and scrape them into objects, which can be dumped by an outputter."""
+
 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]

 def get_scraper(name):
+    """Get the scraper with the specified name."""
-    for scraper in scrapers:
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper
     raise Exception("Unknown scraper: {}".format(name))

 def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
-    for scraper in scrapers:
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
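
guess_scraper probes each module's namespace for a can_scrape_url hook, so only scrapers that define one (yuku does, pickle does not) take part in guessing, and the function falls through to None when nothing matches; get_scraper, by contrast, raises on an unknown name. For example (URL invented):

    scraper = guess_scraper("http://example.fr.yuku.com")  # -> the yuku module
    scraper = get_scraper("pickle")                        # -> the pickle module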

View File

@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
+
 import pickle

 def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
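
Reading back the checkpoint written by the pickle outputter above (continuing the assumed package name):

    from great_escape.scrapers import pickle as pickle_scraper  # import path assumed

    forum = pickle_scraper.scrape("forum.pickle")
    print(forum.title)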

View File

@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+# pylint: disable=no-member
 from urllib.parse import urlparse
 from time import strptime, mktime
+
 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread
+
+TIME_FORMAT = "%b %d %y %I:%M %p"

 def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

 def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
@@ -21,17 +29,19 @@ def scrape (url):
 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
 def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

 def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
     while True:
-        d = get_document(url=url)
-        yield d
-        nextlink = d("a[accesskey=n]")
+        doc = get_document(url=url)
+        yield doc
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break
@@ -39,16 +49,18 @@ def get_paged_document (url):
         print(" --> Following next page link to: {}".format(url))

 def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
@@ -56,24 +68,26 @@ def scrape_index (url):
     return forum

 def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))
     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
@@ -82,21 +96,23 @@ def scrape_board_from_document (url, d):
     return board

 def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))
     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
         thread.children.append(Post(
             author=User(
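
The hunk at the end is the fallback for posts whose date element carries plain text instead of a <time> tag: TIME_FORMAT describes strings like the one below, which strptime/mktime turn into epoch seconds (sample date invented):

    from time import strptime, mktime

    TIME_FORMAT = "%b %d %y %I:%M %p"
    print(mktime(strptime("Dec 16 16 12:29 AM", TIME_FORMAT)))  # epoch seconds, local timezone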

View File

@ -1,6 +1,9 @@
characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""'] """Utility functions."""
CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
def sanitize_title(title): def sanitize_title(title):
for character in characters_to_replace: """Sanitizes the given title by removing certain characters."""
for character in CHARACTERS_TO_REPLACE:
title = title.replace(character, "-") title = title.replace(character, "-")
return title return title
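
One nuance the docstring glosses over: characters are replaced with "-" rather than removed, so runs of dashes can appear, and the final entry '""' is a two-character sequence (two double quotes), not an empty string. For example:

    print(sanitize_title("What's new? Announcements & news!"))
    # -> What-s-new--Announcements---news-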