style and convention fixes to make pylint happy

Adrian Malacoda 2016-12-16 00:29:59 -06:00
parent 1db0d315b8
commit 05c766011f
9 changed files with 148 additions and 68 deletions

View File

@@ -1,15 +1,36 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
 import argparse
 from urllib.parse import urlparse

 from . import scrapers, outputters
 from .util import sanitize_title

-def main ():
+def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )

     args = parser.parse_args()
     source = vars(args)['in']

@@ -22,7 +43,7 @@ def main ():
     print("Guessed scraper: {}".format(scraper.__name__))

     scraped = scraper.scrape(source)
-    print(scraped.title)
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
     print("Outputting to: {}, using {} outputter".format(out, outformat))

View File

@@ -1,37 +1,52 @@
-class Forum (object):
-    def __init__ (self, title=None):
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+# pylint: disable=too-few-public-methods, too-many-arguments
+class Forum(object):
+    """Forum represents an entire web forum."""
+    def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []

-class Post (object):
-    def __init__ (self, title=None, body=None, author=None, timestamp=None):
+class Post(object):
+    """Post represents a singular post in a thread."""
+    def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
         self.author = author
         self.timestamp = timestamp

-class Thread (object):
-    def __init__ (self, title=None):
+class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
+    def __init__(self, title=None):
         self.title = title
         self.children = []

-class User (object):
-    def __init__ (self, name=None, signature=None, avatar=None, title=None, subtitle=None):
+class User(object):
+    """User represents an individual user of a forum."""
+    def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
         self.title = title
         self.subtitle = subtitle
         self.avatar = avatar

-class Category (object):
-    def __init__ (self, title=None, description=None):
+class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []

-class Board (object):
-    def __init__ (self, title=None, description=None):
+class Board(object):
+    """Board represents a board which contains threads."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []
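
To see how these classes fit together: a Forum holds Categories, a Category's children are Boards, a Board's children are Threads, and a Thread's children are Posts, each with an author User. A hypothetical sketch (the import path depends on where model.py lives in the package, which this diff doesn't show):

    from model import Forum, Category, Board, Thread, Post, User

    forum = Forum(title="Example Forum")
    category = Category(title="General", description="General discussion")
    board = Board(title="Announcements", description="Site news")
    thread = Thread(title="Welcome")
    thread.children.append(Post(title="Welcome", body="First post.",
                                author=User(name="admin"), timestamp=0))
    board.children.append(thread)
    category.children.append(board)
    forum.categories.append(category)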

View File

@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
 from . import json, pickle

-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]

-def get_outputter (name):
-    for outputter in outputters:
+def get_outputter(name):
+    """Get the outputter with the specified name."""
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
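
The lookup matches on the tail of each module's __name__ (for example, a module named "...outputters.json" matches via endswith(".json")). Unlike get_scraper below, it falls through and returns None for an unknown name, so callers may want a guard; a hedged usage sketch (import path assumed):

    from outputters import get_outputter

    outputter = get_outputter("json")
    if outputter is None:
        raise ValueError("Unknown outputter: json")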

View File

@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""
 import json
 import os

-def output (data, destination):
+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
+def output(data, destination):
+    """Output the given object to the specified folder."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):

@@ -12,18 +15,24 @@ def output (data, destination):
     elif isinstance(data, Thread):
         output_thread(data, destination)

-def output_forum (data, destination):
+def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)
     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))
     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )

-def output_board (data, destination):
+def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:

@@ -35,6 +44,7 @@ def output_board (data, destination):
     for thread in data.children:
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))

-def output_thread (data, destination):
+def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
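
The default=vars argument is what lets plain model objects serialize: json.dumps calls default for any object it cannot encode natively, and vars(obj) returns obj.__dict__, so a tree of Thread/Post/User objects is emitted as nested dicts. A self-contained sketch with a stand-in class:

    import json

    class Post:  # stand-in for model.Post
        def __init__(self, title, body):
            self.title = title
            self.body = body

    # prints the object's __dict__ as JSON
    print(json.dumps(Post("Hi", "First post."), default=vars, indent=4))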

View File

@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+
+The output of this outputter can be read with the pickle scraper."""
 import pickle

-def output (data, destination):
+def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
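
This outputter and the pickle scraper below are designed as inverses: pickle.dump here, pickle.load there. A round-trip sketch (a plain dict stands in for a scraped model object; real dumps need the model classes importable at load time):

    import pickle

    data = {"title": "example"}
    with open("dump.pickle", "wb") as out_file:
        pickle.dump(data, out_file)
    with open("dump.pickle", "rb") as in_file:
        assert pickle.load(in_file) == data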

View File

@@ -1,16 +1,21 @@
+"""Scrapers accept an input located somewhere (at a URL or local file)
+and scrape them into objects, which can be dumped by an outputter."""
 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]

-def get_scraper (name):
-    for scraper in scrapers:
+def get_scraper(name):
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper
     raise Exception("Unknown scraper: {}".format(name))

-def guess_scraper (url):
-    for scraper in scrapers:
+def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
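
The guess_scraper check relies on vars(module) being the module's namespace dict, so "can_scrape_url" in vars(scraper) is a duck-typed test for whether a scraper module opts into URL guessing (the yuku module defines the hook; the pickle scraper doesn't). A small demonstration with a synthetic module:

    import types

    mod = types.ModuleType("demo")
    print("can_scrape_url" in vars(mod))            # False
    mod.can_scrape_url = lambda url: "yuku" in url
    print("can_scrape_url" in vars(mod))            # True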

View File

@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
 import pickle

-def scrape (source):
+def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)

View File

@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+# pylint: disable=no-member
 from urllib.parse import urlparse
 from time import strptime, mktime

 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread

-def can_scrape_url (url):
+TIME_FORMAT = "%b %d %y %I:%M %p"
+
+def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

-def scrape (url):
+def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)

@@ -20,60 +28,66 @@ def scrape (url):
     return scrape_index(url)

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def get_document (url):
+def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

-def get_paged_document (url):
+def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
     while True:
-        d = get_document(url=url)
-        yield d
-        nextlink = d("a[accesskey=n]")
+        doc = get_document(url=url)
+        yield doc
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break
         url = "{}{}".format(baseurl, nextlink.attr.href)
         print(" --> Following next page link to: {}".format(url))

-def scrape_index (url):
+def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
     return forum

-def scrape_board_from_url (url):
+def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))
     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))

@@ -81,22 +95,24 @@ def scrape_board_from_document (url, d):
     return board

-def scrape_thread_from_url (url):
+def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))
     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.

@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
         thread.children.append(Post(
             author=User(
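
Two details worth noting in this scraper: the retrying library's @retry decorator backs off exponentially (multiplier 1000 ms, capped at 10 seconds between attempts) when Yuku fails to serve a page, and the timestamp fallback parses Yuku's plain-text dates with TIME_FORMAT when no <time> element is present. The fallback branch in isolation (the date string is a made-up example):

    from time import strptime, mktime

    TIME_FORMAT = "%b %d %y %I:%M %p"
    # seconds since the epoch, interpreted as local time
    print(mktime(strptime("Dec 16 16 12:29 AM", TIME_FORMAT)))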

View File

@@ -1,6 +1,9 @@
-characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+"""Utility functions."""
+CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']

-def sanitize_title (title):
-    for character in characters_to_replace:
+def sanitize_title(title):
+    """Sanitizes the given title by removing certain characters."""
+    for character in CHARACTERS_TO_REPLACE:
         title = title.replace(character, "-")
     return title
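
A quick check of the behavior (note that each listed character is replaced with a hyphen rather than stripped, so the docstring's "removing" is loose; import path assumed):

    from util import sanitize_title

    print(sanitize_title("Is this /the/ thread?!"))  # Is-this--the--thread--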