From 05c766011fec33c9ac7ded2aa105fe2dbf8e36ac Mon Sep 17 00:00:00 2001 From: Adrian Malacoda Date: Fri, 16 Dec 2016 00:29:59 -0600 Subject: [PATCH] style and convention fixes to make pylint happy --- tge/__init__.py | 33 +++++++++++++---- tge/model.py | 39 +++++++++++++------- tge/outputters/__init__.py | 9 +++-- tge/outputters/json.py | 26 +++++++++----- tge/outputters/pickle.py | 6 +++- tge/scrapers/__init__.py | 15 +++++--- tge/scrapers/pickle.py | 5 ++- tge/scrapers/yuku.py | 74 +++++++++++++++++++++++--------------- tge/util.py | 9 +++-- 9 files changed, 148 insertions(+), 68 deletions(-) diff --git a/tge/__init__.py b/tge/__init__.py index 93bf780..3549e35 100644 --- a/tge/__init__.py +++ b/tge/__init__.py @@ -1,15 +1,36 @@ +"""The Great Escape is a tool for scraping data from a web forum and +exporting it into a format which can be re-imported.""" + import argparse from urllib.parse import urlparse from . import scrapers, outputters from .util import sanitize_title -def main (): +def main(): + """The Great Escape's entry point.""" parser = argparse.ArgumentParser(description="Forum scraper") - parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess") - parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape") - parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url") - parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used") + parser.add_argument( + "--scraper", + dest="scraper", + help="Scraper to use; if not specified, tries to guess" + ) + parser.add_argument( + "--in", + dest="in", + required=True, + help="URL or file to scrape" + ) + parser.add_argument( + "--out", + dest="out", + help="Path to output; if not specified, is the target forum's url" + ) + parser.add_argument( + "--outformat", + dest="outformat", + help="Format to output data out; if not specified, default (JSON-based) format is used" + ) args = parser.parse_args() source = vars(args)['in'] @@ -22,7 +43,7 @@ def main (): print("Guessed scraper: {}".format(scraper.__name__)) scraped = scraper.scrape(source) - + print(scraped.title) out = args.out if args.out else sanitize_title(scraped.title) outformat = args.outformat if args.outformat else "json" print("Outputting to: {}, using {} outputter".format(out, outformat)) diff --git a/tge/model.py b/tge/model.py index ea76ed8..bf28a26 100644 --- a/tge/model.py +++ b/tge/model.py @@ -1,37 +1,52 @@ -class Forum (object): - def __init__ (self, title=None): +"""The Great Escape model objects. + +Note that, depending on the forum software, terms might have different meanings. +For example, sometimes "board" refers to the entire site and "forum" to a subsection. +""" + +# pylint: disable=too-few-public-methods, too-many-arguments + +class Forum(object): + """Forum represents an entire web forum.""" + def __init__(self, title=None): self.title = title self.users = [] self.categories = [] -class Post (object): - def __init__ (self, title=None, body=None, author=None, timestamp=None): +class Post(object): + """Post represents a singular post in a thread.""" + def __init__(self, title=None, body=None, author=None, timestamp=None): self.title = title self.body = body self.author = author self.timestamp = timestamp -class Thread (object): - def __init__ (self, title=None): +class Thread(object): + """Thread represents a thread, or topic, in a board, on a forum.""" + def __init__(self, title=None): self.title = title self.children = [] -class User (object): - def __init__ (self, name=None, signature=None, avatar=None, title=None, subtitle=None): +class User(object): + """User represents an individual user of a forum.""" + def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None): self.name = name self.signature = signature self.title = title self.subtitle = subtitle self.avatar = avatar -class Category (object): - def __init__ (self, title=None, description=None): +class Category(object): + """Category represents a category of boards. + Note however in some forum software categories are a type of board.""" + def __init__(self, title=None, description=None): self.title = title self.description = description self.children = [] -class Board (object): - def __init__ (self, title=None, description=None): +class Board(object): + """Board represents a board which contains threads.""" + def __init__(self, title=None, description=None): self.title = title self.description = description self.children = [] diff --git a/tge/outputters/__init__.py b/tge/outputters/__init__.py index 8e26d94..b01d3d6 100644 --- a/tge/outputters/__init__.py +++ b/tge/outputters/__init__.py @@ -1,9 +1,12 @@ +"""Outputters take scraped objects and save them to a certain format.""" + from . import json, pickle -outputters = [json, pickle] +OUTPUTTERS = [json, pickle] -def get_outputter (name): - for outputter in outputters: +def get_outputter(name): + """Get the outputter with the specified name.""" + for outputter in OUTPUTTERS: if outputter.__name__.endswith(".{}".format(name)): return outputter diff --git a/tge/outputters/json.py b/tge/outputters/json.py index 697cc6b..d15f5d2 100644 --- a/tge/outputters/json.py +++ b/tge/outputters/json.py @@ -1,10 +1,13 @@ -from ..model import User, Category, Forum, Board, Post, Thread -from ..util import sanitize_title +"""JSON outputter.""" import json import os -def output (data, destination): +from ..model import Forum, Board, Thread +from ..util import sanitize_title + +def output(data, destination): + """Output the given object to the specified folder.""" if isinstance(data, Forum): output_forum(data, destination) elif isinstance(data, Board): @@ -12,18 +15,24 @@ def output (data, destination): elif isinstance(data, Thread): output_thread(data, destination) -def output_forum (data, destination): +def output_forum(data, destination): + """Output the given Forum object to the specified folder.""" os.makedirs(destination) with open(os.path.join(destination, "index.json"), "w") as out_file: out_file.write(json.dumps({"title": data.title}, indent=4)) for category in data.categories: - os.makedirs(os.path.join(destination, sanitize_title(category.title))) + category_dir = os.path.join(destination, sanitize_title(category.title)) + os.makedirs(category_dir) for board in category.children: - output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title))) + output_board( + board, + os.path.join(category_dir, sanitize_title(board.title)) + ) -def output_board (data, destination): +def output_board(data, destination): + """Output the given Board object to the specified folder.""" os.makedirs(destination) os.makedirs(os.path.join(destination, "threads")) with open(os.path.join(destination, "index.json"), "w") as out_file: @@ -35,6 +44,7 @@ def output_board (data, destination): for thread in data.children: output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title))) -def output_thread (data, destination): +def output_thread(data, destination): + """Output the given Thread object to the specified file.""" with open(destination, "w") as out_file: out_file.write(json.dumps(data, default=vars, indent=4)) diff --git a/tge/outputters/pickle.py b/tge/outputters/pickle.py index 92e37f4..d2d56bb 100644 --- a/tge/outputters/pickle.py +++ b/tge/outputters/pickle.py @@ -1,5 +1,9 @@ +"""Outputter based on Python's pickle module. +The output of this outputter can be read with the pickle scraper.""" + import pickle -def output (data, destination): +def output(data, destination): + """Output the given object into the specified pickle file.""" with open(destination, "wb") as out_file: pickle.dump(data, out_file) diff --git a/tge/scrapers/__init__.py b/tge/scrapers/__init__.py index c23ad63..02b01cb 100644 --- a/tge/scrapers/__init__.py +++ b/tge/scrapers/__init__.py @@ -1,16 +1,21 @@ +"""Scrapers accept an input located somewhere (at a URL or local file) +and scrape them into objects, which can be dumped by an outputter.""" + from . import yuku, pickle -scrapers = [yuku, pickle] +SCRAPERS = [yuku, pickle] -def get_scraper (name): - for scraper in scrapers: +def get_scraper(name): + """Get the scraper with the specified name.""" + for scraper in SCRAPERS: if scraper.__name__.endswith(".{}".format(name)): return scraper raise Exception("Unknown scraper: {}".format(name)) -def guess_scraper (url): - for scraper in scrapers: +def guess_scraper(url): + """Attempt to guess the correct scraper for handling the given path or URL.""" + for scraper in SCRAPERS: if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url): return scraper diff --git a/tge/scrapers/pickle.py b/tge/scrapers/pickle.py index 93c7471..26110be 100644 --- a/tge/scrapers/pickle.py +++ b/tge/scrapers/pickle.py @@ -1,5 +1,8 @@ +"""The pickle scraper reads a pickled file saved by the pickle outputter.""" + import pickle -def scrape (source): +def scrape(source): + """Load the given pickle file into an object.""" with open(source, "rb") as in_file: return pickle.load(in_file) diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py index ed63cfc..beec209 100644 --- a/tge/scrapers/yuku.py +++ b/tge/scrapers/yuku.py @@ -1,16 +1,24 @@ -from ..model import User, Category, Forum, Board, Post, Thread +"""Scraper for Yuku Forumer forums.""" + +# pylint: disable=no-member + from urllib.parse import urlparse from time import strptime, mktime + import dateutil.parser from pyquery import PyQuery as pq from retrying import retry -time_format = "%b %d %y %I:%M %p" +from ..model import User, Category, Forum, Board, Post, Thread -def can_scrape_url (url): +TIME_FORMAT = "%b %d %y %I:%M %p" + +def can_scrape_url(url): + """Returns true if this url can be scraped by this scraper.""" return ".fr.yuku.com" in url -def scrape (url): +def scrape(url): + """Scrapes the URL into an object.""" path = urlparse(url).path if path.startswith("/topic/"): return scrape_thread_from_url(url) @@ -20,60 +28,66 @@ def scrape (url): return scrape_index(url) @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000) -def get_document (url): +def get_document(url): + """Returns a pyquery document for the specified url, retrying if necessary.""" return pq(url=url) -def get_paged_document (url): +def get_paged_document(url): + """Returns a generator that yields all pages of the specified url.""" urlparts = urlparse(url) baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) while True: - d = get_document(url=url) - yield d + doc = get_document(url=url) + yield doc - nextlink = d("a[accesskey=n]") + nextlink = doc("a[accesskey=n]") if not nextlink: break url = "{}{}".format(baseurl, nextlink.attr.href) print(" --> Following next page link to: {}".format(url)) -def scrape_index (url): +def scrape_index(url): + """Scrapes the forum index at url into a Forum object.""" print("Scraping forum index from url: {}".format(url)) urlparts = urlparse(url) - d = get_document(url=url) - forum = Forum(title=d("title").text()) - for category_element in d("div.span9 > div.row-fluid").items(): + doc = get_document(url=url) + forum = Forum(title=doc("title").text()) + for category_element in doc("div.span9 > div.row-fluid").items(): category = Category(title=category_element.find("h3").text()) forum.categories.append(category) for board_link in category_element.find("a[href^='/forums/']").items(): - board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)) + full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href) + board = scrape_board_from_url(full_url) board.description = board_link.closest("div").find("p").eq(0).text() category.children.append(board) print("Finished scraping all boards in category: {}".format(category.title)) return forum -def scrape_board_from_url (url): +def scrape_board_from_url(url): + """Scrapes the board index at url into a Board object.""" print("Scraping board from url: {}".format(url)) board = None - for d in get_paged_document(url): + for doc in get_paged_document(url): if not board: - board = scrape_board_from_document(url, d) + board = scrape_board_from_document(url, doc) else: - board.children = board.children + scrape_board_from_document(url, d).children + board.children = board.children + scrape_board_from_document(url, doc).children print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children))) return board -def scrape_board_from_document (url, d): +def scrape_board_from_document(url, doc): + """Scrapes the given document into a Board object.""" urlparts = urlparse(url) baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) - board = Board(title=d("h1").text()) - for thread_link in d("a[href^='/topic/']").items(): + board = Board(title=doc("h1").text()) + for thread_link in doc("a[href^='/topic/']").items(): if thread_link.closest(".topic-pager"): continue thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href)) @@ -81,22 +95,24 @@ def scrape_board_from_document (url, d): return board -def scrape_thread_from_url (url): +def scrape_thread_from_url(url): + """Scrapes the given thread url into a Thread object.""" print("Scraping thread from url: {}".format(url)) thread = None - for d in get_paged_document(url): + for doc in get_paged_document(url): if not thread: - thread = scrape_thread_from_document(d) + thread = scrape_thread_from_document(doc) else: - thread.children = thread.children + scrape_thread_from_document(d).children + thread.children = thread.children + scrape_thread_from_document(doc).children print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children))) return thread -def scrape_thread_from_document (d): - thread = Thread(title=d("h2").eq(0).text()) - for post_entry in d("article.post-entry").items(): +def scrape_thread_from_document(doc): + """Scrapes the given document into a Thread object.""" + thread = Thread(title=doc("h2").eq(0).text()) + for post_entry in doc("article.post-entry").items(): # 26 November 2016: Yuku's broken HTML is breaking this parsing logic #
's aren't being closed correctly so each selector actually # returns the rest of the thread's contents instead of just that post. @@ -116,7 +132,7 @@ def scrape_thread_from_document (d): if date_element.find("time"): timestamp = dateutil.parser.parse(date_element.text()).timestamp() else: - timestamp = mktime(strptime(date_element.text(), time_format)) + timestamp = mktime(strptime(date_element.text(), TIME_FORMAT)) thread.children.append(Post( author=User( diff --git a/tge/util.py b/tge/util.py index 2c28e5b..8e2f8f0 100644 --- a/tge/util.py +++ b/tge/util.py @@ -1,6 +1,9 @@ -characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""'] +"""Utility functions.""" -def sanitize_title (title): - for character in characters_to_replace: +CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""'] + +def sanitize_title(title): + """Sanitizes the given title by removing certain characters.""" + for character in CHARACTERS_TO_REPLACE: title = title.replace(character, "-") return title