style and convention fixes to make pylint happy
parent 1db0d315b8
commit 05c766011f
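The fixes below can be reproduced by running pylint over the package; a minimal sketch, assuming the package directory is tge (as the file paths in this diff suggest):

    pip install pylint
    pylint tge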
@@ -1,15 +1,36 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
+
 import argparse
 from urllib.parse import urlparse
 
 from . import scrapers, outputters
 from .util import sanitize_title
 
-def main ():
+def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )
     args = parser.parse_args()
 
     source = vars(args)['in']
@@ -22,7 +43,7 @@ def main ():
         print("Guessed scraper: {}".format(scraper.__name__))
 
     scraped = scraper.scrape(source)
 
-    print(scraped.title)
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
+    print("Outputting to: {}, using {} outputter".format(out, outformat))
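For reference, a hypothetical invocation of the CLI defined above (the entry-point module name and forum URL are illustrative assumptions, not taken from this commit):

    python -m tge --in http://myforum.fr.yuku.com --outformat json

Per the argparse definitions, --in is required; --out defaults to the sanitized forum title and --outformat defaults to the JSON-based format.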
tge/model.py
@@ -1,37 +1,52 @@
-class Forum (object):
-    def __init__ (self, title=None):
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+
+# pylint: disable=too-few-public-methods, too-many-arguments
+
+class Forum(object):
+    """Forum represents an entire web forum."""
+    def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []
 
-class Post (object):
-    def __init__ (self, title=None, body=None, author=None, timestamp=None):
+class Post(object):
+    """Post represents a singular post in a thread."""
+    def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
         self.author = author
         self.timestamp = timestamp
 
-class Thread (object):
-    def __init__ (self, title=None):
+class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
+    def __init__(self, title=None):
         self.title = title
         self.children = []
 
-class User (object):
-    def __init__ (self, name=None, signature=None, avatar=None, title=None, subtitle=None):
+class User(object):
+    """User represents an individual user of a forum."""
+    def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
         self.title = title
         self.subtitle = subtitle
         self.avatar = avatar
 
-class Category (object):
-    def __init__ (self, title=None, description=None):
+class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []
 
-class Board (object):
-    def __init__ (self, title=None, description=None):
+class Board(object):
+    """Board represents a board which contains threads."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []
tge/outputters/__init__.py
@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
+
 from . import json, pickle
 
-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]
 
-def get_outputter (name):
-    for outputter in outputters:
+def get_outputter(name):
+    """Get the outputter with the specified name."""
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
tge/outputters/json.py
@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""
 
 import json
 import os
 
-def output (data, destination):
+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
+def output(data, destination):
+    """Output the given object to the specified folder."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):
@@ -12,18 +15,24 @@ def output (data, destination):
     elif isinstance(data, Thread):
         output_thread(data, destination)
 
-def output_forum (data, destination):
+def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)
 
     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))
 
     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )
 
-def output_board (data, destination):
+def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:
@@ -35,6 +44,7 @@ def output_board (data, destination):
     for thread in data.children:
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))
 
-def output_thread (data, destination):
+def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
tge/outputters/pickle.py
@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+The output of this outputter can be read with the pickle scraper."""
+
 import pickle
 
-def output (data, destination):
+def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
tge/scrapers/__init__.py
@@ -1,16 +1,21 @@
+"""Scrapers accept an input located somewhere (at a URL or local file)
+and scrape them into objects, which can be dumped by an outputter."""
+
 from . import yuku, pickle
 
-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]
 
-def get_scraper (name):
-    for scraper in scrapers:
+def get_scraper(name):
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper
 
     raise Exception("Unknown scraper: {}".format(name))
 
-def guess_scraper (url):
-    for scraper in scrapers:
+def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
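Note how guess_scraper works: a scraper opts into guessing by defining a module-level can_scrape_url function, as the yuku scraper further below does. A minimal sketch of a new scraper module under that contract (the module docstring, domain check, and body here are invented for illustration; a real scraper would also need to be imported and added to SCRAPERS above):

    """Scraper for a hypothetical example forum."""

    def can_scrape_url(url):
        """Returns true if this url can be scraped by this scraper."""
        return ".forum.example.com" in url

    def scrape(source):
        """Scrape the given URL into a model object."""
        raise NotImplementedError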
tge/scrapers/pickle.py
@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
+
 import pickle
 
-def scrape (source):
+def scrape(source):
+    """Load the given pickle file into an object."""
    with open(source, "rb") as in_file:
        return pickle.load(in_file)
tge/scrapers/yuku.py
@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+
+# pylint: disable=no-member
+
 from urllib.parse import urlparse
 from time import strptime, mktime
 
 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry
 
-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread
 
-def can_scrape_url (url):
+TIME_FORMAT = "%b %d %y %I:%M %p"
+
+def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url
 
-def scrape (url):
+def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
@@ -20,60 +28,66 @@ def scrape (url):
     return scrape_index(url)
 
 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def get_document (url):
+def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)
 
-def get_paged_document (url):
+def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc
 
-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break
 
         url = "{}{}".format(baseurl, nextlink.attr.href)
         print(" --> Following next page link to: {}".format(url))
 
-def scrape_index (url):
+def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
 
-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
 
     return forum
 
-def scrape_board_from_url (url):
+def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))
 
     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children
 
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board
 
-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
@@ -81,22 +95,24 @@ def scrape_board_from_document (url, d):
 
     return board
 
-def scrape_thread_from_url (url):
+def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))
 
     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children
 
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread
 
-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
 
         thread.children.append(Post(
             author=User(
tge/util.py
@@ -1,6 +1,9 @@
-characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+"""Utility functions."""
 
-def sanitize_title (title):
-    for character in characters_to_replace:
+CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+
+def sanitize_title(title):
+    """Sanitizes the given title by removing certain characters."""
+    for character in CHARACTERS_TO_REPLACE:
         title = title.replace(character, "-")
     return title
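As a worked example of the sanitizer above, each listed character is replaced with a dash (an illustrative session; the import path assumes the tge package layout shown in this diff):

    >>> from tge.util import sanitize_title
    >>> sanitize_title("What's new?")
    'What-s-new-'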