From 05c766011fec33c9ac7ded2aa105fe2dbf8e36ac Mon Sep 17 00:00:00 2001
From: Adrian Malacoda <adrian.malacoda@monarch-pass.net>
Date: Fri, 16 Dec 2016 00:29:59 -0600
Subject: [PATCH] style and convention fixes to make pylint happy

---
 tge/__init__.py            | 33 +++++++++++++----
 tge/model.py               | 39 +++++++++++++-------
 tge/outputters/__init__.py |  9 +++--
 tge/outputters/json.py     | 26 +++++++++-----
 tge/outputters/pickle.py   |  6 +++-
 tge/scrapers/__init__.py   | 15 +++++---
 tge/scrapers/pickle.py     |  5 ++-
 tge/scrapers/yuku.py       | 74 +++++++++++++++++++++++---------------
 tge/util.py                |  9 +++--
 9 files changed, 148 insertions(+), 68 deletions(-)

diff --git a/tge/__init__.py b/tge/__init__.py
index 93bf780..3549e35 100644
--- a/tge/__init__.py
+++ b/tge/__init__.py
@@ -1,15 +1,36 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
+
 import argparse
 from urllib.parse import urlparse
 
 from . import scrapers, outputters
 from .util import sanitize_title
 
-def main ():
+def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )
     args = parser.parse_args()
 
     source = vars(args)['in']
@@ -22,7 +43,7 @@ def main ():
         print("Guessed scraper: {}".format(scraper.__name__))
 
     scraped = scraper.scrape(source)
-
+    print(scraped.title)
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
     print("Outputting to: {}, using {} outputter".format(out, outformat))
diff --git a/tge/model.py b/tge/model.py
index ea76ed8..bf28a26 100644
--- a/tge/model.py
+++ b/tge/model.py
@@ -1,37 +1,52 @@
-class Forum (object):
-    def __init__ (self, title=None):
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+
+# pylint: disable=too-few-public-methods, too-many-arguments
+
+class Forum(object):
+    """Forum represents an entire web forum."""
+    def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []
 
-class Post (object):
-    def __init__ (self, title=None, body=None, author=None, timestamp=None):
+class Post(object):
+    """Post represents a single post in a thread."""
+    def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
         self.author = author
         self.timestamp = timestamp
 
-class Thread (object):
-    def __init__ (self, title=None):
+class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
+    def __init__(self, title=None):
         self.title = title
         self.children = []
 
-class User (object):
-    def __init__ (self, name=None, signature=None, avatar=None, title=None, subtitle=None):
+class User(object):
+    """User represents an individual user of a forum."""
+    def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
         self.title = title
         self.subtitle = subtitle
         self.avatar = avatar
 
-class Category (object):
-    def __init__ (self, title=None, description=None):
+class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []
 
-class Board (object):
-    def __init__ (self, title=None, description=None):
+class Board(object):
+    """Board represents a board which contains threads."""
+    def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []
diff --git a/tge/outputters/__init__.py b/tge/outputters/__init__.py
index 8e26d94..b01d3d6 100644
--- a/tge/outputters/__init__.py
+++ b/tge/outputters/__init__.py
@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
+
 from . import json, pickle
 
-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]
 
-def get_outputter (name):
-    for outputter in outputters:
+def get_outputter(name):
+    """Get the outputter with the specified name."""
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
 
diff --git a/tge/outputters/json.py b/tge/outputters/json.py
index 697cc6b..d15f5d2 100644
--- a/tge/outputters/json.py
+++ b/tge/outputters/json.py
@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""
 
 import json
 import os
 
-def output (data, destination):
+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
+def output(data, destination):
+    """Output the given object to the specified folder (or file, for a Thread)."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):
@@ -12,18 +15,24 @@ def output (data, destination):
     elif isinstance(data, Thread):
         output_thread(data, destination)
 
-def output_forum (data, destination):
+def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)
 
     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))
 
     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )
 
-def output_board (data, destination):
+def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:
@@ -35,6 +44,7 @@ def output_board (data, destination):
     for thread in data.children:
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))
 
-def output_thread (data, destination):
+def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
diff --git a/tge/outputters/pickle.py b/tge/outputters/pickle.py
index 92e37f4..d2d56bb 100644
--- a/tge/outputters/pickle.py
+++ b/tge/outputters/pickle.py
@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+The output of this outputter can be read with the pickle scraper."""
+
 import pickle
 
-def output (data, destination):
+def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
diff --git a/tge/scrapers/__init__.py b/tge/scrapers/__init__.py
index c23ad63..02b01cb 100644
--- a/tge/scrapers/__init__.py
+++ b/tge/scrapers/__init__.py
@@ -1,16 +1,21 @@
+"""Scrapers accept an input located somewhere (at a URL or local file)
+and scrape it into objects, which can be dumped by an outputter."""
+
 from . import yuku, pickle
 
-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]
 
-def get_scraper (name):
-    for scraper in scrapers:
+def get_scraper(name):
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper
 
     raise Exception("Unknown scraper: {}".format(name))
 
-def guess_scraper (url):
-    for scraper in scrapers:
+def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
 
diff --git a/tge/scrapers/pickle.py b/tge/scrapers/pickle.py
index 93c7471..26110be 100644
--- a/tge/scrapers/pickle.py
+++ b/tge/scrapers/pickle.py
@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
+
 import pickle
 
-def scrape (source):
+def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py
index ed63cfc..beec209 100644
--- a/tge/scrapers/yuku.py
+++ b/tge/scrapers/yuku.py
@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+
+# pylint: disable=no-member
+
 from urllib.parse import urlparse
 from time import strptime, mktime
+
 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry
 
-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread
 
-def can_scrape_url (url):
+TIME_FORMAT = "%b %d %y %I:%M %p"
+
+def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url
 
-def scrape (url):
+def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
@@ -20,60 +28,66 @@ def scrape (url):
         return scrape_index(url)
 
 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def get_document (url):
+def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)
 
-def get_paged_document (url):
+def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc
 
-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break
 
         url = "{}{}".format(baseurl, nextlink.attr.href)
         print(" --> Following next page link to: {}".format(url))
 
-def scrape_index (url):
+def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
 
-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
 
     return forum
 
-def scrape_board_from_url (url):
+def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))
 
     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children
 
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board
 
-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
@@ -81,22 +95,24 @@ def scrape_board_from_document (url, d):
 
     return board
 
-def scrape_thread_from_url (url):
+def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))
 
     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children
 
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread
 
-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         #     <article>'s aren't being closed correctly so each selector actually
         #     returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
 
         thread.children.append(Post(
             author=User(
diff --git a/tge/util.py b/tge/util.py
index 2c28e5b..8e2f8f0 100644
--- a/tge/util.py
+++ b/tge/util.py
@@ -1,6 +1,9 @@
-characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+"""Utility functions."""
 
-def sanitize_title (title):
-    for character in characters_to_replace:
+CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+
+def sanitize_title(title):
+    """Sanitizes the given title by replacing certain characters with hyphens."""
+    for character in CHARACTERS_TO_REPLACE:
         title = title.replace(character, "-")
     return title