style and convention fixes to make pylint happy
commit 05c766011f
parent 1db0d315b8
@@ -1,3 +1,6 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
+
 import argparse
 from urllib.parse import urlparse

@@ -5,11 +8,29 @@ from . import scrapers, outputters
 from .util import sanitize_title

 def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )
     args = parser.parse_args()

     source = vars(args)['in']

@@ -22,7 +43,7 @@ def main ():
         print("Guessed scraper: {}".format(scraper.__name__))

     scraped = scraper.scrape(source)
+    print(scraped.title)
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
     print("Outputting to: {}, using {} outputter".format(out, outformat))
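Note on the --in flag: "in" is a Python keyword, so the parsed value cannot be read as args.in; that is why main() goes through vars(args)['in']. A minimal standalone sketch (the URL here is made up):

import argparse

parser = argparse.ArgumentParser(description="Forum scraper")
parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
args = parser.parse_args(["--in", "http://example.fr.yuku.com"])

# args.in is a SyntaxError ("in" is a keyword), so read the parsed value
# out of the namespace's dict instead:
source = vars(args)["in"]
print(source)  # http://example.fr.yuku.com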
tge/model.py

@@ -1,10 +1,20 @@
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+
+# pylint: disable=too-few-public-methods, too-many-arguments
+
 class Forum(object):
+    """Forum represents an entire web forum."""
     def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []

 class Post(object):
+    """Post represents a singular post in a thread."""
     def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
@@ -12,11 +22,13 @@ class Post (object):
         self.timestamp = timestamp

 class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
     def __init__(self, title=None):
         self.title = title
         self.children = []

 class User(object):
+    """User represents an individual user of a forum."""
     def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
@@ -25,12 +37,15 @@ class User (object):
         self.avatar = avatar

 class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []

 class Board(object):
+    """Board represents a board which contains threads."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
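Note: the model is a plain containment hierarchy, which is also why the too-few-public-methods pragma is needed. A Forum holds Categories, a Category's children are Boards, a Board's children are Threads, and a Thread's children are Posts authored by Users. A hand-built sketch with made-up data, assuming the package is importable as tge:

from tge.model import Forum, Category, Board, Thread, Post, User

forum = Forum(title="Example forum")
category = Category(title="General")
forum.categories.append(category)

board = Board(title="Chat", description="Anything goes")
category.children.append(board)

thread = Thread(title="Hello world")
board.children.append(thread)
thread.children.append(Post(body="First post!", author=User(name="alice")))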
tge/outputters/__init__.py

@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
+
 from . import json, pickle

-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]


 def get_outputter(name):
-    for outputter in outputters:
+    """Get the outputter with the specified name."""
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
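Note: get_outputter matches the requested name against the tail of each module's dotted __name__, so --outformat json selects the module whose name ends in ".json". A quick illustration (module paths assume the tge.outputters package):

name = "json"
print("tge.outputters.json".endswith(".{}".format(name)))    # True
print("tge.outputters.pickle".endswith(".{}".format(name)))  # False

Unlike get_scraper below, an unknown name simply falls off the end of the loop and returns None rather than raising.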
tge/outputters/json.py

@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""

 import json
 import os

+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
 def output(data, destination):
+    """Output the given object to the specified folder."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):
@@ -13,17 +16,23 @@ def output (data, destination):
         output_thread(data, destination)

 def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)

     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))

     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )

 def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:
@@ -36,5 +45,6 @@ def output_board (data, destination):
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))

 def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
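Note: output_thread leans on json.dumps(data, default=vars). The default hook is called for any object the encoder cannot serialize natively; vars(obj) returns obj.__dict__, so the whole Thread/Post/User graph flattens to nested dicts without a custom encoder. A self-contained sketch with a stand-in class:

import json

class Post:  # stand-in for tge.model.Post
    def __init__(self, body, author):
        self.body = body
        self.author = author

# default=vars runs for each non-JSON-native object the encoder meets:
print(json.dumps(Post("hi", None), default=vars, indent=4))
# {
#     "body": "hi",
#     "author": null
# }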
tge/outputters/pickle.py

@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+The output of this outputter can be read with the pickle scraper."""
+
 import pickle

 def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
tge/scrapers/__init__.py

@@ -1,16 +1,21 @@
+"""Scrapers accept an input located somewhere (at a URL or local file)
+and scrape them into objects, which can be dumped by an outputter."""
+
 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]


 def get_scraper(name):
-    for scraper in scrapers:
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper

     raise Exception("Unknown scraper: {}".format(name))

 def guess_scraper(url):
-    for scraper in scrapers:
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
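Note: guess_scraper probes each module with "can_scrape_url" in vars(scraper); vars() on a module returns its namespace dict, so only scrapers that define the hook (here, yuku) are consulted, and the pickle scraper is skipped. A roughly equivalent spelling, as a sketch with a hypothetical guess() helper:

def guess(url, modules):
    # getattr-based version of the same membership test
    for module in modules:
        can_scrape = getattr(module, "can_scrape_url", None)
        if can_scrape is not None and can_scrape(url):
            return module
    return None  # like guess_scraper, no match yields None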
tge/scrapers/pickle.py

@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
+
 import pickle

 def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
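Note: paired with the pickle outputter above, this gives a lossless round-trip for scraped object graphs. A sketch with made-up data, assuming the modules are importable under the tge package:

from tge.model import Forum
from tge.outputters import pickle as pickle_outputter
from tge.scrapers import pickle as pickle_scraper

forum = Forum(title="Example forum")
pickle_outputter.output(forum, "forum.pickle")       # write the object graph
forum_again = pickle_scraper.scrape("forum.pickle")  # read it back
print(forum_again.title)                             # Example forum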
tge/scrapers/yuku.py

@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+
+# pylint: disable=no-member
+
 from urllib.parse import urlparse
 from time import strptime, mktime

 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread
+
+TIME_FORMAT = "%b %d %y %I:%M %p"

 def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

 def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
@@ -21,17 +29,19 @@ def scrape (url):

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
 def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

 def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc

-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break

@@ -39,16 +49,18 @@ def get_paged_document (url):
         print(" --> Following next page link to: {}".format(url))

 def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)

-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
@@ -56,24 +68,26 @@ def scrape_index (url):
     return forum

 def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))

     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children

     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
@@ -82,21 +96,23 @@ def scrape_board_from_document (url, d):
     return board

 def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))

     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children

     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))

         thread.children.append(Post(
             author=User(
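Note: timestamps take two paths; posts with a <time> element go through dateutil.parser.parse, everything else through strptime with TIME_FORMAT. The format string matches strings like the made-up example here:

from time import strptime, mktime

TIME_FORMAT = "%b %d %y %I:%M %p"  # abbreviated month, day, 2-digit year, 12-hour clock
timestamp = mktime(strptime("Nov 26 16 09:41 PM", TIME_FORMAT))
print(timestamp)  # seconds since the epoch, interpreted as local time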
tge/util.py

@@ -1,6 +1,9 @@
-characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+"""Utility functions."""
+
+CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']

 def sanitize_title(title):
-    for character in characters_to_replace:
+    """Sanitizes the given title by removing certain characters."""
+    for character in CHARACTERS_TO_REPLACE:
         title = title.replace(character, "-")
     return title
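Note: despite the docstring's "removing", sanitize_title substitutes "-" for each listed character, which is what keeps titles usable as file and directory names in the JSON outputter. For example, assuming the package imports as tge:

from tge.util import sanitize_title

print(sanitize_title("Hello, world!"))  # Hello--world-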