style and convention fixes to make pylint happy
@@ -1,16 +1,21 @@
 """Scrapers accept an input located somewhere (at a URL or local file)
 and scrape them into objects, which can be dumped by an outputter."""

 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]

-def get_scraper (name):
-    for scraper in scrapers:
+def get_scraper(name):
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper

     raise Exception("Unknown scraper: {}".format(name))

-def guess_scraper (url):
-    for scraper in scrapers:
+def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
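
Aside from the rename, the lookup logic is unchanged. As a rough, self-contained sketch (not part of this commit, using stand-in modules instead of the real yuku/pickle scrapers), guess_scraper only consults modules that expose a can_scrape_url hook:

    import types

    # Stand-ins for the real scraper modules; only the yuku-like one defines the hook.
    yuku_like = types.ModuleType("scrapers.yuku")
    yuku_like.can_scrape_url = lambda url: ".fr.yuku.com" in url
    pickle_like = types.ModuleType("scrapers.pickle")

    SCRAPERS = [yuku_like, pickle_like]

    def guess_scraper(url):
        for scraper in SCRAPERS:
            if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
                return scraper

    print(guess_scraper("http://example.fr.yuku.com/"))  # -> the yuku-like module
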
@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""

 import pickle

-def scrape (source):
+def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
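
For context, a minimal round trip, assuming the pickle outputter simply writes with pickle.dump (illustrative only, not part of the commit):

    import pickle

    # What the pickle outputter is assumed to do: dump the scraped object to disk.
    with open("forum.pickle", "wb") as out_file:
        pickle.dump({"title": "example forum"}, out_file)

    # scrape("forum.pickle") then just reverses it with pickle.load.
    with open("forum.pickle", "rb") as in_file:
        print(pickle.load(in_file))
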
@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""

+# pylint: disable=no-member

 from urllib.parse import urlparse
 from time import strptime, mktime

 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread

-def can_scrape_url (url):
+TIME_FORMAT = "%b %d %y %I:%M %p"

+def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

-def scrape (url):
+def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
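
The dispatch in scrape() is driven purely by the URL path; a self-contained sketch of that check (the example URL is made up):

    from urllib.parse import urlparse

    url = "http://example.fr.yuku.com/topic/123/some-thread"
    path = urlparse(url).path
    print(path.startswith("/topic/"))  # True -> handled by scrape_thread_from_url
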
@@ -20,60 +28,66 @@ def scrape (url):
         return scrape_index(url)

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def get_document (url):
+def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

-def get_paged_document (url):
+def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc

-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break

         url = "{}{}".format(baseurl, nextlink.attr.href)
         print(" --> Following next page link to: {}".format(url))

-def scrape_index (url):
+def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)

-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))

     return forum

-def scrape_board_from_url (url):
+def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))

     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children

     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
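
scrape_board_from_url and scrape_thread_from_url share the same pagination pattern: the first page builds the object, later pages only contribute their children. A self-contained sketch with dummy data (not part of the commit):

    def pages():
        # Stand-in for get_paged_document(): yields the children found on each page.
        yield ["thread 1", "thread 2"]
        yield ["thread 3"]

    board = None
    for children in pages():
        if not board:
            board = {"title": "example board", "children": list(children)}
        else:
            board["children"] = board["children"] + children

    print(board)  # children from all pages merged into one object
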
@@ -81,22 +95,24 @@ def scrape_board_from_document (url, d):
     return board

-def scrape_thread_from_url (url):
+def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))

     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children

     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))

         thread.children.append(Post(
             author=User(
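
The two date paths can be checked in isolation; the sample strings below are assumptions about what Yuku emits, not taken from the commit:

    from time import strptime, mktime
    import dateutil.parser

    TIME_FORMAT = "%b %d %y %I:%M %p"

    # <time> elements carry a machine-readable date and go through dateutil...
    print(dateutil.parser.parse("2016-11-26T21:15:00").timestamp())
    # ...while plain-text dates fall back to the renamed TIME_FORMAT constant.
    print(mktime(strptime("Nov 26 16 9:15 PM", TIME_FORMAT)))
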