style and convention fixes to make pylint happy
@@ -1,16 +1,21 @@
 """Scrapers accept an input located somewhere (at a URL or local file)
 and scrape them into objects, which can be dumped by an outputter."""

 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]

-def get_scraper (name):
-    for scraper in scrapers:
+def get_scraper(name):
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper

     raise Exception("Unknown scraper: {}".format(name))

-def guess_scraper (url):
-    for scraper in scrapers:
+def guess_scraper(url):
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
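
Aside from the rename, the lookup logic is unchanged. As a rough, self-contained sketch (not part of this commit, using stand-in modules instead of the real yuku/pickle scrapers), guess_scraper only consults modules that expose a can_scrape_url hook:

    import types

    # Stand-ins for the real scraper modules; only the yuku-like one defines the hook.
    yuku_like = types.ModuleType("scrapers.yuku")
    yuku_like.can_scrape_url = lambda url: ".fr.yuku.com" in url
    pickle_like = types.ModuleType("scrapers.pickle")

    SCRAPERS = [yuku_like, pickle_like]

    def guess_scraper(url):
        for scraper in SCRAPERS:
            if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
                return scraper

    print(guess_scraper("http://example.fr.yuku.com/"))  # -> the yuku-like module
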
@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""

 import pickle

-def scrape (source):
+def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
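
For context, a minimal round trip, assuming the pickle outputter simply writes with pickle.dump (illustrative only, not part of the commit):

    import pickle

    # What the pickle outputter is assumed to do: dump the scraped object to disk.
    with open("forum.pickle", "wb") as out_file:
        pickle.dump({"title": "example forum"}, out_file)

    # scrape("forum.pickle") then just reverses it with pickle.load.
    with open("forum.pickle", "rb") as in_file:
        print(pickle.load(in_file))
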
@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""

+# pylint: disable=no-member

 from urllib.parse import urlparse
 from time import strptime, mktime

 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread

-def can_scrape_url (url):
+TIME_FORMAT = "%b %d %y %I:%M %p"

+def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

-def scrape (url):
+def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
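
The dispatch in scrape() is driven purely by the URL path; a self-contained sketch of that check (the example URL is made up):

    from urllib.parse import urlparse

    url = "http://example.fr.yuku.com/topic/123/some-thread"
    path = urlparse(url).path
    print(path.startswith("/topic/"))  # True -> handled by scrape_thread_from_url
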
@@ -20,60 +28,66 @@ def scrape (url):
         return scrape_index(url)

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def get_document (url):
+def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

-def get_paged_document (url):
+def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc

-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break

         url = "{}{}".format(baseurl, nextlink.attr.href)
         print(" --> Following next page link to: {}".format(url))

-def scrape_index (url):
+def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)

-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))

     return forum

-def scrape_board_from_url (url):
+def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))

     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children

     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
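
scrape_board_from_url and scrape_thread_from_url share the same pagination pattern: the first page builds the object, later pages only contribute their children. A self-contained sketch with dummy data (not part of the commit):

    def pages():
        # Stand-in for get_paged_document(): yields the children found on each page.
        yield ["thread 1", "thread 2"]
        yield ["thread 3"]

    board = None
    for children in pages():
        if not board:
            board = {"title": "example board", "children": list(children)}
        else:
            board["children"] = board["children"] + children

    print(board)  # children from all pages merged into one object
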
@@ -81,22 +95,24 @@ def scrape_board_from_document (url, d):
     return board

-def scrape_thread_from_url (url):
+def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))

     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children

     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))

         thread.children.append(Post(
             author=User(
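
The two date paths can be checked in isolation; the sample strings below are assumptions about what Yuku emits, not taken from the commit:

    from time import strptime, mktime
    import dateutil.parser

    TIME_FORMAT = "%b %d %y %I:%M %p"

    # <time> elements carry a machine-readable date and go through dateutil...
    print(dateutil.parser.parse("2016-11-26T21:15:00").timestamp())
    # ...while plain-text dates fall back to the renamed TIME_FORMAT constant.
    print(mktime(strptime("Nov 26 16 9:15 PM", TIME_FORMAT)))
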