2016-12-16 00:29:59 -06:00

150 lines
5.4 KiB
Python

"""Scraper for Yuku Forumer forums."""
# pylint: disable=no-member
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry
from ..model import User, Category, Forum, Board, Post, Thread
TIME_FORMAT = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True if *url* points at a Yuku Forumer forum this scraper handles.

    The test is anchored to the URL's hostname rather than a plain substring
    search, so a URL that merely mentions ".fr.yuku.com" in its path or query
    string (e.g. "http://example.com/?ref=.fr.yuku.com") is not mistaken for
    a scrapable forum. ``urlparse().hostname`` is lowercased and has any port
    stripped, so "BOARDS.fr.yuku.com:80" still matches.
    """
    hostname = urlparse(url).hostname or ""
    return hostname.endswith(".fr.yuku.com")
def scrape(url):
    """Scrape *url* into a Thread, Board, or Forum object.

    Dispatches on the URL path: "/topic/..." yields a thread,
    "/forums/..." yields a board, and the site root yields the whole
    forum index. Any other path returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    if path.startswith("/forums/"):
        return scrape_board_from_url(url)
    if not path or path == "/":
        return scrape_index(url)
    return None
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Fetch *url* and return it as a pyquery document.

    Transient fetch failures are retried by the @retry decorator with
    exponential backoff (starting at 1s, capped at 10s).
    """
    document = pq(url=url)
    return document
def get_paged_document(url):
    """Yield pyquery documents for *url* and every following "next" page.

    Pagination is followed through the ``<a accesskey="n">`` link Yuku
    renders on each page; the generator stops when a page has no such link.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)
    page_url = url
    while True:
        document = get_document(url=page_url)
        yield document
        next_link = document("a[accesskey=n]")
        if not next_link:
            return
        # The link's href is site-relative, so rebuild an absolute URL.
        page_url = "{}{}".format(base, next_link.attr.href)
        print(" --> Following next page link to: {}".format(page_url))
def scrape_index(url):
    """Scrape the forum index at *url* into a Forum object.

    Walks each category section on the index page and recursively scrapes
    every board linked from it, attaching the boards to their categories.
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    document = get_document(url=url)
    forum = Forum(title=document("title").text())
    for category_element in document("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        for board_link in category_element.find("a[href^='/forums/']").items():
            # Board links are site-relative; rebuild an absolute URL.
            board_url = "{}://{}{}".format(parts.scheme, parts.netloc,
                                           board_link.attr.href)
            board = scrape_board_from_url(board_url)
            # The board description lives next to the link, not on the
            # board page itself, so fill it in from the index markup.
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board_from_url(url):
    """Scrape the (possibly paginated) board at *url* into a Board object.

    The first page provides the Board itself; threads from any subsequent
    pages are appended onto that Board's children.
    """
    print("Scraping board from url: {}".format(url))
    board = None
    for document in get_paged_document(url):
        page_board = scrape_board_from_document(url, document)
        if board is None:
            board = page_board
        else:
            board.children = board.children + page_board.children
    print("Finished scraping board: {} ({} threads)".format(
        board.title, len(board.children)))
    return board
def scrape_board_from_document(url, doc):
    """Scrape one already-fetched board page (*doc*) into a Board object.

    *url* is only used to reconstruct absolute thread URLs from the page's
    site-relative links.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)
    board = Board(title=doc("h1").text())
    for thread_link in doc("a[href^='/topic/']").items():
        # Pager widgets repeat the thread link for each page; skip those
        # so each thread is scraped exactly once.
        if thread_link.closest(".topic-pager"):
            continue
        thread_url = "{}{}".format(base, thread_link.attr.href)
        board.children.append(scrape_thread_from_url(thread_url))
    return board
def scrape_thread_from_url(url):
    """Scrape the (possibly paginated) thread at *url* into a Thread object.

    The first page provides the Thread itself; posts from any subsequent
    pages are appended onto that Thread's children.
    """
    print("Scraping thread from url: {}".format(url))
    thread = None
    for document in get_paged_document(url):
        page_thread = scrape_thread_from_document(document)
        if thread is None:
            thread = page_thread
        else:
            thread.children = thread.children + page_thread.children
    print("Finished scraping thread: {} ({} posts)".format(
        thread.title, len(thread.children)))
    return thread
def scrape_thread_from_document(doc):
    """Scrapes the given document into a Thread object.

    Builds one Post per <article class="post-entry"> element, populating
    the author's User record, an epoch timestamp, and the post body HTML.
    """
    thread = Thread(title=doc("h2").eq(0).text())
    for post_entry in doc("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        # Strip the signature out of the body so it isn't duplicated there.
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
        if date_element.find("time"):
            # Date wrapped in a <time> element: free-form, parsed by dateutil.
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            # Plain-text date in Yuku's fixed format, e.g. "Nov 26 16 10:05 PM".
            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    return thread