"""Scraper for Yuku Forumer forums."""
# pylint: disable=no-member
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry
from ..model import User, Category, Forum, Board, Post, Thread
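
# Yuku's legacy post dates look like "Nov 27 16 01:03 PM" (illustrative
# example): abbreviated month, day, two-digit year, 12-hour time.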
TIME_FORMAT = "%b %d %y %I:%M %p"


def can_scrape_url(url):
    """Returns true if this url can be scraped by this scraper."""
    return ".fr.yuku.com" in url


def scrape(url):
    """Scrapes the URL into an object."""
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    elif path.startswith("/forums/"):
        return scrape_board_from_url(url)
    elif (not path) or path == "/":
        return scrape_index(url)
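
# Dispatch examples (hypothetical URLs):
#   scrape("http://example.fr.yuku.com/")              -> Forum (whole index)
#   scrape("http://example.fr.yuku.com/forums/2-misc") -> Board
#   scrape("http://example.fr.yuku.com/topic/1-hello") -> Thread
# Any other path falls through and returns None.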
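

# retrying backs off exponentially here (2^n * 1 s between attempts, capped
# at 10 s) and retries indefinitely, since no stop condition is given.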
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Returns a pyquery document for the specified url, retrying if necessary."""
    return pq(url=url)


def get_paged_document(url):
    """Returns a generator that yields all pages of the specified url."""
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    while True:
        doc = get_document(url=url)
        yield doc
        nextlink = doc("a[accesskey=n]")
        if not nextlink:
            break
        url = "{}{}".format(baseurl, nextlink.attr.href)
        print(" --> Following next page link to: {}".format(url))


def scrape_index(url):
    """Scrapes the forum index at url into a Forum object."""
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)
    doc = get_document(url=url)
    forum = Forum(title=doc("title").text())
    for category_element in doc("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        for board_link in category_element.find("a[href^='/forums/']").items():
            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
            board = scrape_board_from_url(full_url)
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
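
# Board descriptions only appear on the index page (in a sibling <p> of each
# board link), which is why they're attached here rather than when the board
# page itself is scraped.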


def scrape_board_from_url(url):
    """Scrapes the board index at url into a Board object."""
    print("Scraping board from url: {}".format(url))
    board = None
    for doc in get_paged_document(url):
        if not board:
            board = scrape_board_from_document(url, doc)
        else:
            board.children = board.children + scrape_board_from_document(url, doc).children
    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
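
# Later pages only contribute extra threads, so only children are merged;
# the board title is taken from the first page.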


def scrape_board_from_document(url, doc):
    """Scrapes the given document into a Board object."""
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    board = Board(title=doc("h1").text())
    for thread_link in doc("a[href^='/topic/']").items():
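        # Pager links ("1 2 3 ...") point back into the same topic; skip
        # them so each thread is only scraped once.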
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    return board


def scrape_thread_from_url(url):
    """Scrapes the given thread url into a Thread object."""
    print("Scraping thread from url: {}".format(url))
    thread = None
    for doc in get_paged_document(url):
        if not thread:
            thread = scrape_thread_from_document(doc)
        else:
            thread.children = thread.children + scrape_thread_from_document(doc).children
    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread


def scrape_thread_from_document(doc):
    """Scrapes the given document into a Thread object."""
    thread = Thread(title=doc("h2").eq(0).text())
    for post_entry in doc("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic.
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
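        # Newer markup carries a <time> element whose text dateutil can
        # parse; older posts only have a bare date string in TIME_FORMAT.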
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    return thread
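

# Minimal usage sketch (hypothetical URL; the module and package names below
# are assumptions, not part of this file):
#
#     from scrapers import yuku_forumer
#
#     if yuku_forumer.can_scrape_url("http://example.fr.yuku.com/"):
#         forum = yuku_forumer.scrape("http://example.fr.yuku.com/")
#         for category in forum.categories:
#             print(category.title, len(category.children))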