from time import strptime, mktime
from urllib.parse import urlparse

import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry

from ..model import User, Category, Forum, Board, Post, Thread

# Fallback date format, used for posts whose date isn't wrapped in a <time> element.
time_format = "%b %d %y %I:%M %p"


def can_scrape_url(url):
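    """Return True if this scraper can handle the given URL (Yuku forums under *.fr.yuku.com)."""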
    return ".fr.yuku.com" in url


def scrape(url):
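    """Dispatch to the index, board, or thread scraper based on the URL's path."""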
    path = urlparse(url).path

    if path.startswith("/topic/"):
        return scrape_thread(url)
    elif path.startswith("/forums/"):
        return scrape_board(url)
    elif (not path) or path == "/":
        return scrape_index(url)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_index(url):
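    """Scrape the forum index at `url` into a Forum containing its categories and boards."""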
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)

    d = pq(url=url)
    forum = Forum(title=d("title").text())

    for category_element in d("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)

        for board_link in category_element.find("a[href^='/forums/']").items():
            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
            # The board's description is taken from its listing on the index page.
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)

        print("Finished scraping all boards in category: {}".format(category.title))

    return forum


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_board(url):
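    """Scrape the board at `url` (following pagination) into a Board containing its threads."""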
    print("Scraping board from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    board = Board(title=d("h1").text())

    for thread_link in d("a[href^='/topic/']").items():
        # Skip links that belong to a thread's pager widget rather than the thread list.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)

    # Recurse into the next page of the board, if there is one.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children

    # Only report completion from the first page, not from the recursive per-page calls.
    if not urlparts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))

    return board


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_thread(url):
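    """Scrape the thread at `url` (following pagination) into a Thread containing its posts."""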
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    thread = Thread(title=d("h2").eq(0).text())

    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic.
        # <article> elements aren't being closed correctly, so each selector
        # actually returns the rest of the thread's contents instead of just
        # that post. To get around this, pick out only the first match
        # (username/signature/post body) for each selector.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")

        if signature:
            signature = signature.html().strip()
        else:
            signature = None

        # Some posts wrap the date in a <time> element; others only have a
        # formatted date string, parsed with the fallback time_format.
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), time_format))

        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))

    # Recurse into the next page of the thread, if there is one.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children

    # Only report completion from the first page, not from the recursive per-page calls.
    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))

    return thread
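

# Example usage (a sketch, not part of the scraper itself). This module relies on
# the relative import of ..model, so it is meant to be imported from its package;
# the module name and URL below are hypothetical.
#
#     from . import yuku_scraper
#
#     url = "http://example.fr.yuku.com/"
#     if yuku_scraper.can_scrape_url(url):
#         forum = yuku_scraper.scrape(url)  # Forum -> categories -> boards -> threads -> posts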