109 lines
4.1 KiB
Python
Raw Normal View History

from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
time_format = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True when *url* looks like a Yuku forum URL (.fr.yuku.com)."""
    return url.find(".fr.yuku.com") != -1
def scrape(url):
    """Dispatch *url* to the matching scraper based on its path.

    /topic/...  -> scrape_thread
    /forums/... -> scrape_board
    "" or "/"   -> scrape_index
    Anything else returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread(url)
    if path.startswith("/forums/"):
        return scrape_board(url)
    if path in ("", "/"):
        return scrape_index(url)
    return None
def scrape_index(url):
    """Scrape a Yuku forum index page into a Forum tree.

    Builds a Forum whose categories each hold the boards linked from that
    category's section; every linked board is scraped in full through
    scrape_board().
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    doc = pq(url=url)
    forum = Forum(title=doc("title").text())
    for section in doc("div.span9 > div.row-fluid").items():
        category = Category(title=section.find("h3").text())
        forum.categories.append(category)
        for link in section.find("a[href^='/forums/']").items():
            # Board links are site-relative; rebuild an absolute URL.
            board = scrape_board("{}://{}{}".format(parts.scheme, parts.netloc, link.attr.href))
            board.description = link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board(url):
    """Scrape one board page (and any following pages) into a Board.

    Follows the "next" pager link (accesskey=n) recursively and folds the
    threads from later pages into this Board's children.
    """
    print("Scraping board from url: {}".format(url))
    parts = urlparse(url)
    origin = "{}://{}".format(parts.scheme, parts.netloc)
    doc = pq(url=url)
    board = Board(title=doc("h1").eq(0).text())
    for link in doc("a[href^='/topic/']").items():
        # The pager widget also contains /topic/ links; skip those.
        if link.closest(".topic-pager"):
            continue
        board.children.append(scrape_thread("{}{}".format(origin, link.attr.href)))
    nextlink = doc("a[accesskey=n]")
    if nextlink:
        board.children = board.children + scrape_board("{}{}".format(origin, nextlink.attr.href)).children
    # Only the first page (no page= query) reports completion, so the
    # message appears once with the full thread count.
    if not parts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
def scrape_thread (url):
    """Scrape a thread page (and its continuation pages) into a Thread.

    Each post becomes a Post carrying its author (User), a Unix timestamp
    and the post body HTML; the "next" pager link (accesskey=n) is
    followed recursively and later pages' posts are merged in.
    """
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    d = pq(url=url)
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        # Grab the signature, then strip it out of the post body so the body
        # HTML below no longer contains it; it is passed to User() instead.
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
        # Two date markups appear in the wild: a <time> element (handed to
        # dateutil's flexible parser) or, presumably on older pages, a bare
        # string matching time_format — TODO confirm both still occur.
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), time_format))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    # Recurse into the next page, if any, and merge its posts.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
    # Only the first page (no page= query) prints the summary, so it shows
    # the total post count after all recursion has finished.
    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread