131 lines
4.6 KiB
Python
Raw Normal View History

from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
2016-11-27 13:03:13 -06:00
from retrying import retry
time_format = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True when *url* belongs to a yuku.com forum this scraper handles."""
    return url.find(".fr.yuku.com") != -1
def scrape(url):
    """Dispatch *url* to the appropriate scraper based on its path.

    /topic/...  -> a single thread
    /forums/... -> a single board
    "" or "/"   -> the whole forum index
    Any other path falls through and implicitly returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    if path.startswith("/forums/"):
        return scrape_board_from_url(url)
    if (not path) or path == "/":
        return scrape_index(url)
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Fetch *url* and return it parsed as a PyQuery document.

    Retries on any exception with exponential backoff starting at 1s and
    capped at 10s per wait.
    """
    document = pq(url=url)
    return document
def get_paged_document(url):
    """Yield the PyQuery document at *url*, then every "next" page after it.

    Pagination is followed through the pager link ``a[accesskey=n]``; the
    generator stops when a page has no such link.
    """
    parts = urlparse(url)
    baseurl = "{}://{}".format(parts.scheme, parts.netloc)
    while True:
        document = get_document(url=url)
        yield document
        pager_link = document("a[accesskey=n]")
        if not pager_link:
            return
        url = "{}{}".format(baseurl, pager_link.attr.href)
        print(" --> Following next page link to: {}".format(url))
def scrape_index(url):
    """Scrape the forum front page at *url* into a Forum tree.

    Each ``div.span9 > div.row-fluid`` row becomes a Category; every
    ``/forums/`` link inside the row is scraped in full as a Board via
    scrape_board_from_url(), with its description text lifted from the
    surrounding markup.
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    document = get_document(url=url)
    forum = Forum(title=document("title").text())
    for row in document("div.span9 > div.row-fluid").items():
        category = Category(title=row.find("h3").text())
        forum.categories.append(category)
        for board_link in row.find("a[href^='/forums/']").items():
            board_url = "{}://{}{}".format(parts.scheme, parts.netloc, board_link.attr.href)
            board = scrape_board_from_url(board_url)
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board_from_url(url):
    """Scrape a board (following pagination) at *url* into one Board.

    The first page produces the Board object; each further page's threads
    are appended onto its ``children``.
    """
    print("Scraping board from url: {}".format(url))
    board = None
    for document in get_paged_document(url):
        page = scrape_board_from_document(document)
        if not board:
            board = page
        else:
            board.children = board.children + page.children
    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
def scrape_board_from_document(d):
    """Build a Board from one already-fetched board page *d*.

    Every ``/topic/`` link on the page (except links inside a per-thread
    pager, which duplicate the main link) is scraped in full as a Thread.

    Bug fix: the original referenced a global ``baseurl`` that is defined
    nowhere in this module, so the first thread link raised NameError.  The
    base URL is now rebuilt from the document itself — PyQuery records the
    fetch URL in ``base_url`` when constructed with ``url=``.
    """
    board = Board(title=d("h1").text())
    urlparts = urlparse(d.base_url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    for thread_link in d("a[href^='/topic/']").items():
        # Pager links repeat the thread's URL; skip them to avoid duplicates.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    return board
def scrape_thread_from_url(url):
    """Scrape a thread (following pagination) at *url* into one Thread.

    The first page produces the Thread object; each further page's posts
    are appended onto its ``children``.
    """
    print("Scraping thread from url: {}".format(url))
    thread = None
    for document in get_paged_document(url):
        page = scrape_thread_from_document(document)
        if not thread:
            thread = page
        else:
            thread.children = thread.children + page.children
    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread
def scrape_thread_from_document(d):
    """Build a Thread from one already-fetched thread page *d*.

    Each ``article.post-entry`` yields one Post with its author, timestamp,
    body HTML and (optional) signature HTML.

    Robustness fix: PyQuery's ``.html()`` returns None for an empty element
    or empty selection, so the original's unconditional ``.strip()`` calls
    crashed on posts with an empty signature block or a missing content
    container — exactly the kind of malformed markup noted below.
    """
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        # Strip the signature out of the body so it is stored separately.
        post_content_container.remove(".signature")
        if signature:
            signature_html = signature.html()
            # .html() is None for an element with no inner markup.
            signature = signature_html.strip() if signature_html else None
        else:
            signature = None
        if date_element.find("time"):
            # Newer markup carries a machine-parseable date inside <time>.
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            # Older markup uses the fixed format in ``time_format``.
            timestamp = mktime(strptime(date_element.text(), time_format))
        body_html = post_content_container.html()
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            # Guard against a missing/empty content container (broken HTML).
            body=body_html.strip() if body_html else ""
        ))
    return thread