# Yuku (.yuku.com) forum scraper.

from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from pyquery import PyQuery as pq
def can_scrape_url(url):
    """Report whether this scraper knows how to handle *url* (any yuku.com host)."""
    supported_domain = ".yuku.com"
    return supported_domain in url
def scrape(url):
    """Dispatch *url* to the matching scraper based on its URL path.

    Returns a Thread for /topic/ pages, a Board for /forums/ pages, a Forum
    for the site root, and None for any other path.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread(url)
    if path.startswith("/forums/"):
        return scrape_board(url)
    if path in ("", "/"):
        return scrape_index(url)
def scrape_index(url):
    """Scrape the forum index page at *url* into a Forum model tree.

    Each category block on the index page becomes a Category, and every
    board linked inside it is scraped recursively into a Board appended to
    that category's children.
    """
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)
    d = pq(url=url)
    forum = Forum(title=d("title").text())
    for category_element in d("div.span9 > div.row-fluid").items():
        # BUG FIX: `.text` on a PyQuery selection is the bound method, not
        # the heading string — it must be called (as is done elsewhere in
        # this file, e.g. for the board/thread titles).
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        # BUG FIX: iterate with .items() so each link is a PyQuery object;
        # plain iteration yields lxml elements, which have no `.attr`
        # attribute and would crash on `board_link.attr.href` below.
        for board_link in category_element.find("a[href^='/forums/']").items():
            board = scrape_board("{}://{}{}".format(
                urlparts.scheme, urlparts.netloc, board_link.attr.href))
            category.children.append(board)
        # Per-category progress message; its wording implies it belongs
        # inside the category loop (the original's indentation was lost).
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board(url):
    """Scrape one board page at *url* into a Board, following pagination.

    Each topic link on the page is scraped recursively into a Thread; when a
    "next page" link (accesskey=n) exists, that page's threads are appended
    as well.
    """
    print("Scraping board from url: {}".format(url))
    parts = urlparse(url)
    origin = "{}://{}".format(parts.scheme, parts.netloc)
    doc = pq(url=url)
    board = Board(title=doc("h1").text())
    for topic_link in doc("a[href^='/topic/']").items():
        # Links inside a .topic-pager point at later pages of an existing
        # topic, not at a distinct topic — skip them.
        if topic_link.closest(".topic-pager"):
            continue
        board.children.append(scrape_thread("{}{}".format(origin, topic_link.attr.href)))
    next_page = doc("a[accesskey=n]")
    if next_page:
        board.children = board.children + scrape_board("{}{}".format(origin, next_page.attr.href)).children
    # Recursive calls carry a "page=" query; only the top-level (first-page)
    # call reports the final totals.
    if not parts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
def scrape_thread(url):
    """Scrape one thread page at *url* into a Thread, following pagination.

    Each post entry becomes a Post with its author User; when a "next page"
    link (accesskey=n) exists, that page's posts are appended as well.
    """
    print("Scraping thread from url: {}".format(url))
    parts = urlparse(url)
    origin = "{}://{}".format(parts.scheme, parts.netloc)
    doc = pq(url=url)
    thread = Thread(title=doc("h2").text())
    for entry in doc("article.post-entry").items():
        # 26 November 2016: Yuku emits unclosed <article> tags, so each
        # "post entry" selection actually spans the remainder of the thread
        # instead of a single post.  Restricting every sub-selection to
        # .eq(0) picks out only the first match — i.e. the username and
        # body belonging to the post this entry really starts with.
        author = User(name=entry("header > p > a").eq(0).text())
        body = entry(".post-content-container").eq(0).text()
        thread.children.append(Post(author=author, body=body))
    next_page = doc("a[accesskey=n]")
    if next_page:
        thread.children = thread.children + scrape_thread("{}{}".format(origin, next_page.attr.href)).children
    # Recursive calls carry a "page=" query; only the top-level (first-page)
    # call reports the final totals.
    if not parts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread