the-great-escape/tge/scrapers/yuku.py

from urllib.parse import urljoin, urlparse

from pyquery import PyQuery as pq

from ..model import User, Category, Forum, Board, Post, Thread

def can_scrape_url (url):
    """Return True when *url* is hosted on a subdomain of yuku.com.

    Only the host part of the URL is inspected, so ".yuku.com" appearing in
    a path, query string, or as a prefix of some other domain does not count.
    URLs given without a scheme (e.g. "name.yuku.com/topic/1") are accepted.
    Malformed URLs that cannot be parsed yield False.
    """
    try:
        parts = urlparse(url)
        if not parts.netloc:
            # Without "//" urlparse puts the host into the path; re-parse as a
            # scheme-relative URL so the host lands in netloc.
            parts = urlparse("//" + url)
        # hostname is already lower-cased and stripped of port/credentials.
        host = parts.hostname or ""
    except ValueError:
        return False
    # Tolerate a fully-qualified trailing dot ("name.yuku.com.").
    return host.rstrip(".").endswith(".yuku.com")

def scrape (url):
    """Dispatch *url* to the scraper that matches its path.

    Paths under "/topic/" go to scrape_thread, paths under "/forums/" go to
    scrape_board, and an empty or root path goes to scrape_index. Any other
    path is not handled and yields None.
    """
    route = urlparse(url).path

    if route.startswith("/topic/"):
        return scrape_thread(url)
    if route.startswith("/forums/"):
        return scrape_board(url)
    if route in ("", "/"):
        return scrape_index(url)

    # Unrecognised path: nothing to scrape.
    return None

def scrape_index (url):
    """Scrape a forum's index page and every board linked from it.

    The page <title> becomes the Forum title. Each element matching
    "div.span9 > div.row-fluid" is treated as one category, titled by the
    text of its direct <h3> child; every link inside it whose href starts
    with "/forums/" is scraped as a board via scrape_board.

    Returns the populated Forum.
    """
    print("Scraping forum index from url: {}".format(url))

    d = pq(url=url)
    forum = Forum(title=d("title").text())
    for category_element in d("div.span9 > div.row-fluid"):
        # find() returns None when the row has no direct <h3> child; compare
        # with `is not None` because lxml element truthiness reflects child
        # count, not existence.
        heading = category_element.find("h3")
        # NOTE(review): a heading-less row gets title None instead of
        # crashing the whole scrape — confirm Category accepts that.
        category = Category(title=heading.text if heading is not None else None)
        forum.categories.append(category)
        for board_link in pq(category_element)("a[href^='/forums/']"):
            # Resolve the site-relative href against the index URL.
            board = scrape_board(urljoin(url, board_link.attrib['href']))
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))

    return forum

def scrape_board (url):
    """Scrape a board and all threads listed on it, across every page.

    The first page's <h1> text becomes the Board title. On each page, every
    link whose href starts with "/topic/" is scraped with scrape_thread,
    except links nested inside a ".topic-pager" element. Pagination follows
    the "a[accesskey=n]" link until there is none, it has no href, or it
    leads to a page that was already visited.

    Returns the Board with its threads in page order.
    """
    print("Scraping board from url: {}".format(url))
    # The summary line is printed only when scraping started at the first
    # page, i.e. the starting URL's query does not begin with "page=".
    started_on_later_page = urlparse(url).query.startswith("page=")

    d = pq(url=url)
    board = Board(title=d("h1").text())

    page_url = url
    visited = {url}
    # Iterate over pages instead of recursing so long boards cannot exhaust
    # the call stack and no throwaway Board is built per page.
    while True:
        for thread_link in d("a[href^='/topic/']"):
            # presumably these are per-topic page shortcuts rather than
            # distinct threads — TODO confirm against the page markup
            if d(thread_link).closest(".topic-pager"):
                continue
            thread = scrape_thread(urljoin(page_url, thread_link.attrib['href']))
            board.children.append(thread)

        nextlink = d("a[accesskey=n]")
        next_href = nextlink.attr("href") if nextlink else None
        if not next_href:
            break
        # urljoin copes with site-relative, page-relative and absolute hrefs.
        next_url = urljoin(page_url, next_href)
        if next_url in visited:
            # A "next" link that loops back would otherwise never terminate.
            break
        visited.add(next_url)
        page_url = next_url
        print("Scraping board from url: {}".format(page_url))
        d = pq(url=page_url)

    if not started_on_later_page:
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))

    return board

def scrape_thread (url):
    """Scrape a thread and all of its posts, across every page.

    The first page's <h2> text becomes the Thread title. Each
    "article.post-entry" element yields one Post whose author is the text of
    "header > p > a" and whose body is the text of ".post-content-container".
    Pagination follows the "a[accesskey=n]" link until there is none, it has
    no href, or it leads to a page that was already visited.

    Returns the Thread with its posts in page order.
    """
    print("Scraping thread from url: {}".format(url))
    # The summary line is printed only when scraping started at the first
    # page, i.e. the starting URL's query does not begin with "page=".
    started_on_later_page = urlparse(url).query.startswith("page=")

    d = pq(url=url)
    thread = Thread(title=d("h2").text())

    page_url = url
    visited = {url}
    # Iterate over pages instead of recursing so long threads cannot exhaust
    # the call stack and no throwaway Thread is built per page.
    while True:
        for post_entry in d("article.post-entry"):
            post = pq(post_entry)
            thread.children.append(Post(
                author=post("header > p > a").text(),
                body=post(".post-content-container").text()
            ))

        nextlink = d("a[accesskey=n]")
        next_href = nextlink.attr("href") if nextlink else None
        if not next_href:
            break
        # urljoin copes with site-relative, page-relative and absolute hrefs.
        next_url = urljoin(page_url, next_href)
        if next_url in visited:
            # A "next" link that loops back would otherwise never terminate.
            break
        visited.add(next_url)
        page_url = next_url
        print("Scraping thread from url: {}".format(page_url))
        d = pq(url=page_url)

    if not started_on_later_page:
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))

    return thread
initial commit for the-great-escape yuku scraper 2016-11-26 23:09:12 -06:00			`from ..model import User, Category, Forum, Board, Post, Thread`
			`from urllib.parse import urlparse`
			`from pyquery import PyQuery as pq`

			`def can_scrape_url (url):`
			`return ".yuku.com" in url`

			`def scrape (url):`
			`path = urlparse(url).path`
			`if path.startswith("/topic/"):`
			`return scrape_thread(url)`
			`elif path.startswith("/forums/"):`
			`return scrape_board(url)`
			`elif (not path) or path == "/":`
			`return scrape_index(url)`

			`def scrape_index (url):`
			`print("Scraping forum index from url: {}".format(url))`
			`urlparts = urlparse(url)`

			`d = pq(url=url)`
			`forum = Forum(title=d("title").text())`
			`for category_element in d("div.span9 > div.row-fluid"):`
			`category = Category(title=category_element.find("h3").text)`
			`forum.categories.append(category)`
			`for board_link in pq(category_element)("a[href^='/forums/']"):`
			`board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attrib['href']))`
			`category.children.append(board)`
			`print("Finished scraping all boards in category: {}".format(category.title))`

			`return forum`

			`def scrape_board (url):`
			`print("Scraping board from url: {}".format(url))`
			`urlparts = urlparse(url)`
			`baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)`

			`d = pq(url=url)`
			`board = Board(title=d("h1").text())`
			`for thread_link in d("a[href^='/topic/']"):`
			`if d(thread_link).closest(".topic-pager"):`
			`continue`
			`thread = scrape_thread("{}{}".format(baseurl, thread_link.attrib['href']))`
			`board.children.append(thread)`

			`nextlink = d("a[accesskey=n]")`
			`if nextlink:`
			`board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children`

			`if not urlparts.query.startswith("page="):`
			`print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))`

			`return board`

			`def scrape_thread (url):`
			`print("Scraping thread from url: {}".format(url))`
			`urlparts = urlparse(url)`
			`baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)`

			`d = pq(url=url)`
			`thread = Thread(title=d("h2").text())`
			`for post_entry in d("article.post-entry"):`
			`thread.children.append(Post(`
			`author=pq(post_entry)("header > p > a").text(),`
			`body=pq(post_entry)(".post-content-container").text()`
			`))`

			`nextlink = d("a[accesskey=n]")`
			`if nextlink:`
			`thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children`

			`if not urlparts.query.startswith("page="):`
			`print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))`

			`return thread`