the-great-escape/tge/scrapers/yuku.py

from time import strptime, mktime
from urllib.parse import urljoin, urlparse

import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry

from ..model import User, Category, Forum, Board, Post, Thread

# strptime() pattern for post dates rendered as plain text, i.e. abbreviated
# month, day, two-digit year, 12-hour clock and AM/PM marker
# (for example "Nov 26 16 03:45 PM"). It is the fallback used when a post's
# date element contains no <time> tag.
time_format = "%b %d %y %I:%M %p"

def can_scrape_url (url):
    """Tell whether this scraper recognises ``url``.

    The check is a plain substring search for ".fr.yuku.com" anywhere in
    the given string; the URL is not parsed.
    """
    yuku_marker = ".fr.yuku.com"
    return url.find(yuku_marker) != -1

def scrape (url):
    """Dispatch ``url`` to the scraper that matches its path.

    * paths starting with "/topic/" are scraped as a thread,
    * paths starting with "/forums/" are scraped as a board,
    * an empty path or "/" is scraped as the forum index.

    Returns whatever the selected scraper returns, or None when the path
    matches none of the above.
    """
    route = urlparse(url).path

    if route.startswith("/topic/"):
        return scrape_thread_from_url(url)
    if route.startswith("/forums/"):
        return scrape_board_from_url(url)
    if route in ("", "/"):
        return scrape_index(url)

    # Unrecognised path: nothing to scrape.
    return None

@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document (url):
    """Fetch ``url`` and return it as a PyQuery document.

    The call is wrapped by ``retry`` with exponential wait settings
    (multiplier 1000, maximum 10000); no stop condition is passed to the
    decorator here.
    """
    document = pq(url=url)
    return document

def get_paged_document (url):
    """Yield the document at ``url`` followed by each subsequent page.

    After a page is yielded, its "next page" anchor (``a[accesskey=n]``) is
    looked up; iteration ends when the page has no such anchor or the
    anchor carries no ``href``.

    Args:
        url: Address of the first page.

    Yields:
        The object returned by ``get_document`` for each page, in order.
    """
    while True:
        d = get_document(url=url)
        yield d

        nextlink = d("a[accesskey=n]")
        if not nextlink:
            break

        href = nextlink.attr.href
        if not href:
            # A next link without a target cannot be followed. Formatting
            # it into a URL would produce an address ending in "None".
            break

        # Resolve against the page just fetched so that root-relative,
        # page-relative and absolute hrefs all produce a valid URL.
        url = urljoin(url, href)
        print(" --> Following next page link to: {}".format(url))

def scrape_index (url):
    """Scrape the forum index page at ``url`` into a Forum.

    Every ``div.span9 > div.row-fluid`` element becomes a Category titled
    after its ``h3``. Every link inside it whose href starts with
    "/forums/" is scraped as a board via ``scrape_board_from_url``; the
    board's description is taken from the first ``p`` under the link's
    closest ``div``.

    Returns the populated Forum.
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    scheme, host = parts.scheme, parts.netloc

    index_page = get_document(url=url)
    forum = Forum(title=index_page("title").text())

    for section in index_page("div.span9 > div.row-fluid").items():
        category = Category(title=section.find("h3").text())
        forum.categories.append(category)

        for link in section.find("a[href^='/forums/']").items():
            board_url = "{}://{}{}".format(scheme, host, link.attr.href)
            board = scrape_board_from_url(board_url)
            board.description = link.closest("div").find("p").eq(0).text()
            category.children.append(board)

        print("Finished scraping all boards in category: {}".format(category.title))

    return forum

def scrape_board_from_url (url):
    """Scrape a board, following its pagination, starting at ``url``.

    The first page supplies the board object; the children scraped from
    each later page are concatenated onto that board's children.

    Returns the combined board.
    """
    print("Scraping board from url: {}".format(url))

    board = None
    for page in get_paged_document(url):
        page_board = scrape_board_from_document(page)
        if not board:
            # Nothing kept so far: this page's board becomes the result.
            board = page_board
            continue
        board.children = board.children + page_board.children

    thread_count = len(board.children)
    print("Finished scraping board: {} ({} threads)".format(board.title, thread_count))
    return board

def scrape_board_from_document (d, baseurl=None):
    """Build a Board from one page of a board listing.

    The board title is the text of the page's ``h1``. Every link whose
    href starts with "/topic/" is scraped as a thread through
    ``scrape_thread_from_url`` and appended to the board's children, except
    links located inside a ``.topic-pager`` element.

    Args:
        d: Document for the board page.
        baseurl: Optional URL that thread links are resolved against. When
            omitted, the document's own ``base_url`` is used (PyQuery sets
            it for documents loaded with ``url=``).

    Returns:
        The Board with one child per scraped thread link.

    Raises:
        ValueError: If no base URL is given and the document has none, so
            the relative thread links cannot be turned into fetchable URLs.
    """
    if baseurl is None:
        baseurl = getattr(d, "base_url", None)
    if not baseurl:
        raise ValueError("Cannot resolve thread links: no base URL available for this document")

    board = Board(title=d("h1").text())
    for thread_link in d("a[href^='/topic/']").items():
        # Links inside the per-topic pager are skipped.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url(urljoin(baseurl, thread_link.attr.href))
        board.children.append(thread)

    return board

def scrape_thread_from_url (url):
    """Scrape a thread, following its pagination, starting at ``url``.

    The first page supplies the thread object; the children scraped from
    each later page are concatenated onto that thread's children.

    Returns the combined thread.
    """
    print("Scraping thread from url: {}".format(url))

    thread = None
    for page in get_paged_document(url):
        page_thread = scrape_thread_from_document(page)
        if not thread:
            # Nothing kept so far: this page's thread becomes the result.
            thread = page_thread
            continue
        thread.children = thread.children + page_thread.children

    post_count = len(thread.children)
    print("Finished scraping thread: {} ({} posts)".format(thread.title, post_count))
    return thread

def scrape_thread_from_document (d):
    """Build a Thread from one page of a topic.

    The thread title is the text of the first ``h2``. Each
    ``article.post-entry`` element yields one Post whose author, timestamp
    and body are read from the first matching header, date and content
    container under that element.

    Returns the Thread with its posts appended in document order.
    """
    thread = Thread(title=d("h2").eq(0).text())

    for article in d("article.post-entry").items():
        # Workaround noted on 26 November 2016: Yuku's HTML leaves <article>
        # tags unclosed, so a lookup under one post also matches the rest
        # of the thread. Only the first hit of each lookup belongs to the
        # current post, hence the .eq(0) everywhere below.
        header = article.find("header").eq(0)
        date_node = article.find(".date").eq(0)
        content = article.find(".post-content-container").eq(0)

        # Capture the signature first, then strip signatures from the
        # container so they do not end up in the post body.
        signature_node = content.find(".signature").eq(0)
        content.remove(".signature")
        signature = signature_node.html().strip() if signature_node else None

        # Dates holding a <time> tag go through dateutil; all others are
        # parsed with the module's strptime pattern.
        date_text = date_node.text()
        if date_node.find("time"):
            timestamp = dateutil.parser.parse(date_text).timestamp()
        else:
            timestamp = mktime(strptime(date_text, time_format))

        author = User(
            name=header.find("p > a").eq(0).text(),
            avatar=header.find("img[alt='avatar']").attr.src,
            title=header.find(".auto-title").text(),
            subtitle=header.find(".custom_title").text(),
            signature=signature
        )
        body = content.html().strip()

        thread.children.append(Post(author=author, timestamp=timestamp, body=body))

    return thread