2016-12-16 00:29:59 -06:00

150 lines
5.4 KiB
Python

"""Scraper for Yuku Forumer forums."""
# pylint: disable=no-member
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry
from ..model import User, Category, Forum, Board, Post, Thread
TIME_FORMAT = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True if *url* points at a Yuku Forumer forum this scraper handles.

    The test is anchored to the URL's hostname rather than a plain substring
    search, so a URL that merely mentions ".fr.yuku.com" in its path or query
    string (e.g. "http://example.com/?ref=.fr.yuku.com") is not mistaken for
    a scrapable forum. ``urlparse().hostname`` is lowercased and has any port
    stripped, so "BOARDS.fr.yuku.com:80" still matches.
    """
    hostname = urlparse(url).hostname or ""
    return hostname.endswith(".fr.yuku.com")
def scrape(url):
    """Scrape *url* into a Thread, Board, or Forum object.

    Dispatches on the URL path: "/topic/..." yields a thread,
    "/forums/..." yields a board, and the site root yields the whole
    forum index. Any other path returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    if path.startswith("/forums/"):
        return scrape_board_from_url(url)
    if not path or path == "/":
        return scrape_index(url)
    return None
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Fetch *url* and return it as a pyquery document.

    Transient fetch failures are retried by the @retry decorator with
    exponential backoff (starting at 1s, capped at 10s).
    """
    document = pq(url=url)
    return document
def get_paged_document(url):
    """Yield pyquery documents for *url* and every following "next" page.

    Pagination is followed through the ``<a accesskey="n">`` link Yuku
    renders on each page; the generator stops when a page has no such link.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)
    page_url = url
    while True:
        document = get_document(url=page_url)
        yield document
        next_link = document("a[accesskey=n]")
        if not next_link:
            return
        # The link's href is site-relative, so rebuild an absolute URL.
        page_url = "{}{}".format(base, next_link.attr.href)
        print(" --> Following next page link to: {}".format(page_url))
def scrape_index(url):
    """Scrape the forum index at *url* into a Forum object.

    Walks each category section on the index page and recursively scrapes
    every board linked from it, attaching the boards to their categories.
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    document = get_document(url=url)
    forum = Forum(title=document("title").text())
    for category_element in document("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        for board_link in category_element.find("a[href^='/forums/']").items():
            # Board links are site-relative; rebuild an absolute URL.
            board_url = "{}://{}{}".format(parts.scheme, parts.netloc,
                                           board_link.attr.href)
            board = scrape_board_from_url(board_url)
            # The board description lives next to the link, not on the
            # board page itself, so fill it in from the index markup.
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board_from_url(url):
    """Scrape the (possibly paginated) board at *url* into a Board object.

    The first page provides the Board itself; threads from any subsequent
    pages are appended onto that Board's children.
    """
    print("Scraping board from url: {}".format(url))
    board = None
    for document in get_paged_document(url):
        page_board = scrape_board_from_document(url, document)
        if board is None:
            board = page_board
        else:
            board.children = board.children + page_board.children
    print("Finished scraping board: {} ({} threads)".format(
        board.title, len(board.children)))
    return board
def scrape_board_from_document(url, doc):
    """Scrape one already-fetched board page (*doc*) into a Board object.

    *url* is only used to reconstruct absolute thread URLs from the page's
    site-relative links.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)
    board = Board(title=doc("h1").text())
    for thread_link in doc("a[href^='/topic/']").items():
        # Pager widgets repeat the thread link for each page; skip those
        # so each thread is scraped exactly once.
        if thread_link.closest(".topic-pager"):
            continue
        thread_url = "{}{}".format(base, thread_link.attr.href)
        board.children.append(scrape_thread_from_url(thread_url))
    return board
def scrape_thread_from_url(url):
    """Scrape the (possibly paginated) thread at *url* into a Thread object.

    The first page provides the Thread itself; posts from any subsequent
    pages are appended onto that Thread's children.
    """
    print("Scraping thread from url: {}".format(url))
    thread = None
    for document in get_paged_document(url):
        page_thread = scrape_thread_from_document(document)
        if thread is None:
            thread = page_thread
        else:
            thread.children = thread.children + page_thread.children
    print("Finished scraping thread: {} ({} posts)".format(
        thread.title, len(thread.children)))
    return thread
def scrape_thread_from_document(doc):
    """Scrapes the given document into a Thread object.

    Builds one Post per <article class="post-entry"> element, populating
    the author's User record, an epoch timestamp, and the post body HTML.
    """
    thread = Thread(title=doc("h2").eq(0).text())
    for post_entry in doc("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        # Strip the signature out of the body so it isn't duplicated there.
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
        if date_element.find("time"):
            # Date wrapped in a <time> element: free-form, parsed by dateutil.
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            # Plain-text date in Yuku's fixed format, e.g. "Nov 26 16 10:05 PM".
            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    return thread