"""Scraper for Yuku Forumer forums."""
# pylint: disable=no-member
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry
from ..model import User, Category, Forum, Board, Post, Thread
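
# Yuku's legacy post dates look like "Nov 27 16 01:03 PM" (illustrative
# example): abbreviated month, day, two-digit year, 12-hour time.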
TIME_FORMAT = "%b %d %y %I:%M %p"


def can_scrape_url(url):
    """Returns true if this url can be scraped by this scraper."""
    return ".fr.yuku.com" in url


def scrape(url):
    """Scrapes the URL into an object."""
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    elif path.startswith("/forums/"):
        return scrape_board_from_url(url)
    elif (not path) or path == "/":
        return scrape_index(url)
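
# Dispatch examples (hypothetical URLs):
#   scrape("http://example.fr.yuku.com/")              -> Forum (whole index)
#   scrape("http://example.fr.yuku.com/forums/2-misc") -> Board
#   scrape("http://example.fr.yuku.com/topic/1-hello") -> Thread
# Any other path falls through and returns None.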
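

# retrying backs off exponentially here (2^n * 1 s between attempts, capped
# at 10 s) and retries indefinitely, since no stop condition is given.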
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Returns a pyquery document for the specified url, retrying if necessary."""
    return pq(url=url)


def get_paged_document(url):
    """Returns a generator that yields all pages of the specified url."""
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    while True:
        doc = get_document(url=url)
        yield doc
        nextlink = doc("a[accesskey=n]")
        if not nextlink:
            break
        url = "{}{}".format(baseurl, nextlink.attr.href)
        print(" --> Following next page link to: {}".format(url))


def scrape_index(url):
    """Scrapes the forum index at url into a Forum object."""
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)
    doc = get_document(url=url)
    forum = Forum(title=doc("title").text())
    for category_element in doc("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        for board_link in category_element.find("a[href^='/forums/']").items():
            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
            board = scrape_board_from_url(full_url)
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
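
# Board descriptions only appear on the index page (in a sibling <p> of each
# board link), which is why they're attached here rather than when the board
# page itself is scraped.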


def scrape_board_from_url(url):
    """Scrapes the board index at url into a Board object."""
    print("Scraping board from url: {}".format(url))
    board = None
    for doc in get_paged_document(url):
        if not board:
            board = scrape_board_from_document(url, doc)
        else:
            board.children = board.children + scrape_board_from_document(url, doc).children
    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
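
# Later pages only contribute extra threads, so only children are merged;
# the board title is taken from the first page.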


def scrape_board_from_document(url, doc):
    """Scrapes the given document into a Board object."""
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    board = Board(title=doc("h1").text())
    for thread_link in doc("a[href^='/topic/']").items():
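        # Pager links ("1 2 3 ...") point back into the same topic; skip
        # them so each thread is only scraped once.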
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    return board


def scrape_thread_from_url(url):
    """Scrapes the given thread url into a Thread object."""
    print("Scraping thread from url: {}".format(url))
    thread = None
    for doc in get_paged_document(url):
        if not thread:
            thread = scrape_thread_from_document(doc)
        else:
            thread.children = thread.children + scrape_thread_from_document(doc).children
    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread


def scrape_thread_from_document(doc):
    """Scrapes the given document into a Thread object."""
    thread = Thread(title=doc("h2").eq(0).text())
    for post_entry in doc("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic.
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
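        # Newer markup carries a <time> element whose text dateutil can
        # parse; older posts only have a bare date string in TIME_FORMAT.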
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    return thread
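

# Minimal usage sketch (hypothetical URL; the module and package names below
# are assumptions, not part of this file):
#
#     from scrapers import yuku_forumer
#
#     if yuku_forumer.can_scrape_url("http://example.fr.yuku.com/"):
#         forum = yuku_forumer.scrape("http://example.fr.yuku.com/")
#         for category in forum.categories:
#             print(category.title, len(category.children))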