131 lines
4.6 KiB
Python
Raw Normal View History

from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
2016-11-27 13:03:13 -06:00
from retrying import retry
time_format = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True when *url* belongs to a yuku.com forum this scraper handles."""
    return url.find(".fr.yuku.com") != -1
def scrape(url):
    """Dispatch *url* to the appropriate scraper based on its path.

    /topic/...  -> a single thread
    /forums/... -> a single board
    "" or "/"   -> the whole forum index
    Any other path falls through and implicitly returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    if path.startswith("/forums/"):
        return scrape_board_from_url(url)
    if (not path) or path == "/":
        return scrape_index(url)
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document(url):
    """Fetch *url* and return it parsed as a PyQuery document.

    Retries on any exception with exponential backoff starting at 1s and
    capped at 10s per wait.
    """
    document = pq(url=url)
    return document
def get_paged_document(url):
    """Yield the PyQuery document at *url*, then every "next" page after it.

    Pagination is followed through the pager link ``a[accesskey=n]``; the
    generator stops when a page has no such link.
    """
    parts = urlparse(url)
    baseurl = "{}://{}".format(parts.scheme, parts.netloc)
    while True:
        document = get_document(url=url)
        yield document
        pager_link = document("a[accesskey=n]")
        if not pager_link:
            return
        url = "{}{}".format(baseurl, pager_link.attr.href)
        print(" --> Following next page link to: {}".format(url))
def scrape_index(url):
    """Scrape the forum front page at *url* into a Forum tree.

    Each ``div.span9 > div.row-fluid`` row becomes a Category; every
    ``/forums/`` link inside the row is scraped in full as a Board via
    scrape_board_from_url(), with its description text lifted from the
    surrounding markup.
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    document = get_document(url=url)
    forum = Forum(title=document("title").text())
    for row in document("div.span9 > div.row-fluid").items():
        category = Category(title=row.find("h3").text())
        forum.categories.append(category)
        for board_link in row.find("a[href^='/forums/']").items():
            board_url = "{}://{}{}".format(parts.scheme, parts.netloc, board_link.attr.href)
            board = scrape_board_from_url(board_url)
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board_from_url(url):
    """Scrape a board (following pagination) at *url* into one Board.

    The first page produces the Board object; each further page's threads
    are appended onto its ``children``.
    """
    print("Scraping board from url: {}".format(url))
    board = None
    for document in get_paged_document(url):
        page = scrape_board_from_document(document)
        if not board:
            board = page
        else:
            board.children = board.children + page.children
    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
def scrape_board_from_document(d):
    """Build a Board from one already-fetched board page *d*.

    Every ``/topic/`` link on the page (except links inside a per-thread
    pager, which duplicate the main link) is scraped in full as a Thread.

    Bug fix: the original referenced a global ``baseurl`` that is defined
    nowhere in this module, so the first thread link raised NameError.  The
    base URL is now rebuilt from the document itself — PyQuery records the
    fetch URL in ``base_url`` when constructed with ``url=``.
    """
    board = Board(title=d("h1").text())
    urlparts = urlparse(d.base_url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    for thread_link in d("a[href^='/topic/']").items():
        # Pager links repeat the thread's URL; skip them to avoid duplicates.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    return board
def scrape_thread_from_url(url):
    """Scrape a thread (following pagination) at *url* into one Thread.

    The first page produces the Thread object; each further page's posts
    are appended onto its ``children``.
    """
    print("Scraping thread from url: {}".format(url))
    thread = None
    for document in get_paged_document(url):
        page = scrape_thread_from_document(document)
        if not thread:
            thread = page
        else:
            thread.children = thread.children + page.children
    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread
def scrape_thread_from_document(d):
    """Build a Thread from one already-fetched thread page *d*.

    Each ``article.post-entry`` yields one Post with its author, timestamp,
    body HTML and (optional) signature HTML.

    Robustness fix: PyQuery's ``.html()`` returns None for an empty element
    or empty selection, so the original's unconditional ``.strip()`` calls
    crashed on posts with an empty signature block or a missing content
    container — exactly the kind of malformed markup noted below.
    """
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        # Strip the signature out of the body so it is stored separately.
        post_content_container.remove(".signature")
        if signature:
            signature_html = signature.html()
            # .html() is None for an element with no inner markup.
            signature = signature_html.strip() if signature_html else None
        else:
            signature = None
        if date_element.find("time"):
            # Newer markup carries a machine-parseable date inside <time>.
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            # Older markup uses the fixed format in ``time_format``.
            timestamp = mktime(strptime(date_element.text(), time_format))
        body_html = post_content_container.html()
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            # Guard against a missing/empty content container (broken HTML).
            body=body_html.strip() if body_html else ""
        ))
    return thread