from time import strptime, mktime
from urllib.parse import urlparse

import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry

from ..model import User, Category, Forum, Board, Post, Thread

# Fallback date format, used for posts whose date isn't wrapped in a <time> element.
time_format = "%b %d %y %I:%M %p"


def can_scrape_url(url):
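    """Return True if this scraper can handle the given URL (Yuku forums under *.fr.yuku.com)."""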
    return ".fr.yuku.com" in url


def scrape(url):
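    """Dispatch to the index, board, or thread scraper based on the URL's path."""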
    path = urlparse(url).path

    if path.startswith("/topic/"):
        return scrape_thread(url)
    elif path.startswith("/forums/"):
        return scrape_board(url)
    elif (not path) or path == "/":
        return scrape_index(url)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_index(url):
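    """Scrape the forum index at `url` into a Forum containing its categories and boards."""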
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)

    d = pq(url=url)
    forum = Forum(title=d("title").text())

    for category_element in d("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)

        for board_link in category_element.find("a[href^='/forums/']").items():
            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
            # The board's description is taken from its listing on the index page.
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)

        print("Finished scraping all boards in category: {}".format(category.title))

    return forum


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_board(url):
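    """Scrape the board at `url` (following pagination) into a Board containing its threads."""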
    print("Scraping board from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    board = Board(title=d("h1").text())

    for thread_link in d("a[href^='/topic/']").items():
        # Skip links that belong to a thread's pager widget rather than the thread list.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)

    # Recurse into the next page of the board, if there is one.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children

    # Only report completion from the first page, not from the recursive per-page calls.
    if not urlparts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))

    return board


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def scrape_thread(url):
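    """Scrape the thread at `url` (following pagination) into a Thread containing its posts."""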
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    thread = Thread(title=d("h2").eq(0).text())

    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic.
        # <article> elements aren't being closed correctly, so each selector
        # actually returns the rest of the thread's contents instead of just
        # that post. To get around this, pick out only the first match
        # (username/signature/post body) for each selector.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")

        if signature:
            signature = signature.html().strip()
        else:
            signature = None

        # Some posts wrap the date in a <time> element; others only have a
        # formatted date string, parsed with the fallback time_format.
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), time_format))

        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))

    # Recurse into the next page of the thread, if there is one.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children

    # Only report completion from the first page, not from the recursive per-page calls.
    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))

    return thread
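

# Example usage (a sketch, not part of the scraper itself). This module relies on
# the relative import of ..model, so it is meant to be imported from its package;
# the module name and URL below are hypothetical.
#
#     from . import yuku_scraper
#
#     url = "http://example.fr.yuku.com/"
#     if yuku_scraper.can_scrape_url(url):
#         forum = yuku_scraper.scrape(url)  # Forum -> categories -> boards -> threads -> posts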