109 lines
4.1 KiB
Python
Raw Normal View History

from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
time_format = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True when *url* looks like a Yuku forum URL (.fr.yuku.com)."""
    return url.find(".fr.yuku.com") != -1
def scrape(url):
    """Dispatch *url* to the matching scraper based on its path.

    /topic/...  -> scrape_thread
    /forums/... -> scrape_board
    "" or "/"   -> scrape_index
    Anything else returns None.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread(url)
    if path.startswith("/forums/"):
        return scrape_board(url)
    if path in ("", "/"):
        return scrape_index(url)
    return None
def scrape_index(url):
    """Scrape a Yuku forum index page into a Forum tree.

    Builds a Forum whose categories each hold the boards linked from that
    category's section; every linked board is scraped in full through
    scrape_board().
    """
    print("Scraping forum index from url: {}".format(url))
    parts = urlparse(url)
    doc = pq(url=url)
    forum = Forum(title=doc("title").text())
    for section in doc("div.span9 > div.row-fluid").items():
        category = Category(title=section.find("h3").text())
        forum.categories.append(category)
        for link in section.find("a[href^='/forums/']").items():
            # Board links are site-relative; rebuild an absolute URL.
            board = scrape_board("{}://{}{}".format(parts.scheme, parts.netloc, link.attr.href))
            board.description = link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum
def scrape_board(url):
    """Scrape one board page (and any following pages) into a Board.

    Follows the "next" pager link (accesskey=n) recursively and folds the
    threads from later pages into this Board's children.
    """
    print("Scraping board from url: {}".format(url))
    parts = urlparse(url)
    origin = "{}://{}".format(parts.scheme, parts.netloc)
    doc = pq(url=url)
    board = Board(title=doc("h1").eq(0).text())
    for link in doc("a[href^='/topic/']").items():
        # The pager widget also contains /topic/ links; skip those.
        if link.closest(".topic-pager"):
            continue
        board.children.append(scrape_thread("{}{}".format(origin, link.attr.href)))
    nextlink = doc("a[accesskey=n]")
    if nextlink:
        board.children = board.children + scrape_board("{}{}".format(origin, nextlink.attr.href)).children
    # Only the first page (no page= query) reports completion, so the
    # message appears once with the full thread count.
    if not parts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
def scrape_thread (url):
    """Scrape a thread page (and its continuation pages) into a Thread.

    Each post becomes a Post carrying its author (User), a Unix timestamp
    and the post body HTML; the "next" pager link (accesskey=n) is
    followed recursively and later pages' posts are merged in.
    """
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    d = pq(url=url)
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        # Grab the signature, then strip it out of the post body so the body
        # HTML below no longer contains it; it is passed to User() instead.
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
        # Two date markups appear in the wild: a <time> element (handed to
        # dateutil's flexible parser) or, presumably on older pages, a bare
        # string matching time_format — TODO confirm both still occur.
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), time_format))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    # Recurse into the next page, if any, and merge its posts.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
    # Only the first page (no page= query) prints the summary, so it shows
    # the total post count after all recursion has finished.
    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread