"""Scraper for Yuku Forumer forums.""" # pylint: disable=no-member from urllib.parse import urlparse from time import strptime, mktime import dateutil.parser from pyquery import PyQuery as pq from retrying import retry from ..model import User, Category, Forum, Board, Post, Thread TIME_FORMAT = "%b %d %y %I:%M %p" def can_scrape_url(url): """Returns true if this url can be scraped by this scraper.""" return ".fr.yuku.com" in url def scrape(url): """Scrapes the URL into an object.""" path = urlparse(url).path if path.startswith("/topic/"): return scrape_thread_from_url(url) elif path.startswith("/forums/"): return scrape_board_from_url(url) elif (not path) or path == "/": return scrape_index(url) @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000) def get_document(url): """Returns a pyquery document for the specified url, retrying if necessary.""" return pq(url=url) def get_paged_document(url): """Returns a generator that yields all pages of the specified url.""" urlparts = urlparse(url) baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) while True: doc = get_document(url=url) yield doc nextlink = doc("a[accesskey=n]") if not nextlink: break url = "{}{}".format(baseurl, nextlink.attr.href) print(" --> Following next page link to: {}".format(url)) def scrape_index(url): """Scrapes the forum index at url into a Forum object.""" print("Scraping forum index from url: {}".format(url)) urlparts = urlparse(url) doc = get_document(url=url) forum = Forum(title=doc("title").text()) for category_element in doc("div.span9 > div.row-fluid").items(): category = Category(title=category_element.find("h3").text()) forum.categories.append(category) for board_link in category_element.find("a[href^='/forums/']").items(): full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href) board = scrape_board_from_url(full_url) board.description = board_link.closest("div").find("p").eq(0).text() category.children.append(board) print("Finished scraping all boards in category: {}".format(category.title)) return forum def scrape_board_from_url(url): """Scrapes the board index at url into a Board object.""" print("Scraping board from url: {}".format(url)) board = None for doc in get_paged_document(url): if not board: board = scrape_board_from_document(url, doc) else: board.children = board.children + scrape_board_from_document(url, doc).children print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children))) return board def scrape_board_from_document(url, doc): """Scrapes the given document into a Board object.""" urlparts = urlparse(url) baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) board = Board(title=doc("h1").text()) for thread_link in doc("a[href^='/topic/']").items(): if thread_link.closest(".topic-pager"): continue thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href)) board.children.append(thread) return board def scrape_thread_from_url(url): """Scrapes the given thread url into a Thread object.""" print("Scraping thread from url: {}".format(url)) thread = None for doc in get_paged_document(url): if not thread: thread = scrape_thread_from_document(doc) else: thread.children = thread.children + scrape_thread_from_document(doc).children print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children))) return thread def scrape_thread_from_document(doc): """Scrapes the given document into a Thread object.""" thread = Thread(title=doc("h2").eq(0).text()) for post_entry in doc("article.post-entry").items(): # 26 November 2016: Yuku's broken HTML is breaking this parsing logic #
's aren't being closed correctly so each selector actually # returns the rest of the thread's contents instead of just that post. # So we need to pick out only the first (username/signature/postbody) # to get around this. date_element = post_entry.find(".date").eq(0) post_content_container = post_entry.find(".post-content-container").eq(0) user_header = post_entry.find("header").eq(0) signature = post_content_container.find(".signature").eq(0) post_content_container.remove(".signature") if signature: signature = signature.html().strip() else: signature = None if date_element.find("time"): timestamp = dateutil.parser.parse(date_element.text()).timestamp() else: timestamp = mktime(strptime(date_element.text(), TIME_FORMAT)) thread.children.append(Post( author=User( name=user_header.find("p > a").eq(0).text(), avatar=user_header.find("img[alt='avatar']").attr.src, title=user_header.find(".auto-title").text(), subtitle=user_header.find(".custom_title").text(), signature=signature ), timestamp=timestamp, body=post_content_container.html().strip() )) return thread