from urllib.parse import urlparse
from time import strptime, mktime

import dateutil.parser
from pyquery import PyQuery as pq
from retrying import retry

from ..model import User, Category, Forum, Board, Post, Thread

# Date format used by posts that lack a machine-readable <time> element,
# e.g. "Nov 26 16 10:42 PM".
time_format = "%b %d %y %I:%M %p"


def can_scrape_url (url):
    return ".fr.yuku.com" in url


def scrape (url):
    # Dispatch on the URL path: /topic/ pages are threads, /forums/ pages
    # are boards, and the bare root is the forum index.
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread_from_url(url)
    elif path.startswith("/forums/"):
        return scrape_board_from_url(url)
    elif (not path) or path == "/":
        return scrape_index(url)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_document (url):
    # Fetch and parse a page, retrying with exponential backoff (1s to 10s).
    return pq(url=url)


def get_paged_document (url):
    # Yield each page of a paginated document, following the "next" link
    # (marked with accesskey="n") until there are no more pages.
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    while True:
        d = get_document(url=url)
        yield d
        nextlink = d("a[accesskey=n]")
        if not nextlink:
            break
        url = "{}{}".format(baseurl, nextlink.attr.href)
        print(" --> Following next page link to: {}".format(url))


def scrape_index (url):
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)
    d = get_document(url=url)
    forum = Forum(title=d("title").text())
    for category_element in d("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        for board_link in category_element.find("a[href^='/forums/']").items():
            board = scrape_board_from_url("{}://{}{}".format(
                urlparts.scheme, urlparts.netloc, board_link.attr.href))
            board.description = board_link.closest("div").find("p").eq(0).text()
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum


def scrape_board_from_url (url):
    print("Scraping board from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    board = None
    for d in get_paged_document(url):
        if not board:
            board = scrape_board_from_document(d, baseurl)
        else:
            board.children = board.children + scrape_board_from_document(d, baseurl).children
    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board


def scrape_board_from_document (d, baseurl):
    # baseurl is needed to resolve the relative /topic/ links on the page.
    board = Board(title=d("h1").text())
    for thread_link in d("a[href^='/topic/']").items():
        # Skip per-thread pagination links; they point into the middle of
        # threads that are already scraped from their first page.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    return board


def scrape_thread_from_url (url):
    print("Scraping thread from url: {}".format(url))
    thread = None
    for d in get_paged_document(url):
        if not thread:
            thread = scrape_thread_from_document(d)
        else:
            thread.children = thread.children + scrape_thread_from_document(d).children
    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread


def scrape_thread_from_document (d):
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic:
        # <article>'s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        date_element = post_entry.find(".date").eq(0)
        post_content_container = post_entry.find(".post-content-container").eq(0)
        user_header = post_entry.find("header").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            signature = signature.html().strip()
        else:
            signature = None
        # Prefer the machine-readable <time> element when present; otherwise
        # fall back to parsing the display text against time_format.
        if date_element.find("time"):
            timestamp = dateutil.parser.parse(date_element.text()).timestamp()
        else:
            timestamp = mktime(strptime(date_element.text(), time_format))
        thread.children.append(Post(
            author=User(
                name=user_header.find("p > a").eq(0).text(),
                avatar=user_header.find("img[alt='avatar']").attr.src,
                title=user_header.find(".auto-title").text(),
                subtitle=user_header.find(".custom_title").text(),
                signature=signature
            ),
            timestamp=timestamp,
            body=post_content_container.html().strip()
        ))
    return thread
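
# A minimal usage sketch, assuming this module is imported as "yuku" from a
# sibling module in the same package (the relative ..model import means it
# can't be run directly as a script). The "yuku" module name and the example
# URL are assumptions for illustration:
#
#     from . import yuku
#
#     url = "http://example.fr.yuku.com/"
#     if yuku.can_scrape_url(url):
#         result = yuku.scrape(url)  # a Forum, Board, or Thread model
#         print("Scraped: {}".format(result.title))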