from urllib.parse import urlparse

from pyquery import PyQuery as pq

from ..model import Board, Category, Forum, Post, Thread, User


def can_scrape_url(url):
    """Return True if this scraper knows how to handle *url* (yuku.com forums)."""
    return ".yuku.com" in url


def scrape(url):
    """Dispatch *url* to the appropriate scraper based on its URL path.

    Returns a Thread for /topic/ URLs, a Board for /forums/ URLs, a Forum
    for the site root, or None for any other path.
    """
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread(url)
    elif path.startswith("/forums/"):
        return scrape_board(url)
    elif (not path) or path == "/":
        return scrape_index(url)
    return None  # unrecognized path — explicit so callers see the contract


def scrape_index(url):
    """Scrape the forum front page into a Forum with Categories and Boards."""
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)
    d = pq(url=url)
    forum = Forum(title=d("title").text())
    for category_element in d("div.span9 > div.row-fluid").items():
        category = Category(title=category_element.find("h3").text())
        forum.categories.append(category)
        # Board links are site-relative; rebuild absolute URLs from the index URL.
        for board_link in category_element.find("a[href^='/forums/']").items():
            board = scrape_board("{}://{}{}".format(
                urlparts.scheme, urlparts.netloc, board_link.attr.href))
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))
    return forum


def scrape_board(url):
    """Scrape one board page into a Board, following pagination recursively."""
    print("Scraping board from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    d = pq(url=url)
    board = Board(title=d("h1").eq(0).text())
    for thread_link in d("a[href^='/topic/']").items():
        # Pager widgets also contain /topic/ links; skip those duplicates.
        if thread_link.closest(".topic-pager"):
            continue
        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
        board.children.append(thread)
    # accesskey=n marks the "next page" link; recurse and merge its threads.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        board.children.extend(
            scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children)
    # Paginated recursive calls carry a "page=" query; only the top-level
    # call prints the completion summary, so it appears exactly once.
    if not urlparts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(
            board.title, len(board.children)))
    return board


def scrape_thread(url):
    """Scrape one thread page into a Thread of Posts, following pagination."""
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
    d = pq(url=url)
    thread = Thread(title=d("h2").eq(0).text())
    for post_entry in d("article.post-entry").items():
        # 26 November 2016: Yuku's broken HTML is breaking this parsing logic.
        # <article>s aren't being closed correctly so each selector actually
        # returns the rest of the thread's contents instead of just that post.
        # So we need to pick out only the first (username/signature/postbody)
        # to get around this.
        post_content_container = post_entry.find(".post-content-container").eq(0)
        signature = post_content_container.find(".signature").eq(0)
        post_content_container.remove(".signature")
        if signature:
            # pyquery's .html() returns None for an element with no inner
            # HTML; guard before .strip() so an empty signature element
            # doesn't abort the whole scrape with an AttributeError.
            signature = (signature.html() or "").strip()
        else:
            signature = None
        thread.children.append(Post(
            author=User(
                name=post_entry.find("header > p > a").eq(0).text(),
                signature=signature),
            # Same None-guard as above: an empty post body must not crash.
            body=(post_content_container.html() or "").strip()
        ))
    # Follow the "next page" link (accesskey=n) and merge its posts.
    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children.extend(
            scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children)
    # Only the top-level (non-"page=") call prints the summary once.
    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(
            thread.title, len(thread.children)))
    return thread