From 4cd9b22eb96ad018c1fed415318ec22d9ada9870 Mon Sep 17 00:00:00 2001
From: Adrian Malacoda
Date: Sun, 27 Nov 2016 17:42:46 -0600
Subject: [PATCH] Use a loop to iterate thread/board pages, not recursion.

For large threads, the recursion can cause a stack overflow.

Also, since we no longer make the HTTP request in the same function that
does the scraping, we need to limit @retry to the function that actually
performs the HTTP call, as that is what we want to retry.
---
 tge/scrapers/yuku.py | 75 +++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 25 deletions(-)

diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py
index 3d9ebd3..91f1df7 100644
--- a/tge/scrapers/yuku.py
+++ b/tge/scrapers/yuku.py
@@ -13,60 +13,92 @@ def can_scrape_url (url):
 def scrape (url):
     path = urlparse(url).path
     if path.startswith("/topic/"):
-        return scrape_thread(url)
+        return scrape_thread_from_url(url)
     elif path.startswith("/forums/"):
-        return scrape_board(url)
+        return scrape_board_from_url(url)
     elif (not path) or path == "/":
         return scrape_index(url)
 
 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
+def get_document (url):
+    return pq(url=url)
+
 def scrape_index (url):
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
-    d = pq(url=url)
+    d = get_document(url=url)
     forum = Forum(title=d("title").text())
 
     for category_element in d("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
 
     return forum
 
-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_board (url):
+def scrape_board_from_url (url):
     print("Scraping board from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    d = pq(url=url)
+    board = None
+    while True:
+        d = get_document(url=url)
+
+        if not board:
+            board = scrape_board_from_document(d, baseurl)
+        else:
+            board.children = board.children + scrape_board_from_document(d, baseurl).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
+    return board
+
+def scrape_board_from_document (d, baseurl):
     board = Board(title=d("h1").text())
 
     for thread_link in d("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
 
-        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
+        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
         board.children.append(thread)
 
-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
-
     return board
 
-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_thread (url):
+def scrape_thread_from_url (url):
     print("Scraping thread from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    d = pq(url=url)
+    thread = None
+    while True:
+        d = get_document(url=url)
+
+        if not thread:
+            thread = scrape_thread_from_document(d)
+        else:
+            thread.children = thread.children + scrape_thread_from_document(d).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
+    return thread
+
+def scrape_thread_from_document (d):
     thread = Thread(title=d("h2").eq(0).text())
     for post_entry in d("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
@@ -102,11 +134,4 @@ def scrape_thread (url):
             body=post_content_container.html().strip()
         ))
 
-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
-
     return thread
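
Note (not part of the patch): the sketch below distills the pattern this commit applies: pagination driven by a loop instead of recursion, with @retry confined to the one function that performs the HTTP fetch. It reuses the retrying and pyquery calls already present in the module, but fetch_page, scrape_page, and scrape_all_pages are illustrative stand-ins, not identifiers from the codebase.

    from retrying import retry
    from pyquery import PyQuery as pq

    @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
    def fetch_page (url):
        # Only the HTTP fetch is retried; a bug in the scraping logic
        # raises immediately instead of being retried with backoff.
        return pq(url=url)

    def scrape_page (d):
        # Hypothetical per-page extraction; the real code builds
        # Board/Thread objects from the same document.
        return [post.text() for post in d("article.post-entry").items()]

    def scrape_all_pages (url, baseurl):
        items = []
        while True:                       # constant stack depth, however many pages
            d = fetch_page(url)
            items.extend(scrape_page(d))  # accumulate on the heap, not the call stack
            nextlink = d("a[accesskey=n]")
            if not nextlink:              # an empty pyquery result is falsy
                return items
            url = "{}{}".format(baseurl, nextlink.attr.href)

Because each board or thread is now scraped by exactly one call, the removed 'if not urlparts.query.startswith("page="):' guard, which only kept recursive inner calls from printing the summary line, needs no replacement; the summary prints once, after the loop ends.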