From 4cd9b22eb96ad018c1fed415318ec22d9ada9870 Mon Sep 17 00:00:00 2001
From: Adrian Malacoda
Date: Sun, 27 Nov 2016 17:42:46 -0600
Subject: [PATCH] Use a loop to iterate thread/board pages, not recursion.

For large threads, the recursion can cause a stack overflow.

Also, since we no longer make the HTTP request in the same function that
does the scraping, we need to limit @retry to the function that actually
performs the HTTP call, as that is what we want to retry.
---
 tge/scrapers/yuku.py | 75 +++++++++++++++++++++++++++++---------------
 1 file changed, 50 insertions(+), 25 deletions(-)

diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py
index 3d9ebd3..91f1df7 100644
--- a/tge/scrapers/yuku.py
+++ b/tge/scrapers/yuku.py
@@ -13,60 +13,92 @@ def can_scrape_url (url):
 def scrape (url):
     path = urlparse(url).path
     if path.startswith("/topic/"):
-        return scrape_thread(url)
+        return scrape_thread_from_url(url)
     elif path.startswith("/forums/"):
-        return scrape_board(url)
+        return scrape_board_from_url(url)
     elif (not path) or path == "/":
         return scrape_index(url)
 
 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
+def get_document (url):
+    return pq(url=url)
+
 def scrape_index (url):
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
-    d = pq(url=url)
+    d = get_document(url=url)
     forum = Forum(title=d("title").text())
 
     for category_element in d("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
 
     return forum
 
-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_board (url):
+def scrape_board_from_url (url):
     print("Scraping board from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    d = pq(url=url)
+    board = None
+    while True:
+        d = get_document(url=url)
+
+        if not board:
+            board = scrape_board_from_document(d, baseurl)
+        else:
+            board.children = board.children + scrape_board_from_document(d, baseurl).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
+    return board
+
+def scrape_board_from_document (d, baseurl):
     board = Board(title=d("h1").text())
 
     for thread_link in d("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
 
-        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
+        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
         board.children.append(thread)
 
-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
-
     return board
 
-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_thread (url):
+def scrape_thread_from_url (url):
     print("Scraping thread from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
-    d = pq(url=url)
+    thread = None
+    while True:
+        d = get_document(url=url)
+
+        if not thread:
+            thread = scrape_thread_from_document(d)
+        else:
+            thread.children = thread.children + scrape_thread_from_document(d).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
+    return thread
+
+def scrape_thread_from_document (d):
     thread = Thread(title=d("h2").eq(0).text())
     for post_entry in d("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
@@ -102,11 +134,4 @@ def scrape_thread (url):
             body=post_content_container.html().strip()
         ))
 
-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
-
     return thread
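
Note (not part of the patch): the sketch below distills the pattern this commit applies: pagination driven by a loop instead of recursion, with @retry confined to the one function that performs the HTTP fetch. It reuses the retrying and pyquery calls already present in the module, but fetch_page, scrape_page, and scrape_all_pages are illustrative stand-ins, not identifiers from the codebase.

    from retrying import retry
    from pyquery import PyQuery as pq

    @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
    def fetch_page (url):
        # Only the HTTP fetch is retried; a bug in the scraping logic
        # raises immediately instead of being retried with backoff.
        return pq(url=url)

    def scrape_page (d):
        # Hypothetical per-page extraction; the real code builds
        # Board/Thread objects from the same document.
        return [post.text() for post in d("article.post-entry").items()]

    def scrape_all_pages (url, baseurl):
        items = []
        while True:                       # constant stack depth, however many pages
            d = fetch_page(url)
            items.extend(scrape_page(d))  # accumulate on the heap, not the call stack
            nextlink = d("a[accesskey=n]")
            if not nextlink:              # an empty pyquery result is falsy
                return items
            url = "{}{}".format(baseurl, nextlink.attr.href)

Because each board or thread is now scraped by exactly one call, the removed 'if not urlparts.query.startswith("page="):' guard, which only kept recursive inner calls from printing the summary line, needs no replacement; the summary prints once, after the loop ends.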