dry up pagination logic using a generator
commit acdf659e4a
parent 4cd9b22eb9
@@ -23,6 +23,21 @@ def scrape (url):
 def get_document (url):
     return pq(url=url)
 
+def get_paged_document (url):
+    urlparts = urlparse(url)
+    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
+
+    while True:
+        d = get_document(url=url)
+        yield d
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
 def scrape_index (url):
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
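For reference, here is a self-contained sketch of the new generator as it could run on its own. The pyquery and urllib.parse imports are assumptions inferred from the pq() and urlparse() calls in the diff; the a[accesskey=n] selector and the URL handling mirror the added code above.

# Standalone sketch (assumed imports; not part of the commit itself).
from urllib.parse import urlparse
from pyquery import PyQuery as pq

def get_document(url):
    # Fetch the page and wrap it in a PyQuery document.
    return pq(url=url)

def get_paged_document(url):
    # Yield the document for each page, following the access-key "next"
    # link (a[accesskey=n]) until no such link remains.
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    while True:
        d = get_document(url=url)
        yield d

        nextlink = d("a[accesskey=n]")
        if not nextlink:
            break

        # The next link's href is site-relative, so prepend scheme and host.
        url = "{}{}".format(baseurl, nextlink.attr.href)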
@@ -42,25 +57,14 @@ def scrape_index (url):
 
 def scrape_board_from_url (url):
     print("Scraping board from url: {}".format(url))
-    urlparts = urlparse(url)
-    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     board = None
-    while True:
-        d = get_document(url=url)
-
+    for d in get_paged_document(url):
         if not board:
             board = scrape_board_from_document(d)
         else:
             board.children = board.children + scrape_board_from_document(d).children
 
-        nextlink = d("a[accesskey=n]")
-        if not nextlink:
-            break
-
-        url = "{}{}".format(baseurl, nextlink.attr.href)
-        print(" --> Following next page link to: {}".format(url))
-
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board
 
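A hedged usage sketch of the caller pattern after this change; the board URL below is a placeholder, and the title lookup is only there to show that each page is fetched lazily as the loop advances.

# Hypothetical usage (placeholder URL): iterate over every page of a board
# and print each page's <title> as it is fetched.
board_url = "https://forum.example.com/board?id=42"
for page_number, d in enumerate(get_paged_document(board_url), start=1):
    print("page {}: {}".format(page_number, d("title").text()))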
@@ -76,25 +80,14 @@ def scrape_board_from_document (d):
 
 def scrape_thread_from_url (url):
     print("Scraping thread from url: {}".format(url))
-    urlparts = urlparse(url)
-    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     thread = None
-    while True:
-        d = get_document(url=url)
-
+    for d in get_paged_document(url):
         if not thread:
             thread = scrape_thread_from_document(d)
         else:
             thread.children = thread.children + scrape_thread_from_document(d).children
 
-        nextlink = d("a[accesskey=n]")
-        if not nextlink:
-            break
-
-        url = "{}{}".format(baseurl, nextlink.attr.href)
-        print(" --> Following next page link to: {}".format(url))
-
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread
 
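Both refactored callers share the same accumulation pattern: the first page's scrape result becomes the returned object, and later pages only contribute their children. Below is a toy illustration of that pattern, using a hypothetical merge_pages helper and fake stand-ins for the scraped objects; none of this is part of the commit.

from types import SimpleNamespace

def merge_pages(pages, scrape_page):
    # First page establishes the object; later pages only append children.
    merged = None
    for d in pages:
        scraped = scrape_page(d)
        if merged is None:
            merged = scraped
        else:
            merged.children = merged.children + scraped.children
    return merged

# Fake "pages" and a fake per-page scraper stand in for PyQuery documents.
def fake_scrape(d):
    return SimpleNamespace(title="Example board", children=[d])

result = merge_pages(["page1", "page2"], fake_scrape)
print(result.title, result.children)  # Example board ['page1', 'page2']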