dry up pagination logic using a generator
This commit is contained in:
parent
4cd9b22eb9
commit
acdf659e4a
@ -23,6 +23,21 @@ def scrape (url):
|
|||||||
def get_document (url):
    """Fetch *url* and return it as a parsed pq document.

    Thin wrapper around the pq constructor so callers (and the
    pagination generator) have a single fetch point.
    """
    document = pq(url=url)
    return document
||||||
|
|
||||||
|
def get_paged_document (url):
    """Generate the document for *url* and for every following page.

    Yields each page's pq document in order, then follows the page's
    "next" link (the anchor with accesskey "n") until no such link
    exists.  Next-page hrefs are site-relative, so they are re-rooted
    on the scheme + host of the starting URL.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)

    next_url = url
    while next_url is not None:
        d = get_document(url=next_url)
        yield d

        nextlink = d("a[accesskey=n]")
        if nextlink:
            next_url = "{}{}".format(base, nextlink.attr.href)
            print(" --> Following next page link to: {}".format(next_url))
        else:
            # No "next" anchor on this page: we reached the last page.
            next_url = None
def scrape_index (url):
|
def scrape_index (url):
|
||||||
print("Scraping forum index from url: {}".format(url))
|
print("Scraping forum index from url: {}".format(url))
|
||||||
urlparts = urlparse(url)
|
urlparts = urlparse(url)
|
||||||
@ -42,25 +57,14 @@ def scrape_index (url):
|
|||||||
|
|
||||||
def scrape_board_from_url (url):
    """Scrape a complete board (all of its pages) starting at *url*.

    The first page produces the board object itself; each subsequent
    page contributes only its threads, which are appended to the first
    page's children.  Returns the assembled board.
    """
    print("Scraping board from url: {}".format(url))

    board = None
    for page in get_paged_document(url):
        scraped = scrape_board_from_document(page)
        if board:
            # Later pages: keep the original board, accumulate threads.
            board.children = board.children + scraped.children
        else:
            # First page establishes the board object.
            board = scraped

    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
@ -76,25 +80,14 @@ def scrape_board_from_document (d):
|
|||||||
|
|
||||||
def scrape_thread_from_url (url):
    """Scrape a complete thread (all of its pages) starting at *url*.

    Mirrors scrape_board_from_url: the first page yields the thread
    object, later pages contribute only their posts, which are appended
    to the first page's children.  Returns the assembled thread.
    """
    print("Scraping thread from url: {}".format(url))

    thread = None
    for page in get_paged_document(url):
        scraped = scrape_thread_from_document(page)
        if thread:
            # Later pages: keep the original thread, accumulate posts.
            thread.children = thread.children + scraped.children
        else:
            # First page establishes the thread object.
            thread = scraped

    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread
|
Loading…
x
Reference in New Issue
Block a user