From acdf659e4a45490a28fb6a52be6c8ef0696e32c6 Mon Sep 17 00:00:00 2001
From: Adrian Malacoda
Date: Sun, 27 Nov 2016 17:50:25 -0600
Subject: [PATCH] dry up pagination logic using a generator

---
 tge/scrapers/yuku.py | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py
index 91f1df7..4a5e22d 100644
--- a/tge/scrapers/yuku.py
+++ b/tge/scrapers/yuku.py
@@ -23,6 +23,21 @@ def scrape (url):
 def get_document (url):
     return pq(url=url)
 
+def get_paged_document (url):
+    urlparts = urlparse(url)
+    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
+
+    while True:
+        d = get_document(url=url)
+        yield d
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
 def scrape_index (url):
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)
@@ -42,25 +57,14 @@ def scrape_index (url):
 
 def scrape_board_from_url (url):
     print("Scraping board from url: {}".format(url))
-    urlparts = urlparse(url)
-    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     board = None
-    while True:
-        d = get_document(url=url)
-
+    for d in get_paged_document(url):
         if not board:
             board = scrape_board_from_document(d)
         else:
             board.children = board.children + scrape_board_from_document(d).children
 
-        nextlink = d("a[accesskey=n]")
-        if not nextlink:
-            break
-
-        url = "{}{}".format(baseurl, nextlink.attr.href)
-        print(" --> Following next page link to: {}".format(url))
-
     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board
 
@@ -76,25 +80,14 @@ def scrape_board_from_document (d):
 
 def scrape_thread_from_url (url):
     print("Scraping thread from url: {}".format(url))
-    urlparts = urlparse(url)
-    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
 
     thread = None
-    while True:
-        d = get_document(url=url)
-
+    for d in get_paged_document(url):
         if not thread:
             thread = scrape_thread_from_document(d)
         else:
             thread.children = thread.children + scrape_thread_from_document(d).children
 
-        nextlink = d("a[accesskey=n]")
-        if not nextlink:
-            break
-
-        url = "{}{}".format(baseurl, nextlink.attr.href)
-        print(" --> Following next page link to: {}".format(url))
-
     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread
 
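
Note on the refactor: both scraper entry points now share a single pagination generator, so each caller reduces to a plain for-loop over page documents. Below is a minimal, self-contained sketch of the extracted pattern, assuming Python 3 (urllib.parse) and the pyquery package that yuku.py's pq() documents come from; the collect_titles helper and its "a.topic-title" selector are hypothetical, included only to show the consumer side of the generator.

from urllib.parse import urlparse

from pyquery import PyQuery as pq

def get_document(url):
    # One page fetch; mirrors get_document() in the patch.
    return pq(url=url)

def get_paged_document(url):
    # Yield the document for each page, following the "next" link
    # (selected via accesskey="n", as in the patch) until it disappears.
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    while True:
        d = get_document(url=url)
        yield d

        nextlink = d("a[accesskey=n]")
        if not nextlink:
            break

        # Rebuild an absolute URL by prepending scheme and host to the href.
        url = "{}{}".format(baseurl, nextlink.attr.href)

def collect_titles(url):
    # Hypothetical consumer: merge one field across all pages, the same shape
    # as scrape_board_from_url / scrape_thread_from_url after the patch.
    titles = []
    for d in get_paged_document(url):
        titles.extend(a.text() for a in d("a.topic-title").items())
    return titles

Because the generator is lazy, a caller only fetches as many pages as it consumes; for example, list(itertools.islice(get_paged_document(url), 3)) requests no more than the first three pages.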