dry up pagination logic using a generator

This commit is contained in:
Adrian Malacoda 2016-11-27 17:50:25 -06:00
parent 4cd9b22eb9
commit acdf659e4a

View File

@ -23,6 +23,21 @@ def scrape (url):
def get_document(url):
    """Return the document object that ``pq`` builds for ``url``.

    NOTE(review): ``pq`` is presumably PyQuery, in which case passing
    ``url=`` fetches and parses the page -- confirm against the imports.
    """
    document = pq(url=url)
    return document
def get_paged_document(url):
    """Yield the document at ``url`` followed by every subsequent page.

    After each page is yielded, the "next page" anchor is looked up with
    the ``a[accesskey=n]`` selector. Iteration ends when a page has no
    such anchor, when the anchor carries no usable ``href``, or when the
    link leads back to a page that was already fetched.

    Args:
        url: Address of the first page.

    Yields:
        The document returned by ``get_document`` for each page, in order.
    """
    # Local import so this block does not depend on the file's import list.
    from urllib.parse import urljoin

    visited = set()
    while True:
        visited.add(url)
        d = get_document(url=url)
        yield d

        nextlink = d("a[accesskey=n]")
        if not nextlink:
            return

        href = nextlink.attr.href
        if not href:
            # An anchor without an href gives us nowhere to go.
            return

        # Resolve against the current page so that relative, root-relative
        # and absolute hrefs all produce a valid URL (plain concatenation
        # with scheme://netloc only worked for root-relative hrefs).
        next_url = urljoin(url, href)
        if next_url in visited:
            # Guard against a "next" link that loops back on itself.
            return

        url = next_url
        print(" --> Following next page link to: {}".format(url))
def scrape_index (url):
print("Scraping forum index from url: {}".format(url))
urlparts = urlparse(url)
@ -42,25 +57,14 @@ def scrape_index (url):
def scrape_board_from_url(url):
    """Scrape a board, merging the threads found on all of its pages.

    Pagination is delegated to ``get_paged_document``; this function only
    combines the per-page results produced by
    ``scrape_board_from_document``.

    Args:
        url: Address of the board's first page.

    Returns:
        The board object scraped from the first page, with the ``children``
        of every later page appended to its own ``children``.
    """
    print("Scraping board from url: {}".format(url))

    board = None
    for d in get_paged_document(url):
        page = scrape_board_from_document(d)
        if board is None:
            # The first page supplies the board object itself.
            board = page
        else:
            # Later pages only contribute their children.
            board.children = board.children + page.children

    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
@ -76,25 +80,14 @@ def scrape_board_from_document (d):
def scrape_thread_from_url(url):
    """Scrape a thread, merging the posts found on all of its pages.

    Pagination is delegated to ``get_paged_document``; this function only
    combines the per-page results produced by
    ``scrape_thread_from_document``.

    Args:
        url: Address of the thread's first page.

    Returns:
        The thread object scraped from the first page, with the
        ``children`` of every later page appended to its own ``children``.
    """
    print("Scraping thread from url: {}".format(url))

    thread = None
    for d in get_paged_document(url):
        page = scrape_thread_from_document(d)
        if thread is None:
            # The first page supplies the thread object itself.
            thread = page
        else:
            # Later pages only contribute their children.
            thread.children = thread.children + page.children

    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread