dry up pagination logic using a generator
This commit is contained in:
parent
4cd9b22eb9
commit
acdf659e4a
@ -23,6 +23,21 @@ def scrape (url):
|
|||||||
def get_document (url):
    """Fetch *url* and return it as a parsed pq document.

    Thin wrapper around the pq constructor so callers (and the
    pagination generator) have a single fetch point.
    """
    document = pq(url=url)
    return document
||||||
|
|
||||||
|
def get_paged_document (url):
    """Generate the document for *url* and for every following page.

    Yields each page's pq document in order, then follows the page's
    "next" link (the anchor with accesskey "n") until no such link
    exists.  Next-page hrefs are site-relative, so they are re-rooted
    on the scheme + host of the starting URL.
    """
    parts = urlparse(url)
    base = "{}://{}".format(parts.scheme, parts.netloc)

    next_url = url
    while next_url is not None:
        d = get_document(url=next_url)
        yield d

        nextlink = d("a[accesskey=n]")
        if nextlink:
            next_url = "{}{}".format(base, nextlink.attr.href)
            print(" --> Following next page link to: {}".format(next_url))
        else:
            # No "next" anchor on this page: we reached the last page.
            next_url = None
def scrape_index (url):
|
def scrape_index (url):
|
||||||
print("Scraping forum index from url: {}".format(url))
|
print("Scraping forum index from url: {}".format(url))
|
||||||
urlparts = urlparse(url)
|
urlparts = urlparse(url)
|
||||||
@ -42,25 +57,14 @@ def scrape_index (url):
|
|||||||
|
|
||||||
def scrape_board_from_url (url):
    """Scrape a complete board (all of its pages) starting at *url*.

    The first page produces the board object itself; each subsequent
    page contributes only its threads, which are appended to the first
    page's children.  Returns the assembled board.
    """
    print("Scraping board from url: {}".format(url))

    board = None
    for page in get_paged_document(url):
        scraped = scrape_board_from_document(page)
        if board:
            # Later pages: keep the original board, accumulate threads.
            board.children = board.children + scraped.children
        else:
            # First page establishes the board object.
            board = scraped

    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
    return board
@ -76,25 +80,14 @@ def scrape_board_from_document (d):
|
|||||||
|
|
||||||
def scrape_thread_from_url (url):
    """Scrape a complete thread (all of its pages) starting at *url*.

    Mirrors scrape_board_from_url: the first page yields the thread
    object, later pages contribute only their posts, which are appended
    to the first page's children.  Returns the assembled thread.
    """
    print("Scraping thread from url: {}".format(url))

    thread = None
    for page in get_paged_document(url):
        scraped = scrape_thread_from_document(page)
        if thread:
            # Later pages: keep the original thread, accumulate posts.
            thread.children = thread.children + scraped.children
        else:
            # First page establishes the thread object.
            thread = scraped

    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
    return thread
|
Loading…
x
Reference in New Issue
Block a user