Pass the board URL to scrape_board_from_document so it can derive the site's base URL (scheme and host)

This commit is contained in:
Adrian Malacoda 2016-11-27 17:52:47 -06:00
parent acdf659e4a
commit 77775ae0be

View File

@@ -61,14 +61,17 @@ def scrape_board_from_url (url):
board = None board = None
for d in get_paged_document(url): for d in get_paged_document(url):
if not board: if not board:
board = scrape_board_from_document(d) board = scrape_board_from_document(url, d)
else: else:
board.children = board.children + scrape_board_from_document(d).children board.children = board.children + scrape_board_from_document(url, d).children
print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children))) print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
return board return board
def scrape_board_from_document (d): def scrape_board_from_document (url, d):
urlparts = urlparse(url)
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
board = Board(title=d("h1").text()) board = Board(title=d("h1").text())
for thread_link in d("a[href^='/topic/']").items(): for thread_link in d("a[href^='/topic/']").items():
if thread_link.closest(".topic-pager"): if thread_link.closest(".topic-pager"):