Pass url to scrape_board_from_document so it can build baseurl
This commit is contained in:
parent
acdf659e4a
commit
77775ae0be
@@ -26,7 +26,7 @@ def get_document (url):
|
|||||||
def get_paged_document (url):
|
def get_paged_document (url):
|
||||||
urlparts = urlparse(url)
|
urlparts = urlparse(url)
|
||||||
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
d = get_document(url=url)
|
d = get_document(url=url)
|
||||||
yield d
|
yield d
|
||||||
@@ -61,14 +61,17 @@ def scrape_board_from_url (url):
|
|||||||
board = None
|
board = None
|
||||||
for d in get_paged_document(url):
|
for d in get_paged_document(url):
|
||||||
if not board:
|
if not board:
|
||||||
board = scrape_board_from_document(d)
|
board = scrape_board_from_document(url, d)
|
||||||
else:
|
else:
|
||||||
board.children = board.children + scrape_board_from_document(d).children
|
board.children = board.children + scrape_board_from_document(url, d).children
|
||||||
|
|
||||||
print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
|
print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
|
||||||
return board
|
return board
|
||||||
|
|
||||||
def scrape_board_from_document (d):
|
def scrape_board_from_document (url, d):
|
||||||
|
urlparts = urlparse(url)
|
||||||
|
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
||||||
|
|
||||||
board = Board(title=d("h1").text())
|
board = Board(title=d("h1").text())
|
||||||
for thread_link in d("a[href^='/topic/']").items():
|
for thread_link in d("a[href^='/topic/']").items():
|
||||||
if thread_link.closest(".topic-pager"):
|
if thread_link.closest(".topic-pager"):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user