diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py index 4a5e22d..ed63cfc 100644 --- a/tge/scrapers/yuku.py +++ b/tge/scrapers/yuku.py @@ -26,7 +26,7 @@ def get_document (url): def get_paged_document (url): urlparts = urlparse(url) baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) - + while True: d = get_document(url=url) yield d @@ -61,14 +61,17 @@ def scrape_board_from_url (url): board = None for d in get_paged_document(url): if not board: - board = scrape_board_from_document(d) + board = scrape_board_from_document(url, d) else: - board.children = board.children + scrape_board_from_document(d).children + board.children = board.children + scrape_board_from_document(url, d).children print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children))) return board -def scrape_board_from_document (d): +def scrape_board_from_document (url, d): + urlparts = urlparse(url) + baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) + board = Board(title=d("h1").text()) for thread_link in d("a[href^='/topic/']").items(): if thread_link.closest(".topic-pager"):