diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py index 42aa24b..b881821 100644 --- a/tge/scrapers/yuku.py +++ b/tge/scrapers/yuku.py @@ -20,11 +20,11 @@ def scrape_index (url): d = pq(url=url) forum = Forum(title=d("title").text()) - for category_element in d("div.span9 > div.row-fluid"): + for category_element in d("div.span9 > div.row-fluid").items(): category = Category(title=category_element.find("h3").text) forum.categories.append(category) - for board_link in pq(category_element)("a[href^='/forums/']"): - board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attrib['href'])) + for board_link in category_element.find("a[href^='/forums/']"): + board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)) category.children.append(board) print("Finished scraping all boards in category: {}".format(category.title)) @@ -37,10 +37,10 @@ def scrape_board (url): d = pq(url=url) board = Board(title=d("h1").text()) - for thread_link in d("a[href^='/topic/']"): - if d(thread_link).closest(".topic-pager"): + for thread_link in d("a[href^='/topic/']").items(): + if thread_link.closest(".topic-pager"): continue - thread = scrape_thread("{}{}".format(baseurl, thread_link.attrib['href'])) + thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href)) board.children.append(thread) nextlink = d("a[accesskey=n]") @@ -59,10 +59,15 @@ def scrape_thread (url): d = pq(url=url) thread = Thread(title=d("h2").text()) - for post_entry in d("article.post-entry"): + for post_entry in d("article.post-entry").items(): + # 26 November 2016: Yuku's broken HTML is breaking this parsing logic + #
Post <article>s aren't being closed correctly so each selector actually + # returns the rest of the thread's contents instead of just that post. + # So we need to pick out only the first (username/signature/postbody) + # to get around this. thread.children.append(Post( - author=User(name=pq(post_entry)("header > p > a").text()), - body=pq(post_entry)(".post-content-container").text() + author=User(name=post_entry("header > p > a").eq(0).text()), + body=post_entry(".post-content-container").eq(0).text() )) nextlink = d("a[accesskey=n]")