From 741573d30ac05494977c16b59869a764a135daa3 Mon Sep 17 00:00:00 2001 From: Adrian Malacoda Date: Sun, 27 Nov 2016 00:16:21 -0600 Subject: [PATCH] only want first h1/h2 etc --- tge/scrapers/yuku.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py index 2043471..c7b8c33 100644 --- a/tge/scrapers/yuku.py +++ b/tge/scrapers/yuku.py @@ -36,7 +36,7 @@ def scrape_board (url): baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) d = pq(url=url) - board = Board(title=d("h1").text()) + board = Board(title=d("h1").eq(0).text()) for thread_link in d("a[href^='/topic/']").items(): if thread_link.closest(".topic-pager"): continue @@ -58,7 +58,7 @@ def scrape_thread (url): baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc) d = pq(url=url) - thread = Thread(title=d("h2").text()) + thread = Thread(title=d("h2").eq(0).text()) for post_entry in d("article.post-entry").items(): # 26 November 2016: Yuku's broken HTML is breaking this parsing logic #
<article>'s aren't being closed correctly so each selector actually