Only use the first h1/h2 element when extracting board and thread titles

This commit is contained in:
Adrian Malacoda 2016-11-27 00:16:21 -06:00
parent ea46ae8853
commit 741573d30a

View File

@@ -36,7 +36,7 @@ def scrape_board (url):
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
d = pq(url=url)
-board = Board(title=d("h1").text())
+board = Board(title=d("h1").eq(0).text())
for thread_link in d("a[href^='/topic/']").items():
if thread_link.closest(".topic-pager"):
continue
@@ -58,7 +58,7 @@ def scrape_thread (url):
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
d = pq(url=url)
-thread = Thread(title=d("h2").text())
+thread = Thread(title=d("h2").eq(0).text())
for post_entry in d("article.post-entry").items():
# 26 November 2016: Yuku's broken HTML is breaking this parsing logic
# <article>'s aren't being closed correctly so each selector actually