Use only the first matching h1/h2 element for board and thread titles
This commit is contained in:
parent
ea46ae8853
commit
741573d30a
@ -36,7 +36,7 @@ def scrape_board (url):
|
|||||||
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
||||||
|
|
||||||
d = pq(url=url)
|
d = pq(url=url)
|
||||||
board = Board(title=d("h1").text())
|
board = Board(title=d("h1").eq(0).text())
|
||||||
for thread_link in d("a[href^='/topic/']").items():
|
for thread_link in d("a[href^='/topic/']").items():
|
||||||
if thread_link.closest(".topic-pager"):
|
if thread_link.closest(".topic-pager"):
|
||||||
continue
|
continue
|
||||||
@ -58,7 +58,7 @@ def scrape_thread (url):
|
|||||||
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
|
||||||
|
|
||||||
d = pq(url=url)
|
d = pq(url=url)
|
||||||
thread = Thread(title=d("h2").text())
|
thread = Thread(title=d("h2").eq(0).text())
|
||||||
for post_entry in d("article.post-entry").items():
|
for post_entry in d("article.post-entry").items():
|
||||||
# 26 November 2016: Yuku's broken HTML is breaking this parsing logic
|
# 26 November 2016: Yuku's broken HTML is breaking this parsing logic
|
||||||
# <article>'s aren't being closed correctly so each selector actually
|
# <article>'s aren't being closed correctly so each selector actually
|
||||||
|
Loading…
x
Reference in New Issue
Block a user