Fix for Yuku's broken postbit markup
This commit is contained in:
parent
f4540d4030
commit
eabf099f47
@ -20,11 +20,11 @@ def scrape_index (url):
|
|||||||
|
|
||||||
d = pq(url=url)
|
d = pq(url=url)
|
||||||
forum = Forum(title=d("title").text())
|
forum = Forum(title=d("title").text())
|
||||||
for category_element in d("div.span9 > div.row-fluid"):
|
for category_element in d("div.span9 > div.row-fluid").items():
|
||||||
category = Category(title=category_element.find("h3").text)
|
category = Category(title=category_element.find("h3").text)
|
||||||
forum.categories.append(category)
|
forum.categories.append(category)
|
||||||
for board_link in pq(category_element)("a[href^='/forums/']"):
|
for board_link in category_element.find("a[href^='/forums/']"):
|
||||||
board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attrib['href']))
|
board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
|
||||||
category.children.append(board)
|
category.children.append(board)
|
||||||
print("Finished scraping all boards in category: {}".format(category.title))
|
print("Finished scraping all boards in category: {}".format(category.title))
|
||||||
|
|
||||||
@ -37,10 +37,10 @@ def scrape_board (url):
|
|||||||
|
|
||||||
d = pq(url=url)
|
d = pq(url=url)
|
||||||
board = Board(title=d("h1").text())
|
board = Board(title=d("h1").text())
|
||||||
for thread_link in d("a[href^='/topic/']"):
|
for thread_link in d("a[href^='/topic/']").items():
|
||||||
if d(thread_link).closest(".topic-pager"):
|
if thread_link.closest(".topic-pager"):
|
||||||
continue
|
continue
|
||||||
thread = scrape_thread("{}{}".format(baseurl, thread_link.attrib['href']))
|
thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
|
||||||
board.children.append(thread)
|
board.children.append(thread)
|
||||||
|
|
||||||
nextlink = d("a[accesskey=n]")
|
nextlink = d("a[accesskey=n]")
|
||||||
@ -59,10 +59,15 @@ def scrape_thread (url):
|
|||||||
|
|
||||||
d = pq(url=url)
|
d = pq(url=url)
|
||||||
thread = Thread(title=d("h2").text())
|
thread = Thread(title=d("h2").text())
|
||||||
for post_entry in d("article.post-entry"):
|
for post_entry in d("article.post-entry").items():
|
||||||
|
# 26 November 2016: Yuku's broken HTML is breaking this parsing logic
|
||||||
|
# <article> elements aren't being closed correctly, so each selector actually
|
||||||
|
# returns the rest of the thread's contents instead of just that post.
|
||||||
|
# So we need to pick out only the first match (username/signature/postbody)
|
||||||
|
# to get around this.
|
||||||
thread.children.append(Post(
|
thread.children.append(Post(
|
||||||
author=User(name=pq(post_entry)("header > p > a").text()),
|
author=User(name=post_entry("header > p > a").eq(0).text()),
|
||||||
body=pq(post_entry)(".post-content-container").text()
|
body=post_entry(".post-content-container").eq(0).text()
|
||||||
))
|
))
|
||||||
|
|
||||||
nextlink = d("a[accesskey=n]")
|
nextlink = d("a[accesskey=n]")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user