Fix signature parsing; use HTML instead of text. Unfortunately, there's a lot of garbage here that we'll have to clean up.
This commit is contained in:
parent
6fb7980218
commit
b304297019
@@ -65,9 +65,18 @@ def scrape_thread (url):
|
||||
# returns the rest of the thread's contents instead of just that post.
|
||||
# So we need to pick out only the first (username/signature/postbody)
|
||||
# to get around this.
|
||||
post_content_container = post_entry.find(".post-content-container").eq(0)
|
||||
signature = post_content_container.find(".signature").eq(0)
|
||||
post_content_container.remove(".signature")
|
||||
|
||||
if signature:
|
||||
signature = signature.html().strip()
|
||||
else:
|
||||
signature = None
|
||||
|
||||
thread.children.append(Post(
|
||||
author=User(name=post_entry("header > p > a").eq(0).text()),
|
||||
body=post_entry(".post-content-container").eq(0).text()
|
||||
author=User(name=post_entry.find("header > p > a").eq(0).text(), signature=signature),
|
||||
body=post_content_container.html().strip()
|
||||
))
|
||||
|
||||
nextlink = d("a[accesskey=n]")
|
||||
|
Loading…
x
Reference in New Issue
Block a user