Fix signature parsing; use HTML instead of text. Unfortunately, there's a lot of garbage here that we'll have to clean up.

This commit is contained in:
Adrian Malacoda 2016-11-27 00:03:30 -06:00
parent 6fb7980218
commit b304297019

View File

@ -65,9 +65,18 @@ def scrape_thread (url):
# returns the rest of the thread's contents instead of just that post.
# So we need to pick out only the first (username/signature/postbody)
# to get around this.
post_content_container = post_entry.find(".post-content-container").eq(0)
signature = post_content_container.find(".signature").eq(0)
post_content_container.remove(".signature")
if signature:
signature = signature.html().strip()
else:
signature = None
thread.children.append(Post(
author=User(name=post_entry("header > p > a").eq(0).text()),
body=post_entry(".post-content-container").eq(0).text()
author=User(name=post_entry.find("header > p > a").eq(0).text(), signature=signature),
body=post_content_container.html().strip()
))
nextlink = d("a[accesskey=n]")