Use a loop, not recursion, to iterate over thread/board pages; for large threads the recursive approach can overflow the stack. Also, since we no longer make the HTTP request in the same function that does the scraping, limit @retry to the function that actually performs the HTTP call, since that is what we want to retry.
commit 4cd9b22eb9
parent b67ab06b55
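To make the shape of the change easier to see outside the diff below, here is a minimal sketch of the pattern (not the module's actual code): @retry wraps only the HTTP fetch, and pagination follows the accesskey=n "next" link in a while loop instead of recursing. The retrying/pyquery imports and the Python 3 urllib import are assumptions inferred from the diff; the real functions build Forum/Board/Thread models rather than collecting raw documents, and scrape_pages is an illustrative name that does not appear in the commit.

    from urllib.parse import urlparse

    from pyquery import PyQuery as pq
    from retrying import retry


    @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
    def get_document(url):
        # Only the HTTP fetch is retried, so a transient failure re-requests
        # just this one page instead of re-scraping everything gathered so far.
        return pq(url=url)


    def scrape_pages(url):
        # Illustrative helper (not in the commit): follow "next" links in a
        # loop so a thread with thousands of pages never deepens the call stack.
        urlparts = urlparse(url)
        baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
        pages = []
        while True:
            d = get_document(url=url)
            pages.append(d)
            nextlink = d("a[accesskey=n]")
            if not nextlink:
                break
            url = "{}{}".format(baseurl, nextlink.attr.href)
        return pages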
@@ -13,60 +13,92 @@ def can_scrape_url (url):
 def scrape (url):
     path = urlparse(url).path
     if path.startswith("/topic/"):
-        return scrape_thread(url)
+        return scrape_thread_from_url(url)
     elif path.startswith("/forums/"):
-        return scrape_board(url)
+        return scrape_board_from_url(url)
     elif (not path) or path == "/":
         return scrape_index(url)

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
+def get_document (url):
+    return pq(url=url)
+
 def scrape_index (url):
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)

-    d = pq(url=url)
+    d = get_document(url=url)
     forum = Forum(title=d("title").text())
     for category_element in d("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))

     return forum

-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_board (url):
+def scrape_board_from_url (url):
     print("Scraping board from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    d = pq(url=url)
+    board = None
+    while True:
+        d = get_document(url=url)
+
+        if not board:
+            board = scrape_board_from_document(d)
+        else:
+            board.children = board.children + scrape_board_from_document(d).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
+    return board
+
+def scrape_board_from_document (d):
     board = Board(title=d("h1").text())
     for thread_link in d("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
-        thread = scrape_thread("{}{}".format(baseurl, thread_link.attr.href))
+        thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
         board.children.append(thread)

-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
-
     return board

-@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
-def scrape_thread (url):
+def scrape_thread_from_url (url):
     print("Scraping thread from url: {}".format(url))
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    d = pq(url=url)
+    thread = None
+    while True:
+        d = get_document(url=url)
+
+        if not thread:
+            thread = scrape_thread_from_document(d)
+        else:
+            thread.children = thread.children + scrape_thread_from_document(d).children
+
+        nextlink = d("a[accesskey=n]")
+        if not nextlink:
+            break
+
+        url = "{}{}".format(baseurl, nextlink.attr.href)
+        print(" --> Following next page link to: {}".format(url))
+
+    print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
+    return thread
+
+def scrape_thread_from_document (d):
     thread = Thread(title=d("h2").eq(0).text())
     for post_entry in d("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
@@ -102,11 +134,4 @@ def scrape_thread (url):
             body=post_content_container.html().strip()
         ))

-    nextlink = d("a[accesskey=n]")
-    if nextlink:
-        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
-
-    if not urlparts.query.startswith("page="):
-        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
-
     return thread
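For context, the public entry point is unchanged apart from the renamed helpers: scrape() still dispatches on the URL path. A hypothetical usage sketch, assuming scrape() is imported from the module shown above (the host name and IDs are placeholders, not taken from the commit):

    # Hypothetical usage; host name and IDs are placeholders.
    forum  = scrape("http://example.yuku.com/")           # index page  -> Forum
    board  = scrape("http://example.yuku.com/forums/12")  # board page  -> Board
    thread = scrape("http://example.yuku.com/topic/345")  # thread page -> Thread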