######################################################
# ARIADOS v1.0 Yuku Forum Backup                     #
# Because they ruined our forum and we want it back. #
######################################################
#
# Flat script: walks thread IDs threadFirst..threadLast on a Yuku forum,
# saves every page of every live thread as prettified HTML under
# saveDir/<board>/<forum>/<thread>/<page>.htm, and appends each finished
# thread ID to <board>/progress.txt.

import os
import time

import lxml  # noqa: F401 -- backend required by BeautifulSoup's "lxml" parser
import requests
from bs4 import BeautifulSoup

# Forum URL
boardURL = "http://trsrockin.fr.yuku.com"

# Backup save location
# Backup will be saved in the following folder structure:
# Main Forum Name > Subforum Name > Thread Name > Page.htm
saveDir = "C:\\"

# ARIADOS will attempt to back up all threads within the following range
threadFirst = 180
threadLast = 6543

# Specify time delay between server requests (seconds)
timeDelay = 2


# MAIN PROGRAM
print("--------------------------------")
print("ARIADOS v1.0 - Yuku Forum Backup")
print("--------------------------------")
print(" ")
print("ARIADOS will attempt to back up threads " + str(threadFirst) + " through " + str(threadLast))
print(" ")
print("--------------------------------")
print(" ")

for iT in range(threadFirst, threadLast + 1):
    time.sleep(timeDelay)

    # Get first page of thread
    soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT)).text, "lxml")

    # Yuku serves this title for deleted/404 threads -- skip them.
    # Also skip responses with no <title> at all, which would otherwise
    # crash on soup.title.string.
    if soup.title is None or soup.title.string == "We are going to be back soon":
        continue

    # Breadcrumb gives us board > forum > thread names for the folder tree
    boardIndex = soup.select(".breadcrumb > li:nth-of-type(1) > a")[0]["title"]
    currentForum = soup.select(".breadcrumb > li:nth-of-type(2) > a")[0]["title"]
    currentThread = soup.select(".breadcrumb > li:nth-of-type(3) > a")[0]["title"]

    savePath = os.path.join(saveDir, boardIndex, currentForum, currentThread)

    # Create the whole folder hierarchy in one call (no-op if it exists)
    os.makedirs(savePath, exist_ok=True)

    # Check whether page exists and if not, save it
    # (mode "x" still guards against a race: it raises if the file appeared)
    pageFile = os.path.join(savePath, "1.htm")
    if not os.path.isfile(pageFile):
        print("Accessing thread: " + currentThread)
        with open(pageFile, "x", encoding='utf8') as file:
            file.write(soup.prettify())

    # Check number of pages in thread.
    # BUG FIX: the original only assigned threadEnd when a pagination bar
    # was present, yet always ran the page loop -- so threads without
    # pagination reused a stale threadEnd (or hit a NameError on the first
    # thread).  Default to a single page, every thread.
    threadEnd = 1
    pageLinks = soup.select("div.pagination ul li a")
    if len(soup.select(".pagination")) > 0 and len(pageLinks) >= 2:
        # Second-to-last pagination link carries the last page number
        threadEnd = int(pageLinks[-2].string)

    # Loop through remaining thread pages (page 1 was saved above)
    for iP in range(2, threadEnd + 1):
        time.sleep(timeDelay)

        # Get next page of thread.
        # BUG FIX: use the same "lxml" parser as the first page; the
        # original switched to "html.parser" here, which can parse the
        # same markup differently.
        soup = BeautifulSoup(
            requests.get(boardURL + "/topic/" + str(iT) + "/?page=" + str(iP)).text,
            "lxml")

        # Check whether page exists and if not, save it
        pageFile = os.path.join(savePath, str(iP) + ".htm")
        if not os.path.isfile(pageFile):
            print(" Saving page " + str(iP) + " of " + str(threadEnd))
            with open(pageFile, "x", encoding='utf8') as file:
                file.write(soup.prettify())

    print("Thread saved at " + savePath)
    print(" ")
    # Record completed thread IDs so an interrupted run can be audited
    with open(os.path.join(saveDir, boardIndex, "progress.txt"), "a") as file:
        file.write(str(iT) + "\r\n")

print("Backup operation completed.")