######################################################
#             ARIADOS v1.0 Yuku Forum Backup         #
# Because they ruined our forum and we want it back. #
######################################################

import os
import time

import lxml  # noqa: F401 -- needed so BeautifulSoup's "lxml" parser is available
import requests
from bs4 import BeautifulSoup

# --- Configuration -----------------------------------------------------------

# Forum URL (no trailing slash).
boardURL = "http://trsrockin.fr.yuku.com"

# Backup save location.
# Backup will be saved in the following folder structure:
#   Main Forum Name > Subforum Name > Thread Name > Page.htm
saveDir = "C:\\"

# ARIADOS will attempt to back up all threads within the following
# inclusive numeric-id range.
threadFirst = 180
threadLast = 6543

# Delay (seconds) between server requests, to avoid hammering the host.
timeDelay = 2

# --- MAIN PROGRAM ------------------------------------------------------------

print("--------------------------------")
print("ARIADOS v1.0 - Yuku Forum Backup")
print("--------------------------------")
print(" ")
print("ARIADOS will attempt to back up threads " + str(threadFirst) + " through " + str(threadLast))
print(" ")
print("--------------------------------")
print(" ")

iT = threadFirst  # Thread iterator
while iT <= threadLast:
    iP = 1  # Page iterator
    time.sleep(timeDelay)

    # Get first page of thread.
    soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT)).text, "lxml")

    # Only process the page if it isn't a 404 (Yuku serves this <title>
    # for threads that no longer exist).
    if soup.title.string != "We are going to be back soon":
        # Breadcrumb trail supplies: board name > subforum name > thread name.
        boardIndex = soup.select(".breadcrumb > li:nth-of-type(1) > a")[0]["title"]
        currentForum = soup.select(".breadcrumb > li:nth-of-type(2) > a")[0]["title"]
        currentThread = soup.select(".breadcrumb > li:nth-of-type(3) > a")[0]["title"]
        savePath = saveDir + boardIndex + "\\" + currentForum + "\\" + currentThread
        # NOTE(review): forum/thread titles may contain characters that are
        # invalid in Windows paths -- consider sanitizing before mkdir.

        # Create the whole directory chain in one call (no-op when it already
        # exists); replaces the original three isdir()/mkdir() steps.
        os.makedirs(savePath, exist_ok=True)

        # Save page 1 unless a previous run already did.
        if not os.path.isfile(savePath + "\\" + str(iP) + ".htm"):
            print("Accessing thread: " + currentThread)
            with open(savePath + "\\" + str(iP) + ".htm", "x", encoding='utf8') as file:
                file.write(soup.prettify())

        # Multi-page thread?  The second-to-last pagination link holds the
        # highest page number (the last link is "next").
        if len(soup.select(".pagination")) > 0:
            pageLinks = soup.select("div.pagination ul li a")
            threadEnd = int(pageLinks[-2].string)

            # Loop through the remaining thread pages.
            while iP < threadEnd:
                iP += 1
                time.sleep(timeDelay)
                # Get next page of thread.  Use the same "lxml" parser as
                # page 1 (the original mixed "lxml" and "html.parser", which
                # can pretty-print differently-structured trees).
                soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT) + "/?page=" + str(iP)).text, "lxml")

                # Check whether page exists on disk and if not, save it.
                if not os.path.isfile(savePath + "\\" + str(iP) + ".htm"):
                    print(" Saving page " + str(iP) + " of " + str(threadEnd))
                    with open(savePath + "\\" + str(iP) + ".htm", "x", encoding='utf8') as file:
                        file.write(soup.prettify())

        print("Thread saved at " + savePath)
        print(" ")

        # Record completed thread id.  Write "\n", not "\r\n": the file is
        # opened in text mode, so Windows already translates "\n" to "\r\n"
        # on write -- the original "\r\n" produced "\r\r\n" in the file.
        with open(saveDir + boardIndex + "\\" + "progress.txt", "a") as file:
            file.write(str(iT) + "\n")

    iT += 1

print("Backup operation completed.")