######################################################
# ARIADOS v1.0 Yuku Forum Backup                     #
# Because they ruined our forum and we want it back. #
######################################################

import os
import time

import lxml  # Not used directly, but BeautifulSoup's "lxml" parser needs it installed
import requests
from bs4 import BeautifulSoup
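# The three third-party packages above install as "requests", "beautifulsoup4"
# and "lxml"; something like "pip install requests beautifulsoup4 lxml" should
# cover a missing one.
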
# Forum URL
boardURL = "http://trsrockin.fr.yuku.com"

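# In principle any Yuku board could be pointed at here; everything below is
# derived from this base URL.
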
# Backup save location
# Backup will be saved in the following folder structure:
# Main Forum Name > Subforum Name > Thread Name > Page.htm
saveDir = "C:\\"
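# Folder names come straight from the forum's breadcrumb titles, so a title
# containing a character Windows forbids in paths (e.g. ? * : " |) would make
# the folder creation below fail. If that ever bites, a minimal (hypothetical)
# sanitizer along these lines could be applied to each name first:
#
#   import re
#   def safeName(name):
#       return re.sub(r'[<>:"/\\|?*]', "_", name)
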
# ARIADOS will attempt to back up all threads within the following range
threadFirst = 180
threadLast = 6543

# Time delay between server requests, in seconds
timeDelay = 2


# MAIN PROGRAM
print("--------------------------------")
print("ARIADOS v1.0 - Yuku Forum Backup")
print("--------------------------------")
print(" ")
print("ARIADOS will attempt to back up threads " + str(threadFirst) + " through " + str(threadLast))
print(" ")
print("--------------------------------")
print(" ")

iT = threadFirst  # Thread iterator

while iT <= threadLast:
    iP = 1         # Page iterator
    threadEnd = 1  # Assume a single page until the pagination bar says otherwise

    time.sleep(timeDelay)

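    # Thread URLs follow the pattern <boardURL>/topic/<id>; later pages hang
    # off the same path as /topic/<id>/?page=<n>.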
    # Get first page of thread
    soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT)).text, "lxml")

    # Only process the page if it isn't the site's "thread not found" placeholder
    if soup.title and soup.title.string != "We are going to be back soon":
        # The breadcrumb trail holds the board, subforum and thread names
        boardIndex = soup.select(".breadcrumb > li:nth-of-type(1) > a")[0]["title"]
        currentForum = soup.select(".breadcrumb > li:nth-of-type(2) > a")[0]["title"]
        currentThread = soup.select(".breadcrumb > li:nth-of-type(3) > a")[0]["title"]

        savePath = os.path.join(saveDir, boardIndex, currentForum, currentThread)

        # Create the folder tree for this thread if any part of it is missing
        os.makedirs(savePath, exist_ok=True)

        # Save the first page unless it's already on disk
        if not os.path.isfile(os.path.join(savePath, str(iP) + ".htm")):
            print("Accessing thread: " + currentThread)
            with open(os.path.join(savePath, str(iP) + ".htm"), "x", encoding="utf8") as file:
                file.write(soup.prettify())

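        # Note that prettify() re-indents the markup, so the saved file is a
        # cleaned copy of the page rather than a byte-for-byte mirror.
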
        # Check number of pages in thread
        if len(soup.select(".pagination")) > 0:
            pageLinks = soup.select("div.pagination ul li a")
            threadEnd = int(pageLinks[-2].string)
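        # The last entry in the pagination bar is presumably a "Next" link, so
        # the second-to-last link carries the highest page number.
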
        # Loop through the remaining pages of the thread
        while iP < threadEnd:
            iP += 1

            time.sleep(timeDelay)

            # Get next page of thread
            soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT) + "/?page=" + str(iP)).text, "lxml")

            # Save this page unless it's already on disk
            if not os.path.isfile(os.path.join(savePath, str(iP) + ".htm")):
                print(" Saving page " + str(iP) + " of " + str(threadEnd))
                with open(os.path.join(savePath, str(iP) + ".htm"), "x", encoding="utf8") as file:
                    file.write(soup.prettify())

print("Thread saved at " + savePath)
|
|
print(" ")
|
|
with open(saveDir + boardIndex + "\\" + "progress.txt", "a") as file:
|
|
file.write(str(iT) + "\r\n")
|
|
|
|
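        # progress.txt is only a log; re-running the script is safe regardless,
        # since already-saved pages are skipped by the isfile checks above.
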
    iT += 1

print("Backup operation completed.")
|