######################################################
# ARIADOS v1.0 Yuku Forum Backup                     #
# Because they ruined our forum and we want it back. #
######################################################

import os
import time

import lxml  # Not used directly, but BeautifulSoup's "lxml" parser needs it installed
import requests
from bs4 import BeautifulSoup
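# The three third-party packages above install as "requests", "beautifulsoup4"
# and "lxml"; something like "pip install requests beautifulsoup4 lxml" should
# cover a missing one.
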
# Forum URL
boardURL = "http://trsrockin.fr.yuku.com"

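# In principle any Yuku board could be pointed at here; everything below is
# derived from this base URL.
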
# Backup save location
# Backup will be saved in the following folder structure:
# Main Forum Name > Subforum Name > Thread Name > Page.htm
saveDir = "C:\\"
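# Folder names come straight from the forum's breadcrumb titles, so a title
# containing a character Windows forbids in paths (e.g. ? * : " |) would make
# the folder creation below fail. If that ever bites, a minimal (hypothetical)
# sanitizer along these lines could be applied to each name first:
#
#   import re
#   def safeName(name):
#       return re.sub(r'[<>:"/\\|?*]', "_", name)
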
# ARIADOS will attempt to back up all threads within the following range
threadFirst = 180
threadLast = 6543

# Time delay between server requests, in seconds
timeDelay = 2


# MAIN PROGRAM
print("--------------------------------")
print("ARIADOS v1.0 - Yuku Forum Backup")
print("--------------------------------")
print(" ")
print("ARIADOS will attempt to back up threads " + str(threadFirst) + " through " + str(threadLast))
print(" ")
print("--------------------------------")
print(" ")

iT = threadFirst  # Thread iterator

while iT <= threadLast:
    iP = 1         # Page iterator
    threadEnd = 1  # Assume a single page until the pagination bar says otherwise

    time.sleep(timeDelay)

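    # Thread URLs follow the pattern <boardURL>/topic/<id>; later pages hang
    # off the same path as /topic/<id>/?page=<n>.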
    # Get first page of thread
    soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT)).text, "lxml")

    # Only process the page if it isn't the site's "thread not found" placeholder
    if soup.title and soup.title.string != "We are going to be back soon":
        # The breadcrumb trail holds the board, subforum and thread names
        boardIndex = soup.select(".breadcrumb > li:nth-of-type(1) > a")[0]["title"]
        currentForum = soup.select(".breadcrumb > li:nth-of-type(2) > a")[0]["title"]
        currentThread = soup.select(".breadcrumb > li:nth-of-type(3) > a")[0]["title"]

        savePath = os.path.join(saveDir, boardIndex, currentForum, currentThread)

        # Create the folder tree for this thread if any part of it is missing
        os.makedirs(savePath, exist_ok=True)

        # Save the first page unless it's already on disk
        if not os.path.isfile(os.path.join(savePath, str(iP) + ".htm")):
            print("Accessing thread: " + currentThread)
            with open(os.path.join(savePath, str(iP) + ".htm"), "x", encoding="utf8") as file:
                file.write(soup.prettify())

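        # Note that prettify() re-indents the markup, so the saved file is a
        # cleaned copy of the page rather than a byte-for-byte mirror.
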
        # Check number of pages in thread
        if len(soup.select(".pagination")) > 0:
            pageLinks = soup.select("div.pagination ul li a")
            threadEnd = int(pageLinks[-2].string)
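        # The last entry in the pagination bar is presumably a "Next" link, so
        # the second-to-last link carries the highest page number.
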
        # Loop through the remaining pages of the thread
        while iP < threadEnd:
            iP += 1

            time.sleep(timeDelay)

            # Get next page of thread
            soup = BeautifulSoup(requests.get(boardURL + "/topic/" + str(iT) + "/?page=" + str(iP)).text, "lxml")

            # Save this page unless it's already on disk
            if not os.path.isfile(os.path.join(savePath, str(iP) + ".htm")):
                print(" Saving page " + str(iP) + " of " + str(threadEnd))
                with open(os.path.join(savePath, str(iP) + ".htm"), "x", encoding="utf8") as file:
                    file.write(soup.prettify())

print("Thread saved at " + savePath)
|
|
print(" ")
|
|
with open(saveDir + boardIndex + "\\" + "progress.txt", "a") as file:
|
|
file.write(str(iT) + "\r\n")
|
|
|
|
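        # progress.txt is only a log; re-running the script is safe regardless,
        # since already-saved pages are skipped by the isfile checks above.
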
    iT += 1

print("Backup operation completed.")
|