Ariados/smeargle.py
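"""Back up images referenced by locally saved forum pages.

Walks the saved HTML files under saveDir, pulls the URL of every <img>
tag (skipping a few unwanted hosts), and downloads each image once into
C:\\TRsRockin\\Images\\, naming the file after the MD5 hash of its URL.
"""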

import os
import lxml  # not used directly, but BeautifulSoup's "lxml" parser below requires it to be installed
import hashlib
import requests
from bs4 import BeautifulSoup

saveDir = "C:\\TRsRockin\\General-Video-Game-Discussion\\"
for root, dirs, files in os.walk(saveDir):
    for i in files:
        filePath = os.path.join(root, i)
        with open(filePath, "r", encoding='utf8') as file:
            soup = BeautifulSoup(file, "lxml")
            for j in soup.find_all('img'):
                src = j.get("src", "")
                # First, filter out some URLs we don't want
                blocked = ("quantserve", "crowdreport", "archive.org", "derpiboo.ru", "ace-attorney.net")
                if src and not any(host in src for host in blocked):
                    # Change HTTPS to HTTP if necessary
                    src = src.replace("https://", "http://")
                    # Get the filetype extension of the image, dropping any query string
                    filetypeExt = src.split(".")[-1]
                    if "?" in filetypeExt:
                        filetypeExt = filetypeExt.split("?")[0]
                    # Skip URLs with no real extension (the last dot-separated part still contains a path)
                    if "/" not in filetypeExt:
                        # Hash the URL for a unique filename
                        urlHash = hashlib.md5(bytes(src, encoding='utf8')).hexdigest()
                        savePath = "C:\\TRsRockin\\Images\\" + urlHash + "." + filetypeExt
                        # Check whether the image already exists and, if not, save it
                        if not os.path.isfile(savePath):
                            print("Saving image: " + src)
                            print(" ")
                            r = requests.get(src)
                            with open(savePath, "xb") as imgFile:
                                imgFile.write(r.content)
print("Backup operation completed.")