import os
import hashlib
import requests
from bs4 import BeautifulSoup

saveDir = "C:\\TRsRockin\\General-Video-Game-Discussion\\"

# Walk every saved HTML page in the backup directory
for root, dirs, files in os.walk(saveDir):
    for i in files:
        filePath = os.path.join(root, i)
        with open(filePath, "r", encoding='utf8') as file:
            soup = BeautifulSoup(file, "lxml")  # the lxml package must be installed for this parser
        for j in soup.find_all('img'):
            src = j.get("src", "")
            # First, filter out some URLs we don't want
            unwanted = ("quantserve", "crowdreport", "archive.org", "derpiboo.ru", "ace-attorney.net")
            if src and all(s not in src for s in unwanted):
                # Change HTTPS to HTTP if necessary
                src = src.replace("https://", "http://")
                # Get filetype extension of image, dropping any query string
                filetypeExt = src.split(".")[-1]
                if "?" in filetypeExt:
                    filetypeExt = filetypeExt.split("?")[0]
                # More error handling: skip URLs whose "extension" still contains a path separator
                if "/" not in filetypeExt:
                    # Hash URL for unique filename
                    urlHash = hashlib.md5(src.encode('utf8')).hexdigest()
                    savePath = "C:\\TRsRockin\\Images\\" + urlHash + "." + filetypeExt
                    # Check whether image exists and if not, save it
                    if not os.path.isfile(savePath):
                        print("Saving image: " + src)
                        print(" ")
                        r = requests.get(src)
                        with open(savePath, "xb") as imgFile:
                            imgFile.write(r.content)

print("Backup operation completed.")