diff --git a/smeargle.py b/smeargle.py
new file mode 100644
index 0000000..3d6ab02
--- /dev/null
+++ b/smeargle.py
@@ -0,0 +1,49 @@
+import os
+import hashlib
+
+import requests
+from bs4 import BeautifulSoup  # the "lxml" parser below still requires lxml to be installed
+
+saveDir = "C:\\TRsRockin\\General-Video-Game-Discussion\\"
+imageDir = "C:\\TRsRockin\\Images\\"
+
+# Domains whose images we don't want to back up
+skipDomains = ("quantserve", "crowdreport", "archive.org", "derpiboo.ru", "ace-attorney.net")
+
+for root, dirs, files in os.walk(saveDir):
+    for name in files:
+        filePath = os.path.join(root, name)
+
+        with open(filePath, "r", encoding="utf8") as file:
+            soup = BeautifulSoup(file, "lxml")
+
+        for img in soup.find_all("img"):
+            src = img.get("src", "")
+
+            # First, filter out empty URLs and the domains we don't want
+            if not src or any(domain in src for domain in skipDomains):
+                continue
+
+            # Change HTTPS to HTTP if necessary
+            src = src.replace("https://", "http://")
+
+            # Get the filetype extension of the image, dropping any query string
+            filetypeExt = src.split(".")[-1].split("?")[0]
+
+            # Skip URLs with no real extension (a "/" means we grabbed a path segment)
+            if "/" in filetypeExt:
+                continue
+
+            # Hash the URL for a unique filename
+            urlHash = hashlib.md5(src.encode("utf8")).hexdigest()
+            savePath = imageDir + urlHash + "." + filetypeExt
+
+            # Check whether the image was already saved, and if not, download it
+            if not os.path.isfile(savePath):
+                print("Saving image: " + src)
+                print()
+                r = requests.get(src, timeout=30)
+                with open(savePath, "xb") as out:
+                    out.write(r.content)
+
+print("Backup operation completed.")
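
One consequence of the naming scheme in this patch: the on-disk filename of any backed-up image can be recomputed from its URL, since files are named by the MD5 hex digest of the HTTP-normalized URL plus the original extension. A minimal lookup sketch follows; the local_path helper is illustrative and not part of the patch, it simply mirrors smeargle.py's normalization and naming steps:

import os
import hashlib

IMAGE_DIR = "C:\\TRsRockin\\Images\\"  # same hardcoded directory the script saves into

def local_path(url):
    # Recompute the path smeargle.py would save a given image URL to.
    url = url.replace("https://", "http://")   # mirror the HTTPS-to-HTTP downgrade
    ext = url.split(".")[-1].split("?")[0]     # last dot segment, minus any query string
    name = hashlib.md5(url.encode("utf8")).hexdigest()
    return os.path.join(IMAGE_DIR, name + "." + ext)

# e.g. check whether an image seen in the saved pages was already backed up
print(os.path.isfile(local_path("http://example.com/avatars/pic.png")))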