Ariados/smeargle.py
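"""Back up images referenced by locally saved forum pages.

Walks the saved HTML files under saveDir, pulls the URL of every <img>
tag (skipping a few unwanted hosts), and downloads each image once into
C:\\TRsRockin\\Images\\, naming the file after the MD5 hash of its URL.
"""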

import os
import lxml  # not used directly, but BeautifulSoup's "lxml" parser below requires it to be installed
import hashlib
import requests
from bs4 import BeautifulSoup

saveDir = "C:\\TRsRockin\\General-Video-Game-Discussion\\"
for root, dirs, files in os.walk(saveDir):
    for i in files:
        filePath = os.path.join(root, i)
        with open(filePath, "r", encoding='utf8') as file:
            soup = BeautifulSoup(file, "lxml")
            for j in soup.find_all('img'):
                src = j.get("src", "")
                # First, filter out some URLs we don't want
                blocked = ("quantserve", "crowdreport", "archive.org", "derpiboo.ru", "ace-attorney.net")
                if src and not any(host in src for host in blocked):
                    # Change HTTPS to HTTP if necessary
                    src = src.replace("https://", "http://")
                    # Get the filetype extension of the image, dropping any query string
                    filetypeExt = src.split(".")[-1]
                    if "?" in filetypeExt:
                        filetypeExt = filetypeExt.split("?")[0]
                    # Skip URLs with no real extension (the last dot-separated part still contains a path)
                    if "/" not in filetypeExt:
                        # Hash the URL for a unique filename
                        urlHash = hashlib.md5(bytes(src, encoding='utf8')).hexdigest()
                        savePath = "C:\\TRsRockin\\Images\\" + urlHash + "." + filetypeExt
                        # Check whether the image already exists and, if not, save it
                        if not os.path.isfile(savePath):
                            print("Saving image: " + src)
                            print(" ")
                            r = requests.get(src)
                            with open(savePath, "xb") as imgFile:
                                imgFile.write(r.content)
print("Backup operation completed.")