# (scrape metadata, preserved: 47 lines, 2.0 KiB, Python)
"""Back up every image referenced by a saved copy of the forum.

Walks the saved HTML under ``saveDir``, extracts each ``<img src=...>``,
and downloads the image into ``C:\\TRsRockin\\Images\\``.  Each image is
named by the MD5 hash of its URL so the same URL is fetched only once,
even when it appears on many pages.
"""

import os
import hashlib

import lxml  # noqa: F401 -- guarantees the "lxml" parser used below is installed
import requests
from bs4 import BeautifulSoup

from os.path import join, getsize  # noqa: F401 -- getsize kept; file may be partial

saveDir = "C:\\TRsRockin\\General-Video-Game-Discussion\\"
imageDir = "C:\\TRsRockin\\Images\\"

# URL substrings we deliberately skip (trackers / hosts we don't mirror).
skipHosts = ("quantserve", "crowdreport", "archive.org",
             "derpiboo.ru", "ace-attorney.net")

for root, dirs, files in os.walk(saveDir):
    for i in files:
        filePath = join(root, i)
        # NOTE(review): assumes every saved page is valid UTF-8 -- a file in
        # another encoding will raise UnicodeDecodeError here; confirm the
        # archive's encoding if that ever fires.
        with open(filePath, "r", encoding="utf8") as pageFile:
            soup = BeautifulSoup(pageFile, "lxml")

        for j in soup.find_all("img"):
            # <img> tags without a src attribute would raise KeyError on
            # j["src"]; .get() lets the empty-string filter below skip them.
            src = j.get("src", "")

            # First, filter out some URLs we don't want.
            if not src or any(host in src for host in skipHosts):
                continue

            # Change HTTPS to HTTP if necessary.
            if "https://" in src:
                src = src.replace("https://", "http://")
                j["src"] = src  # kept from the original (soup is not written back)

            # Get the filetype extension of the image, dropping any ?query
            # suffix (e.g. "photo.png?v=2" -> "png").
            filetypeExt = src.rsplit(".", 1)[-1].split("?")[0]

            # A "/" in the candidate extension means the URL had no real
            # file extension (e.g. "http://host/dir") -- skip those.
            if "/" in filetypeExt:
                continue

            # Hash the URL for a unique, filesystem-safe filename.
            urlHash = hashlib.md5(src.encode("utf8")).hexdigest()
            destPath = imageDir + urlHash + "." + filetypeExt

            # Check whether the image exists and if not, save it.
            if not os.path.isfile(destPath):
                print("Saving image: " + src)
                print(" ")
                try:
                    r = requests.get(src)
                except requests.RequestException as e:
                    # One dead link must not abort the whole backup run.
                    print("Failed to download " + src + ": " + str(e))
                    continue
                # "x" mode: fail loudly rather than silently overwrite if a
                # concurrent run created the file between the check and here.
                with open(destPath, "xb") as imageFile:
                    imageFile.write(r.content)

print("Backup operation completed.")
|