style and convention fixes to make pylint happy
commit 05c766011f
parent 1db0d315b8
@@ -1,3 +1,6 @@
+"""The Great Escape is a tool for scraping data from a web forum and
+exporting it into a format which can be re-imported."""
+
 import argparse
 from urllib.parse import urlparse

@@ -5,11 +8,29 @@ from . import scrapers, outputters
 from .util import sanitize_title

 def main():
+    """The Great Escape's entry point."""
     parser = argparse.ArgumentParser(description="Forum scraper")
-    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
-    parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
-    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
-    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    parser.add_argument(
+        "--scraper",
+        dest="scraper",
+        help="Scraper to use; if not specified, tries to guess"
+    )
+    parser.add_argument(
+        "--in",
+        dest="in",
+        required=True,
+        help="URL or file to scrape"
+    )
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Path to output; if not specified, is the target forum's url"
+    )
+    parser.add_argument(
+        "--outformat",
+        dest="outformat",
+        help="Format to output data out; if not specified, default (JSON-based) format is used"
+    )
     args = parser.parse_args()

     source = vars(args)['in']

@@ -22,7 +43,7 @@ def main ():
         print("Guessed scraper: {}".format(scraper.__name__))

     scraped = scraper.scrape(source)
+    print(scraped.title)
     out = args.out if args.out else sanitize_title(scraped.title)
     outformat = args.outformat if args.outformat else "json"
     print("Outputting to: {}, using {} outputter".format(out, outformat))
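Note on the --in flag: "in" is a Python keyword, so the parsed value cannot be read as args.in; that is why main() goes through vars(args)['in']. A minimal standalone sketch (the URL here is made up):

import argparse

parser = argparse.ArgumentParser(description="Forum scraper")
parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
args = parser.parse_args(["--in", "http://example.fr.yuku.com"])

# args.in is a SyntaxError ("in" is a keyword), so read the parsed value
# out of the namespace's dict instead:
source = vars(args)["in"]
print(source)  # http://example.fr.yuku.com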
tge/model.py

@@ -1,10 +1,20 @@
+"""The Great Escape model objects.
+
+Note that, depending on the forum software, terms might have different meanings.
+For example, sometimes "board" refers to the entire site and "forum" to a subsection.
+"""
+
+# pylint: disable=too-few-public-methods, too-many-arguments
+
 class Forum(object):
+    """Forum represents an entire web forum."""
     def __init__(self, title=None):
         self.title = title
         self.users = []
         self.categories = []

 class Post(object):
+    """Post represents a singular post in a thread."""
     def __init__(self, title=None, body=None, author=None, timestamp=None):
         self.title = title
         self.body = body
@@ -12,11 +22,13 @@ class Post (object):
         self.timestamp = timestamp

 class Thread(object):
+    """Thread represents a thread, or topic, in a board, on a forum."""
     def __init__(self, title=None):
         self.title = title
         self.children = []

 class User(object):
+    """User represents an individual user of a forum."""
     def __init__(self, name=None, signature=None, avatar=None, title=None, subtitle=None):
         self.name = name
         self.signature = signature
@@ -25,12 +37,15 @@ class User (object):
         self.avatar = avatar

 class Category(object):
+    """Category represents a category of boards.
+    Note however in some forum software categories are a type of board."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
         self.children = []

 class Board(object):
+    """Board represents a board which contains threads."""
     def __init__(self, title=None, description=None):
         self.title = title
         self.description = description
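Note: the model is a plain containment hierarchy, which is also why the too-few-public-methods pragma is needed. A Forum holds Categories, a Category's children are Boards, a Board's children are Threads, and a Thread's children are Posts authored by Users. A hand-built sketch with made-up data, assuming the package is importable as tge:

from tge.model import Forum, Category, Board, Thread, Post, User

forum = Forum(title="Example forum")
category = Category(title="General")
forum.categories.append(category)

board = Board(title="Chat", description="Anything goes")
category.children.append(board)

thread = Thread(title="Hello world")
board.children.append(thread)
thread.children.append(Post(body="First post!", author=User(name="alice")))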
tge/outputters/__init__.py

@@ -1,9 +1,12 @@
+"""Outputters take scraped objects and save them to a certain format."""
+
 from . import json, pickle

-outputters = [json, pickle]
+OUTPUTTERS = [json, pickle]


 def get_outputter(name):
-    for outputter in outputters:
+    """Get the outputter with the specified name."""
+    for outputter in OUTPUTTERS:
         if outputter.__name__.endswith(".{}".format(name)):
             return outputter
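Note: get_outputter matches the requested name against the tail of each module's dotted __name__, so --outformat json selects the module whose name ends in ".json". A quick illustration (module paths assume the tge.outputters package):

name = "json"
print("tge.outputters.json".endswith(".{}".format(name)))    # True
print("tge.outputters.pickle".endswith(".{}".format(name)))  # False

Unlike get_scraper below, an unknown name simply falls off the end of the loop and returns None rather than raising.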
tge/outputters/json.py

@@ -1,10 +1,13 @@
-from ..model import User, Category, Forum, Board, Post, Thread
-from ..util import sanitize_title
+"""JSON outputter."""

 import json
 import os

+from ..model import Forum, Board, Thread
+from ..util import sanitize_title
+
 def output(data, destination):
+    """Output the given object to the specified folder."""
     if isinstance(data, Forum):
         output_forum(data, destination)
     elif isinstance(data, Board):
@@ -13,17 +16,23 @@ def output (data, destination):
         output_thread(data, destination)

 def output_forum(data, destination):
+    """Output the given Forum object to the specified folder."""
     os.makedirs(destination)

     with open(os.path.join(destination, "index.json"), "w") as out_file:
         out_file.write(json.dumps({"title": data.title}, indent=4))

     for category in data.categories:
-        os.makedirs(os.path.join(destination, sanitize_title(category.title)))
+        category_dir = os.path.join(destination, sanitize_title(category.title))
+        os.makedirs(category_dir)
         for board in category.children:
-            output_board(board, os.path.join(destination, sanitize_title(category.title), sanitize_title(board.title)))
+            output_board(
+                board,
+                os.path.join(category_dir, sanitize_title(board.title))
+            )

 def output_board(data, destination):
+    """Output the given Board object to the specified folder."""
     os.makedirs(destination)
     os.makedirs(os.path.join(destination, "threads"))
     with open(os.path.join(destination, "index.json"), "w") as out_file:
@@ -36,5 +45,6 @@ def output_board (data, destination):
         output_thread(thread, os.path.join(destination, "threads", sanitize_title(thread.title)))

 def output_thread(data, destination):
+    """Output the given Thread object to the specified file."""
     with open(destination, "w") as out_file:
         out_file.write(json.dumps(data, default=vars, indent=4))
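Note: output_thread leans on json.dumps(data, default=vars). The default hook is called for any object the encoder cannot serialize natively; vars(obj) returns obj.__dict__, so the whole Thread/Post/User graph flattens to nested dicts without a custom encoder. A self-contained sketch with a stand-in class:

import json

class Post:  # stand-in for tge.model.Post
    def __init__(self, body, author):
        self.body = body
        self.author = author

# default=vars runs for each non-JSON-native object the encoder meets:
print(json.dumps(Post("hi", None), default=vars, indent=4))
# {
#     "body": "hi",
#     "author": null
# }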
tge/outputters/pickle.py

@@ -1,5 +1,9 @@
+"""Outputter based on Python's pickle module.
+The output of this outputter can be read with the pickle scraper."""
+
 import pickle

 def output(data, destination):
+    """Output the given object into the specified pickle file."""
     with open(destination, "wb") as out_file:
         pickle.dump(data, out_file)
tge/scrapers/__init__.py

@@ -1,16 +1,21 @@
+"""Scrapers accept an input located somewhere (at a URL or local file)
+and scrape them into objects, which can be dumped by an outputter."""
+
 from . import yuku, pickle

-scrapers = [yuku, pickle]
+SCRAPERS = [yuku, pickle]


 def get_scraper(name):
-    for scraper in scrapers:
+    """Get the scraper with the specified name."""
+    for scraper in SCRAPERS:
         if scraper.__name__.endswith(".{}".format(name)):
             return scraper

     raise Exception("Unknown scraper: {}".format(name))

 def guess_scraper(url):
-    for scraper in scrapers:
+    """Attempt to guess the correct scraper for handling the given path or URL."""
+    for scraper in SCRAPERS:
         if "can_scrape_url" in vars(scraper) and scraper.can_scrape_url(url):
             return scraper
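Note: guess_scraper probes each module with "can_scrape_url" in vars(scraper); vars() on a module returns its namespace dict, so only scrapers that define the hook (here, yuku) are consulted, and the pickle scraper is skipped. A roughly equivalent spelling, as a sketch with a hypothetical guess() helper:

def guess(url, modules):
    # getattr-based version of the same membership test
    for module in modules:
        can_scrape = getattr(module, "can_scrape_url", None)
        if can_scrape is not None and can_scrape(url):
            return module
    return None  # like guess_scraper, no match yields None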
tge/scrapers/pickle.py

@@ -1,5 +1,8 @@
+"""The pickle scraper reads a pickled file saved by the pickle outputter."""
+
 import pickle

 def scrape(source):
+    """Load the given pickle file into an object."""
     with open(source, "rb") as in_file:
         return pickle.load(in_file)
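Note: paired with the pickle outputter above, this gives a lossless round-trip for scraped object graphs. A sketch with made-up data, assuming the modules are importable under the tge package:

from tge.model import Forum
from tge.outputters import pickle as pickle_outputter
from tge.scrapers import pickle as pickle_scraper

forum = Forum(title="Example forum")
pickle_outputter.output(forum, "forum.pickle")       # write the object graph
forum_again = pickle_scraper.scrape("forum.pickle")  # read it back
print(forum_again.title)                             # Example forum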
tge/scrapers/yuku.py

@@ -1,16 +1,24 @@
-from ..model import User, Category, Forum, Board, Post, Thread
+"""Scraper for Yuku Forumer forums."""
+
+# pylint: disable=no-member
+
 from urllib.parse import urlparse
 from time import strptime, mktime

 import dateutil.parser
 from pyquery import PyQuery as pq
 from retrying import retry

-time_format = "%b %d %y %I:%M %p"
+from ..model import User, Category, Forum, Board, Post, Thread
+
+TIME_FORMAT = "%b %d %y %I:%M %p"

 def can_scrape_url(url):
+    """Returns true if this url can be scraped by this scraper."""
     return ".fr.yuku.com" in url

 def scrape(url):
+    """Scrapes the URL into an object."""
     path = urlparse(url).path
     if path.startswith("/topic/"):
         return scrape_thread_from_url(url)
@@ -21,17 +29,19 @@ def scrape (url):

 @retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
 def get_document(url):
+    """Returns a pyquery document for the specified url, retrying if necessary."""
     return pq(url=url)

 def get_paged_document(url):
+    """Returns a generator that yields all pages of the specified url."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

     while True:
-        d = get_document(url=url)
-        yield d
+        doc = get_document(url=url)
+        yield doc

-        nextlink = d("a[accesskey=n]")
+        nextlink = doc("a[accesskey=n]")
         if not nextlink:
             break

@@ -39,16 +49,18 @@ def get_paged_document (url):
         print(" --> Following next page link to: {}".format(url))

 def scrape_index(url):
+    """Scrapes the forum index at url into a Forum object."""
     print("Scraping forum index from url: {}".format(url))
     urlparts = urlparse(url)

-    d = get_document(url=url)
-    forum = Forum(title=d("title").text())
-    for category_element in d("div.span9 > div.row-fluid").items():
+    doc = get_document(url=url)
+    forum = Forum(title=doc("title").text())
+    for category_element in doc("div.span9 > div.row-fluid").items():
         category = Category(title=category_element.find("h3").text())
         forum.categories.append(category)
         for board_link in category_element.find("a[href^='/forums/']").items():
-            board = scrape_board_from_url("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href))
+            full_url = "{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attr.href)
+            board = scrape_board_from_url(full_url)
             board.description = board_link.closest("div").find("p").eq(0).text()
             category.children.append(board)
         print("Finished scraping all boards in category: {}".format(category.title))
@@ -56,24 +68,26 @@ def scrape_index (url):
     return forum

 def scrape_board_from_url(url):
+    """Scrapes the board index at url into a Board object."""
     print("Scraping board from url: {}".format(url))

     board = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not board:
-            board = scrape_board_from_document(url, d)
+            board = scrape_board_from_document(url, doc)
         else:
-            board.children = board.children + scrape_board_from_document(url, d).children
+            board.children = board.children + scrape_board_from_document(url, doc).children

     print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
     return board

-def scrape_board_from_document (url, d):
+def scrape_board_from_document(url, doc):
+    """Scrapes the given document into a Board object."""
     urlparts = urlparse(url)
     baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

-    board = Board(title=d("h1").text())
-    for thread_link in d("a[href^='/topic/']").items():
+    board = Board(title=doc("h1").text())
+    for thread_link in doc("a[href^='/topic/']").items():
         if thread_link.closest(".topic-pager"):
             continue
         thread = scrape_thread_from_url("{}{}".format(baseurl, thread_link.attr.href))
@@ -82,21 +96,23 @@ def scrape_board_from_document (url, d):
     return board

 def scrape_thread_from_url(url):
+    """Scrapes the given thread url into a Thread object."""
     print("Scraping thread from url: {}".format(url))

     thread = None
-    for d in get_paged_document(url):
+    for doc in get_paged_document(url):
         if not thread:
-            thread = scrape_thread_from_document(d)
+            thread = scrape_thread_from_document(doc)
         else:
-            thread.children = thread.children + scrape_thread_from_document(d).children
+            thread.children = thread.children + scrape_thread_from_document(doc).children

     print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
     return thread

-def scrape_thread_from_document (d):
-    thread = Thread(title=d("h2").eq(0).text())
-    for post_entry in d("article.post-entry").items():
+def scrape_thread_from_document(doc):
+    """Scrapes the given document into a Thread object."""
+    thread = Thread(title=doc("h2").eq(0).text())
+    for post_entry in doc("article.post-entry").items():
         # 26 November 2016: Yuku's broken HTML is breaking this parsing logic
         # <article>'s aren't being closed correctly so each selector actually
         # returns the rest of the thread's contents instead of just that post.
@@ -116,7 +132,7 @@ def scrape_thread_from_document (d):
         if date_element.find("time"):
             timestamp = dateutil.parser.parse(date_element.text()).timestamp()
         else:
-            timestamp = mktime(strptime(date_element.text(), time_format))
+            timestamp = mktime(strptime(date_element.text(), TIME_FORMAT))

         thread.children.append(Post(
             author=User(
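Note: timestamps take two paths; posts with a <time> element go through dateutil.parser.parse, everything else through strptime with TIME_FORMAT. The format string matches strings like the made-up example here:

from time import strptime, mktime

TIME_FORMAT = "%b %d %y %I:%M %p"  # abbreviated month, day, 2-digit year, 12-hour clock
timestamp = mktime(strptime("Nov 26 16 09:41 PM", TIME_FORMAT))
print(timestamp)  # seconds since the epoch, interpreted as local time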
tge/util.py

@@ -1,6 +1,9 @@
-characters_to_replace = ["/", ":", " ", "?", "!", "&", ",", "'", '""']
+"""Utility functions."""
+
+CHARACTERS_TO_REPLACE = ["/", ":", " ", "?", "!", "&", ",", "'", '""']

 def sanitize_title(title):
-    for character in characters_to_replace:
+    """Sanitizes the given title by removing certain characters."""
+    for character in CHARACTERS_TO_REPLACE:
         title = title.replace(character, "-")
     return title
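Note: despite the docstring's "removing", sanitize_title substitutes "-" for each listed character, which is what keeps titles usable as file and directory names in the JSON outputter. For example, assuming the package imports as tge:

from tge.util import sanitize_title

print(sanitize_title("Hello, world!"))  # Hello--world-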