diff --git a/README.md b/README.md
index e69de29..481c281 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,8 @@
+# The Great Escape
+## Install
+    virtualenv -p python3 venv
+    . venv/bin/activate
+    pip install -e .
+
+## Usage
+    tge --url <forum-url> [--scraper yuku] [--out <path>] [--outformat json]
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7509db4
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+# setuptools (not distutils) is required here: distutils.core.setup
+# silently ignores install_requires and entry_points.
+from setuptools import setup
+
+setup(
+    name='The Great Escape',
+    version='0.0.1',
+    description='A forum scraping and migration tool.',
+    author='Adrian Malacoda',
+    # tge.outputters must be listed too, or installed copies fail on
+    # "from . import scrapers, outputters" in tge/__init__.py.
+    packages=['tge', 'tge.scrapers', 'tge.outputters'],
+    install_requires=['pyquery'],
+    entry_points={
+        'console_scripts': [
+            'tge = tge:main'
+        ]
+    }
+)
diff --git a/tge/__init__.py b/tge/__init__.py
new file mode 100644
index 0000000..f315556
--- /dev/null
+++ b/tge/__init__.py
@@ -0,0 +1,37 @@
+"""Command-line entry point for The Great Escape forum scraper."""
+
+import argparse
+from urllib.parse import urlparse
+
+from . import scrapers, outputters
+
+def main ():
+    """Parse CLI arguments, scrape the target forum and write the output."""
+    parser = argparse.ArgumentParser(description="Forum scraper")
+    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
+    parser.add_argument("--url", dest="url", required=True, help="URL to scrape")
+    parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
+    parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    args = parser.parse_args()
+
+    # Normalize bare host names ("example.yuku.com") to full http URLs.
+    url = args.url
+    if not (url.startswith("https://") or url.startswith("http://")):
+        url = "http://{}".format(url)
+
+    print("Target URL is: {}".format(url))
+    if args.scraper:
+        scraper = scrapers.get_scraper(args.scraper)
+        print("Using scraper: {}".format(scraper.__name__))
+    else:
+        scraper = scrapers.guess_scraper(url)
+        print("Guessed scraper: {}".format(scraper.__name__))
+
+    scraped = scraper.scrape(url)
+
+    # Default the output directory to the forum's host name.
+    out = args.out if args.out else urlparse(url).netloc
+    outformat = args.outformat if args.outformat else "json"
+    print("Outputting to: {}, using {} outputter".format(out, outformat))
+    outputter = outputters.get_outputter(outformat)
+    outputter.output(scraped, out)
diff --git a/tge/model.py b/tge/model.py
new file mode 100644
index 0000000..68ec27a
--- /dev/null
+++ b/tge/model.py
@@ -0,0 +1,35 @@
+"""Data model shared by scrapers (producers) and outputters (consumers)."""
+
+class Forum (object):
+    def __init__ (self, title=None):
+        self.title = title
+        self.users = []
+        self.categories = []
+
+class Post (object):
+    def __init__ (self, title=None, body=None, author=None):
+        self.title = title
+        self.body = body
+        self.author = author
+
+class Thread (object):
+    def __init__ (self, title=None):
+        self.title = title
+        self.children = []
+
+class User (object):
+    def __init__ (self, name=None, signature=None):
+        self.name = name
+        self.signature = signature
+
+class Category (object):
+    def __init__ (self, title=None, description=None):
+        self.title = title
+        self.description = description
+        self.children = []
+
+class Board (object):
+    def __init__ (self, title=None, description=None):
+        self.title = title
+        self.description = description
+        self.children = []
diff --git a/tge/outputters/__init__.py b/tge/outputters/__init__.py
new file mode 100644
index 0000000..d8cb556
--- /dev/null
+++ b/tge/outputters/__init__.py
@@ -0,0 +1,13 @@
+"""Registry of output modules; each must expose output(data, destination)."""
+
+from . import json
+
+outputters = [json]
+
+def get_outputter (name):
+    """Return the outputter module whose name matches, e.g. "json"."""
+    for outputter in outputters:
+        if outputter.__name__.endswith(".{}".format(name)):
+            return outputter
+
+    raise Exception("Unknown outputter: {}".format(name))
diff --git a/tge/outputters/json.py b/tge/outputters/json.py
new file mode 100644
index 0000000..1cabd6b
--- /dev/null
+++ b/tge/outputters/json.py
@@ -0,0 +1,34 @@
+"""Write scraped forum data as a tree of JSON files on disk."""
+
+from ..model import User, Category, Forum, Board, Post, Thread
+
+import json
+import os
+
+def output (data, destination):
+    """Dispatch on the scraped root type; raise on unsupported types."""
+    if isinstance(data, Forum):
+        output_forum(data, destination)
+    elif isinstance(data, Board):
+        output_board(data, destination)
+    elif isinstance(data, Thread):
+        output_thread(data, destination)
+    else:
+        raise Exception("Cannot output data of type: {}".format(type(data).__name__))
+
+def output_forum (data, destination):
+    os.makedirs(destination, exist_ok=True)
+    for category in data.categories:
+        os.makedirs(os.path.join(destination, category.title), exist_ok=True)
+        for board in category.children:
+            output_board(board, os.path.join(destination, category.title, board.title))
+
+def output_board (data, destination):
+    os.makedirs(destination, exist_ok=True)
+    for thread in data.children:
+        output_thread(thread, os.path.join(destination, thread.title))
+
+def output_thread (data, destination):
+    # vars() serializes the model objects' attribute dicts recursively.
+    with open(destination, "w") as out_file:
+        out_file.write(json.dumps(data, default=vars, indent=4))
diff --git a/tge/scrapers/__init__.py b/tge/scrapers/__init__.py
new file mode 100644
index 0000000..7807863
--- /dev/null
+++ b/tge/scrapers/__init__.py
@@ -0,0 +1,21 @@
+"""Registry of scrapers; each exposes scrape(url) and can_scrape_url(url)."""
+
+from . import yuku
+
+scrapers = [yuku]
+
+def get_scraper (name):
+    """Return the scraper module whose name matches, e.g. "yuku"."""
+    for scraper in scrapers:
+        if scraper.__name__.endswith(".{}".format(name)):
+            return scraper
+
+    raise Exception("Unknown scraper: {}".format(name))
+
+def guess_scraper (url):
+    """Return the first scraper that claims it can handle url."""
+    for scraper in scrapers:
+        if scraper.can_scrape_url(url):
+            return scraper
+
+    raise Exception("Unable to guess scraper for forum url: {}".format(url))
diff --git a/tge/scrapers/yuku.py b/tge/scrapers/yuku.py
new file mode 100644
index 0000000..d1de921
--- /dev/null
+++ b/tge/scrapers/yuku.py
@@ -0,0 +1,82 @@
+"""Scraper for Yuku-hosted forums."""
+
+from ..model import User, Category, Forum, Board, Post, Thread
+from urllib.parse import urlparse
+from pyquery import PyQuery as pq
+
+def can_scrape_url (url):
+    return ".yuku.com" in url
+
+def scrape (url):
+    """Scrape a forum index, board or thread, depending on the URL's path."""
+    path = urlparse(url).path
+    if path.startswith("/topic/"):
+        return scrape_thread(url)
+    elif path.startswith("/forums/"):
+        return scrape_board(url)
+    elif (not path) or path == "/":
+        return scrape_index(url)
+    else:
+        raise Exception("Unrecognized yuku url: {}".format(url))
+
+def scrape_index (url):
+    print("Scraping forum index from url: {}".format(url))
+    urlparts = urlparse(url)
+
+    d = pq(url=url)
+    forum = Forum(title=d("title").text())
+    for category_element in d("div.span9 > div.row-fluid"):
+        category = Category(title=category_element.find("h3").text)
+        forum.categories.append(category)
+        for board_link in pq(category_element)("a[href^='/forums/']"):
+            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attrib['href']))
+            category.children.append(board)
+        print("Finished scraping all boards in category: {}".format(category.title))
+
+    return forum
+
+def scrape_board (url):
+    print("Scraping board from url: {}".format(url))
+    urlparts = urlparse(url)
+    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
+
+    d = pq(url=url)
+    board = Board(title=d("h1").text())
+    for thread_link in d("a[href^='/topic/']"):
+        # Pager links also match the /topic/ selector; skip them.
+        if d(thread_link).closest(".topic-pager"):
+            continue
+        thread = scrape_thread("{}{}".format(baseurl, thread_link.attrib['href']))
+        board.children.append(thread)
+
+    # Recurse into the next page, if any (accesskey=n marks the "next" link).
+    nextlink = d("a[accesskey=n]")
+    if nextlink:
+        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children
+
+    if not urlparts.query.startswith("page="):
+        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
+
+    return board
+
+def scrape_thread (url):
+    print("Scraping thread from url: {}".format(url))
+    urlparts = urlparse(url)
+    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)
+
+    d = pq(url=url)
+    thread = Thread(title=d("h2").text())
+    for post_entry in d("article.post-entry"):
+        thread.children.append(Post(
+            author=pq(post_entry)("header > p > a").text(),
+            body=pq(post_entry)(".post-content-container").text()
+        ))
+
+    nextlink = d("a[accesskey=n]")
+    if nextlink:
+        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children
+
+    if not urlparts.query.startswith("page="):
+        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
+
+    return thread