initial commit for the-great-escape yuku scraper

2016-11-26 23:09:12 -06:00
parent e5fb7e5c9a
commit 933e178ce5
8 changed files with 219 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,7 @@
+# The Great Escape
+## Install
+    virtualenv -p python3 venv
+    . venv/bin/activate
+    pip install -e .
+
+## Usage
+    tge --url <forum url> [--scraper <name>] [--out <path>] [--outformat <format>]
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+"""Packaging script for The Great Escape, a forum scraping and migration tool."""
+
+# NOTE(review): `install_requires` and `entry_points` are setuptools keywords;
+# plain distutils is not documented to support them -- confirm, and consider
+# importing `setup` from setuptools instead.
+from distutils.core import setup
+
+setup(
+    name='The Great Escape',
+    version='0.0.1',
+    description='A forum scraping and migration tool.',
+    author='Adrian Malacoda',
+    # Subpackages must be listed explicitly; `tge` imports both
+    # `tge.scrapers` and `tge.outputters` as soon as it is loaded.
+    packages=['tge', 'tge.scrapers', 'tge.outputters'],
+    install_requires=['pyquery'],
+    entry_points={
+        'console_scripts': [
+            # Exposes a `tge` command that calls `main` from the `tge` package.
+            'tge = tge:main'
+        ]
+    }
+)
--- a/tge/init.py
+++ b/tge/init.py
@@ -0,0 +1,32 @@
+import argparse
+from urllib.parse import urlparse
+
+from . import scrapers, outputters
+
+def main():
+    """Command-line entry point: scrape a forum and hand the result to an outputter.
+
+    The scraper is the one named by ``--scraper`` or, failing that, the one
+    guessed from the URL. The output path defaults to the URL's network
+    location and the output format defaults to "json".
+    """
+    arg_parser = argparse.ArgumentParser(description="Forum scraper")
+    arg_parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
+    arg_parser.add_argument("--url", dest="url", required=True, help="URL to scrape")
+    arg_parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
+    arg_parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
+    options = arg_parser.parse_args()
+
+    # A URL given without a scheme is treated as plain http.
+    target = options.url
+    if not target.startswith(("https://", "http://")):
+        target = "http://{}".format(target)
+    print("Target URL is: {}".format(target))
+
+    if options.scraper:
+        scraper = scrapers.get_scraper(options.scraper)
+        announcement = "Using scraper: {}"
+    else:
+        scraper = scrapers.guess_scraper(target)
+        announcement = "Guessed scraper: {}"
+    print(announcement.format(scraper.__name__))
+
+    scraped = scraper.scrape(target)
+
+    # Fall back to the forum's network location and to the JSON outputter.
+    destination = options.out or urlparse(target).netloc
+    format_name = options.outformat or "json"
+    print("Outputting to: {}, using {} outputter".format(destination, format_name))
+    outputters.get_outputter(format_name).output(scraped, destination)
--- a/tge/model.py
+++ b/tge/model.py
@@ -0,0 +1,33 @@
+class Forum:
+    """Top-level container for a scraped forum.
+
+    ``users`` and ``categories`` always start out as fresh, empty lists.
+    """
+
+    def __init__(self, title=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.title, self.users, self.categories = title, [], []
+
+class Post:
+    """A single post: optional title, body text and author."""
+
+    def __init__(self, title=None, body=None, author=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.title, self.body, self.author = title, body, author
+
+class Thread:
+    """A discussion thread: a title plus a ``children`` list that starts empty."""
+
+    def __init__(self, title=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.title, self.children = title, []
+
+class User:
+    """A forum user: optional name and signature."""
+
+    def __init__(self, name=None, signature=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.name, self.signature = name, signature
+
+class Category:
+    """A forum category: title, description and a ``children`` list that starts empty."""
+
+    def __init__(self, title=None, description=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.title, self.description, self.children = title, description, []
+
+class Board:
+    """A board: title, description and a ``children`` list that starts empty."""
+
+    def __init__(self, title=None, description=None):
+        # Assignment order is kept stable because it determines the key
+        # order of ``vars(self)``.
+        self.title, self.description, self.children = title, description, []
--- a/tge/outputters/init.py
+++ b/tge/outputters/init.py
@@ -0,0 +1,10 @@
+from . import json
+
+# Registry of available outputter modules; get_outputter() looks a module up
+# here by the last component of its dotted name.
+outputters = [json]
+
+def get_outputter(name):
+    """Return the registered outputter module whose dotted name ends in ``.<name>``.
+
+    Raises:
+        Exception: if no registered outputter matches ``name``.
+    """
+    suffix = ".{}".format(name)
+    found = next((module for module in outputters if module.__name__.endswith(suffix)), None)
+    if found is None:
+        raise Exception("Unknown outputter: {}".format(name))
+    return found
--- a/tge/outputters/json.py
+++ b/tge/outputters/json.py
@@ -0,0 +1,28 @@
+from ..model import User, Category, Forum, Board, Post, Thread
+
+import json
+import os
+
+def output(data, destination):
+    """Write scraped ``data`` to ``destination`` using the writer for its model type.
+
+    Forum, Board and Thread instances are handled; any other value is
+    ignored without an error.
+    """
+    writers = (
+        (Forum, output_forum),
+        (Board, output_board),
+        (Thread, output_thread),
+    )
+    for model_type, writer in writers:
+        if isinstance(data, model_type):
+            writer(data, destination)
+            return
+
+def output_forum(data, destination):
+    """Write a forum as a directory tree: one directory per category, one per board.
+
+    Args:
+        data: object whose ``categories`` each have a ``title`` and a
+            ``children`` list of boards (each board has a ``title``).
+        destination: root directory to create (if needed) and fill.
+    """
+    # Tolerate existing directories so a re-run, or an output path that was
+    # created beforehand, does not abort with FileExistsError.
+    os.makedirs(destination, exist_ok=True)
+    for category in data.categories:
+        # A path separator inside a title would silently add directory
+        # levels, so it is flattened to an underscore.
+        category_dir = os.path.join(destination, category.title.replace(os.sep, "_"))
+        os.makedirs(category_dir, exist_ok=True)
+        for board in category.children:
+            board_dir = os.path.join(category_dir, board.title.replace(os.sep, "_"))
+            output_board(board, board_dir)
+
+def output_board(data, destination):
+    """Write a board as a directory holding one file per thread.
+
+    Args:
+        data: object whose ``children`` are threads, each with a ``title``.
+        destination: directory to create (if needed) and fill.
+    """
+    # Tolerate an existing directory so a re-run, or an output path that was
+    # created beforehand, does not abort with FileExistsError.
+    os.makedirs(destination, exist_ok=True)
+    for thread in data.children:
+        # A path separator inside a title would point into a directory that
+        # does not exist and make open() fail, so it is flattened.
+        file_name = thread.title.replace(os.sep, "_")
+        output_thread(thread, os.path.join(destination, file_name))
+
+def output_thread(data, destination):
+    """Serialise ``data`` as 4-space-indented JSON into the file ``destination``.
+
+    Objects that json cannot encode natively are replaced by their
+    ``vars()`` dictionary.
+    """
+    with open(destination, "w") as handle:
+        json.dump(data, handle, default=vars, indent=4)
--- a/tge/scrapers/init.py
+++ b/tge/scrapers/init.py
@@ -0,0 +1,17 @@
+from . import yuku
+
+# Registry of available scraper modules; get_scraper() looks a module up here
+# by name and guess_scraper() asks each one whether it can handle a URL.
+scrapers = [yuku]
+
+def get_scraper(name):
+    """Return the registered scraper module whose dotted name ends in ``.<name>``.
+
+    Raises:
+        Exception: if no registered scraper matches ``name``.
+    """
+    suffix = ".{}".format(name)
+    found = next((module for module in scrapers if module.__name__.endswith(suffix)), None)
+    if found is None:
+        raise Exception("Unknown scraper: {}".format(name))
+    return found
+
+def guess_scraper(url):
+    """Return the first registered scraper module whose ``can_scrape_url(url)`` is truthy.
+
+    Raises:
+        Exception: if no registered scraper accepts ``url``.
+    """
+    willing = (module for module in scrapers if module.can_scrape_url(url))
+    chosen = next(willing, None)
+    if chosen is None:
+        raise Exception("Unable to guess scraper for forum url: {}".format(url))
+    return chosen
--- a/tge/scrapers/yuku.py
+++ b/tge/scrapers/yuku.py
@@ -0,0 +1,75 @@
+from ..model import User, Category, Forum, Board, Post, Thread
+from urllib.parse import urlparse
+from pyquery import PyQuery as pq
+
+def can_scrape_url(url):
+    """Return True when the host of ``url`` is a subdomain of yuku.com.
+
+    Only the host name is examined, so ".yuku.com" appearing in the path
+    or query string of some other site does not count. URLs given without
+    a scheme (e.g. "example.yuku.com/forums/1") are accepted as well.
+    """
+    try:
+        parts = urlparse(url)
+        if not parts.netloc:
+            # Without a scheme urlparse files the host under the path;
+            # a leading "//" makes it parse as a network location.
+            parts = urlparse("//{}".format(url))
+        host = parts.hostname or ""
+    except ValueError:
+        # urlparse rejects malformed network locations; such a URL is not
+        # one this scraper can handle.
+        return False
+    # A fully-qualified name may carry a trailing dot.
+    return host.rstrip(".").endswith(".yuku.com")
+
+def scrape(url):
+    """Scrape ``url``, choosing the scraper from the URL's path.
+
+    An empty path or "/" is scraped as the forum index, paths under
+    "/topic/" as a thread and paths under "/forums/" as a board. Any
+    other path yields None.
+    """
+    path = urlparse(url).path
+    if path in ("", "/"):
+        return scrape_index(url)
+    if path.startswith("/topic/"):
+        return scrape_thread(url)
+    if path.startswith("/forums/"):
+        return scrape_board(url)
+    return None
+
+def scrape_index(url):
+    """Scrape the forum index at ``url`` and every board it links to.
+
+    Returns a Forum titled after the page's <title>, with one Category per
+    matched row; each category's children are the boards scraped from the
+    "/forums/" links found inside that row.
+    """
+    print("Scraping forum index from url: {}".format(url))
+    parsed = urlparse(url)
+    site_root = "{}://{}".format(parsed.scheme, parsed.netloc)
+
+    page = pq(url=url)
+    forum = Forum(title=page("title").text())
+    for section in page("div.span9 > div.row-fluid"):
+        category = Category(title=section.find("h3").text)
+        forum.categories.append(category)
+        # Board links are site-relative, so they are joined onto the site root.
+        board_links = pq(section)("a[href^='/forums/']")
+        for link in board_links:
+            board_url = "{}{}".format(site_root, link.attrib['href'])
+            category.children.append(scrape_board(board_url))
+        print("Finished scraping all boards in category: {}".format(category.title))
+
+    return forum
+
+def scrape_board(url):
+    """Scrape the board page at ``url``, its threads and all following pages.
+
+    Returns a Board titled after the page's <h1> whose children are the
+    threads scraped from this page followed by those of every later page.
+    """
+    print("Scraping board from url: {}".format(url))
+    parsed = urlparse(url)
+    site_root = "{}://{}".format(parsed.scheme, parsed.netloc)
+
+    page = pq(url=url)
+    board = Board(title=page("h1").text())
+    for link in page("a[href^='/topic/']"):
+        # Topic links inside a ".topic-pager" element are skipped
+        # (presumably they point at later pages of a listed thread).
+        if not page(link).closest(".topic-pager"):
+            thread_url = "{}{}".format(site_root, link.attrib['href'])
+            board.children.append(scrape_thread(thread_url))
+
+    # The link carrying accesskey "n" is followed as the next page.
+    next_page = page("a[accesskey=n]")
+    if next_page:
+        remainder = scrape_board("{}{}".format(site_root, next_page.attr.href))
+        board.children.extend(remainder.children)
+
+    # Only the call whose query does not start with "page=" prints the summary.
+    on_later_page = parsed.query.startswith("page=")
+    if not on_later_page:
+        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))
+
+    return board
+
+def scrape_thread(url):
+    """Scrape the thread page at ``url`` and all following pages.
+
+    Returns a Thread titled after the page's <h2> whose children are the
+    posts found on this page followed by those of every later page.
+    """
+    print("Scraping thread from url: {}".format(url))
+    parsed = urlparse(url)
+    site_root = "{}://{}".format(parsed.scheme, parsed.netloc)
+
+    page = pq(url=url)
+    thread = Thread(title=page("h2").text())
+    for entry in page("article.post-entry"):
+        post_node = pq(entry)
+        thread.children.append(Post(
+            author=post_node("header > p > a").text(),
+            body=post_node(".post-content-container").text()
+        ))
+
+    # The link carrying accesskey "n" is followed as the next page.
+    next_page = page("a[accesskey=n]")
+    if next_page:
+        remainder = scrape_thread("{}{}".format(site_root, next_page.attr.href))
+        thread.children.extend(remainder.children)
+
+    # Only the call whose query does not start with "page=" prints the summary.
+    on_later_page = parsed.query.startswith("page=")
+    if not on_later_page:
+        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))
+
+    return thread