initial commit for the-great-escape yuku scraper
parent e5fb7e5c9a
commit 933e178ce5
README.md (new file, 7 lines)
@@ -0,0 +1,7 @@
# The Great Escape
## Install
virtualenv -p python3 venv
. venv/bin/activate
pip install -e .

##
setup.py (new file, 17 lines)
@@ -0,0 +1,17 @@
#!/usr/bin/env python

from setuptools import setup  # setuptools is needed for install_requires/entry_points

setup(
    name='The Great Escape',
    version='0.0.1',
    description='A forum scraping and migration tool.',
    author='Adrian Malacoda',
    packages=['tge', 'tge.scrapers', 'tge.outputters'],
    install_requires=['pyquery'],
    entry_points={
        'console_scripts': [
            'tge = tge:main'
        ]
    }
)
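The `console_scripts` entry point wires a `tge` command to `main()` in `tge/__init__.py`. A minimal sketch of what the generated wrapper does (the actual script is emitted by setuptools/pip at install time, not part of this commit):

```python
# Sketch of the wrapper generated for the entry point 'tge = tge:main'.
import sys
from tge import main

if __name__ == "__main__":
    sys.exit(main())
```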
tge/__init__.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import argparse
from urllib.parse import urlparse

from . import scrapers, outputters

def main ():
    parser = argparse.ArgumentParser(description="Forum scraper")
    parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
    parser.add_argument("--url", dest="url", required=True, help="URL to scrape")
    parser.add_argument("--out", dest="out", help="Path to output; if not specified, the target forum's hostname is used")
    parser.add_argument("--outformat", dest="outformat", help="Output format; if not specified, the default (JSON-based) format is used")
    args = parser.parse_args()

    url = args.url
    if not (url.startswith("https://") or url.startswith("http://")):
        url = "http://{}".format(url)

    print("Target URL is: {}".format(url))
    if args.scraper:
        scraper = scrapers.get_scraper(args.scraper)
        print("Using scraper: {}".format(scraper.__name__))
    else:
        scraper = scrapers.guess_scraper(url)
        print("Guessed scraper: {}".format(scraper.__name__))

    scraped = scraper.scrape(url)

    out = args.out if args.out else urlparse(url).netloc
    outformat = args.outformat if args.outformat else "json"
    print("Outputting to: {}, using {} outputter".format(out, outformat))
    outputter = outputters.get_outputter(outformat)
    outputter.output(scraped, out)
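The command line maps directly onto `main()`'s argparse flags. A hypothetical invocation (the forum host and output path are made up for illustration), shown here by driving `main()` programmatically:

```python
# Equivalent of: tge --url somefc.yuku.com --out somefc-backup
# "somefc.yuku.com" and "somefc-backup" are hypothetical; this would
# fetch pages over the network.
import sys
from tge import main

sys.argv = ["tge", "--url", "somefc.yuku.com", "--out", "somefc-backup"]
main()  # scheme defaults to http://, outputter defaults to json
```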
tge/model.py (new file, 33 lines)
@@ -0,0 +1,33 @@
class Forum (object):
    def __init__ (self, title=None):
        self.title = title
        self.users = []
        self.categories = []

class Post (object):
    def __init__ (self, title=None, body=None, author=None):
        self.title = title
        self.body = body
        self.author = author

class Thread (object):
    def __init__ (self, title=None):
        self.title = title
        self.children = []

class User (object):
    def __init__ (self, name=None, signature=None):
        self.name = name
        self.signature = signature

class Category (object):
    def __init__ (self, title=None, description=None):
        self.title = title
        self.description = description
        self.children = []

class Board (object):
    def __init__ (self, title=None, description=None):
        self.title = title
        self.description = description
        self.children = []
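The model is a plain object tree: a `Forum` holds `Category` objects in `categories`, and each level below links through `children` down to `Post`. A sketch with hypothetical data:

```python
from tge.model import Forum, Category, Board, Thread, Post

# Hypothetical data illustrating how the scraper assembles the tree.
forum = Forum(title="Example Forum")
category = Category(title="General")
board = Board(title="Chat")
thread = Thread(title="Hello world")
thread.children.append(Post(author="alice", body="First!"))
board.children.append(thread)
category.children.append(board)
forum.categories.append(category)
```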
tge/outputters/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from . import json

outputters = [json]

def get_outputter (name):
    for outputter in outputters:
        if outputter.__name__.endswith(".{}".format(name)):
            return outputter

    raise Exception("Unknown outputter: {}".format(name))
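`get_outputter` resolves a format name by suffix-matching module names in the registry, so `"json"` matches `tge.outputters.json`:

```python
from tge import outputters

outputter = outputters.get_outputter("json")  # resolves to tge.outputters.json
# An unknown name (e.g. "xml") raises Exception("Unknown outputter: xml").
```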
tge/outputters/json.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from ..model import Forum, Board, Thread

import json
import os

def output (data, destination):
    if isinstance(data, Forum):
        output_forum(data, destination)
    elif isinstance(data, Board):
        output_board(data, destination)
    elif isinstance(data, Thread):
        output_thread(data, destination)

def output_forum (data, destination):
    os.makedirs(destination, exist_ok=True)
    for category in data.categories:
        os.makedirs(os.path.join(destination, category.title), exist_ok=True)
        for board in category.children:
            output_board(board, os.path.join(destination, category.title, board.title))

def output_board (data, destination):
    os.makedirs(destination, exist_ok=True)
    for thread in data.children:
        output_thread(thread, os.path.join(destination, thread.title))

def output_thread (data, destination):
    with open(destination, "w") as out_file:
        out_file.write(json.dumps(data, default=vars, indent=4))
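`output_thread` serializes the whole object graph by passing `vars` as the JSON fallback, so every model object is dumped as its `__dict__`. A minimal sketch with hypothetical data:

```python
from tge.model import Thread, Post
from tge.outputters import json as json_outputter

# Hypothetical thread; output() dispatches on type, so a Thread goes
# through output_thread and is written as one pretty-printed JSON file.
thread = Thread(title="Hello world")
thread.children.append(Post(author="alice", body="First!"))
json_outputter.output(thread, "hello-world")
```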
tge/scrapers/__init__.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from . import yuku

scrapers = [yuku]

def get_scraper (name):
    for scraper in scrapers:
        if scraper.__name__.endswith(".{}".format(name)):
            return scraper

    raise Exception("Unknown scraper: {}".format(name))

def guess_scraper (url):
    for scraper in scrapers:
        if scraper.can_scrape_url(url):
            return scraper

    raise Exception("Unable to guess scraper for forum url: {}".format(url))
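Unlike `get_scraper`, which matches on module name, `guess_scraper` asks each registered scraper module whether it recognizes the URL:

```python
from tge import scrapers

# A hypothetical host; yuku.can_scrape_url matches on ".yuku.com".
scraper = scrapers.guess_scraper("http://somefc.yuku.com/")
print(scraper.__name__)  # tge.scrapers.yuku
```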
tge/scrapers/yuku.py (new file, 75 lines)
@@ -0,0 +1,75 @@
from ..model import Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from pyquery import PyQuery as pq

def can_scrape_url (url):
    return ".yuku.com" in url

def scrape (url):
    path = urlparse(url).path
    if path.startswith("/topic/"):
        return scrape_thread(url)
    elif path.startswith("/forums/"):
        return scrape_board(url)
    elif (not path) or path == "/":
        return scrape_index(url)

def scrape_index (url):
    print("Scraping forum index from url: {}".format(url))
    urlparts = urlparse(url)

    d = pq(url=url)
    forum = Forum(title=d("title").text())
    for category_element in d("div.span9 > div.row-fluid"):
        category = Category(title=category_element.find("h3").text)
        forum.categories.append(category)
        for board_link in pq(category_element)("a[href^='/forums/']"):
            board = scrape_board("{}://{}{}".format(urlparts.scheme, urlparts.netloc, board_link.attrib['href']))
            category.children.append(board)
        print("Finished scraping all boards in category: {}".format(category.title))

    return forum

def scrape_board (url):
    print("Scraping board from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    board = Board(title=d("h1").text())
    for thread_link in d("a[href^='/topic/']"):
        if d(thread_link).closest(".topic-pager"):
            continue
        thread = scrape_thread("{}{}".format(baseurl, thread_link.attrib['href']))
        board.children.append(thread)

    nextlink = d("a[accesskey=n]")
    if nextlink:
        board.children = board.children + scrape_board("{}{}".format(baseurl, nextlink.attr.href)).children

    if not urlparts.query.startswith("page="):
        print("Finished scraping board: {} ({} threads)".format(board.title, len(board.children)))

    return board

def scrape_thread (url):
    print("Scraping thread from url: {}".format(url))
    urlparts = urlparse(url)
    baseurl = "{}://{}".format(urlparts.scheme, urlparts.netloc)

    d = pq(url=url)
    thread = Thread(title=d("h2").text())
    for post_entry in d("article.post-entry"):
        thread.children.append(Post(
            author=pq(post_entry)("header > p > a").text(),
            body=pq(post_entry)(".post-content-container").text()
        ))

    nextlink = d("a[accesskey=n]")
    if nextlink:
        thread.children = thread.children + scrape_thread("{}{}".format(baseurl, nextlink.attr.href)).children

    if not urlparts.query.startswith("page="):
        print("Finished scraping thread: {} ({} posts)".format(thread.title, len(thread.children)))

    return thread
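`scrape` dispatches on the URL path: `/topic/...` goes to `scrape_thread`, `/forums/...` to `scrape_board`, and the site root to `scrape_index` (anything else returns None). A sketch with hypothetical URLs; each call fetches pages over the network:

```python
from tge.scrapers import yuku

# Hypothetical URLs following the path prefixes scrape() dispatches on.
yuku.scrape("http://somefc.yuku.com/")                 # index  -> Forum
yuku.scrape("http://somefc.yuku.com/forums/2/Chat")    # board  -> Board
yuku.scrape("http://somefc.yuku.com/topic/123/Hello")  # thread -> Thread
```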