52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
"""The Great Escape is a tool for scraping data from a web forum and
|
|
exporting it into a format which can be re-imported."""
|
|
|
|
import argparse
|
|
from urllib.parse import urlparse
|
|
|
|
from . import scrapers, outputters
|
|
from .util import sanitize_title
|
|
|
|
def main(argv=None):
    """The Great Escape's entry point.

    Parses the command line, selects a scraper (the one named by
    ``--scraper`` or, when that is not given, whatever
    ``scrapers.guess_scraper`` returns for the source), scrapes the source
    and passes the result to the selected outputter.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, which is the
            behaviour when this is invoked as a console entry point.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or for invalid or
            missing arguments (``--in`` is required).
    """
    args = _build_parser().parse_args(argv)

    source = args.source
    print("Source is: {}".format(source))
    if args.scraper:
        scraper = scrapers.get_scraper(args.scraper)
        print("Using scraper: {}".format(scraper.__name__))
    else:
        scraper = scrapers.guess_scraper(source)
        print("Guessed scraper: {}".format(scraper.__name__))

    scraped = scraper.scrape(source)
    print(scraped.title)

    # Fall back to a name derived from the scraped title and to the "json"
    # outputter when the options are absent (or given as empty strings).
    out = args.out or sanitize_title(scraped.title)
    outformat = args.outformat or "json"
    print("Outputting to: {}, using {} outputter".format(out, outformat))
    outputter = outputters.get_outputter(outformat)
    outputter.output(scraped, out)


def _build_parser():
    """Build the command-line argument parser used by ``main``."""
    parser = argparse.ArgumentParser(description="Forum scraper")
    parser.add_argument(
        "--scraper",
        dest="scraper",
        help="Scraper to use; if not specified, tries to guess"
    )
    parser.add_argument(
        "--in",
        # "in" is a Python keyword, so store the value under an attribute
        # name that can be read with plain attribute access.
        dest="source",
        required=True,
        help="URL or file to scrape"
    )
    parser.add_argument(
        "--out",
        dest="out",
        help="Path to output; if not specified, is derived from the scraped forum's title"
    )
    parser.add_argument(
        "--outformat",
        dest="outformat",
        help="Format to output data out; if not specified, default (JSON-based) format is used"
    )
    return parser
|