the-great-escape/tge/__init__.py
2016-11-26 23:09:12 -06:00

33 lines
1.4 KiB
Python

import argparse
from urllib.parse import urlparse
from . import scrapers, outputters
def main(argv=None):
    """Scrape a forum and hand the result to the selected outputter.

    Parses the command-line options, prefixes the target URL with ``http://``
    when it does not already carry an HTTP(S) scheme, selects a scraper
    (``scrapers.get_scraper`` when ``--scraper`` is given, otherwise
    ``scrapers.guess_scraper``), scrapes the URL and passes the scraped data
    to the outputter returned by ``outputters.get_outputter``.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes ``argparse``
            read ``sys.argv[1:]``, which is the original behaviour.

    Raises:
        SystemExit: Raised by ``argparse`` on ``--help`` or when the
            arguments are invalid (e.g. ``--url`` is missing).
    """
    parser = argparse.ArgumentParser(description="Forum scraper")
    parser.add_argument("--scraper", help="Scraper to use; if not specified, tries to guess")
    parser.add_argument("--url", required=True, help="URL to scrape")
    parser.add_argument("--out", help="Path to output; if not specified, is the target forum's url")
    parser.add_argument("--outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
    args = parser.parse_args(argv)

    url = args.url
    # URI schemes are case-insensitive, so compare against a lower-cased copy;
    # otherwise "HTTPS://host" would be turned into "http://HTTPS://host".
    if not url.lower().startswith(("https://", "http://")):
        url = "http://{}".format(url)
    print("Target URL is: {}".format(url))

    if args.scraper:
        scraper = scrapers.get_scraper(args.scraper)
        print("Using scraper: {}".format(scraper.__name__))
    else:
        scraper = scrapers.guess_scraper(url)
        print("Guessed scraper: {}".format(scraper.__name__))

    scraped = scraper.scrape(url)

    # Fall back to the host part of the URL as the output path, and to the
    # "json" outputter, when the corresponding options were not supplied.
    out = args.out or urlparse(url).netloc
    outformat = args.outformat or "json"
    print("Outputting to: {}, using {} outputter".format(out, outformat))

    outputter = outputters.get_outputter(outformat)
    outputter.output(scraped, out)