import argparse
from urllib.parse import urlparse

from . import scrapers, outputters


def main():
    """Command-line entry point: scrape a forum URL and write the results.

    Parses CLI options, normalizes the target URL, selects (or guesses) a
    scraper, runs it, and hands the scraped data to the chosen outputter.
    """
    parser = argparse.ArgumentParser(description="Forum scraper")
    # dest= is omitted: argparse derives "scraper", "url", etc. from the
    # long option names automatically.
    parser.add_argument(
        "--scraper",
        help="Scraper to use; if not specified, tries to guess")
    parser.add_argument("--url", required=True, help="URL to scrape")
    parser.add_argument(
        "--out",
        help="Path to output; if not specified, is the target forum's url")
    parser.add_argument(
        "--outformat",
        help="Format to output data out; if not specified, default "
             "(JSON-based) format is used")
    args = parser.parse_args()

    # Default to plain HTTP when the user omitted the scheme; urlparse
    # needs a scheme to extract the netloc for the default output path.
    url = args.url
    if not url.startswith(("https://", "http://")):
        url = "http://{}".format(url)
    print("Target URL is: {}".format(url))

    if args.scraper:
        scraper = scrapers.get_scraper(args.scraper)
        print("Using scraper: {}".format(scraper.__name__))
    else:
        scraper = scrapers.guess_scraper(url)
        print("Guessed scraper: {}".format(scraper.__name__))

    scraped = scraper.scrape(url)

    # Fall back to the forum's host name and the JSON format when the
    # user did not specify --out / --outformat.
    out = args.out or urlparse(url).netloc
    outformat = args.outformat or "json"
    print("Outputting to: {}, using {} outputter".format(out, outformat))
    outputter = outputters.get_outputter(outformat)
    outputter.output(scraped, out)