"""Command-line entry point for the forum scraper."""

import argparse
from urllib.parse import urlparse  # NOTE(review): currently unused here — confirm before removing

from . import scrapers, outputters
from .util import sanitize_title


def main():
    """Parse CLI arguments, scrape the requested source, and write the result.

    Flags:
        --scraper    scraper name; if omitted, one is guessed from the source
        --in         URL or file to scrape (required)
        --out        output path; defaults to a sanitized form of the scraped title
        --outformat  outputter name; defaults to "json"
    """
    parser = argparse.ArgumentParser(description="Forum scraper")
    parser.add_argument("--scraper", dest="scraper",
                        help="Scraper to use; if not specified, tries to guess")
    # dest="source" avoids the Python keyword `in`, so the value is readable as a
    # normal attribute instead of via the vars(args)['in'] workaround. The public
    # CLI flag (--in) is unchanged.
    parser.add_argument("--in", dest="source", required=True,
                        help="URL or file to scrape")
    parser.add_argument("--out", dest="out",
                        help="Path to output; if not specified, is the target forum's url")
    parser.add_argument("--outformat", dest="outformat",
                        help="Format to output data out; if not specified, default (JSON-based) format is used")
    args = parser.parse_args()

    source = args.source
    print("Source is: {}".format(source))

    # Explicit scraper choice wins; otherwise let the registry guess from the source.
    if args.scraper:
        scraper = scrapers.get_scraper(args.scraper)
        print("Using scraper: {}".format(scraper.__name__))
    else:
        scraper = scrapers.guess_scraper(source)
        print("Guessed scraper: {}".format(scraper.__name__))

    scraped = scraper.scrape(source)

    # Fall back to a filesystem-safe version of the scraped title / "json" format.
    out = args.out if args.out else sanitize_title(scraped.title)
    outformat = args.outformat if args.outformat else "json"
    print("Outputting to: {}, using {} outputter".format(out, outformat))

    outputter = outputters.get_outputter(outformat)
    outputter.output(scraped, out)