diff --git a/tge/__init__.py b/tge/__init__.py index f315556..93bf780 100644 --- a/tge/__init__.py +++ b/tge/__init__.py @@ -2,30 +2,28 @@ import argparse from urllib.parse import urlparse from . import scrapers, outputters +from .util import sanitize_title def main (): parser = argparse.ArgumentParser(description="Forum scraper") parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess") - parser.add_argument("--url", dest="url", required=True, help="URL to scrape") + parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape") parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url") parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used") args = parser.parse_args() - url = args.url - if not (url.startswith("https://") or url.startswith("http://")): - url = "http://{}".format(url) - - print("Target URL is: {}".format(url)) + source = vars(args)['in'] + print("Source is: {}".format(source)) if args.scraper: scraper = scrapers.get_scraper(args.scraper) print("Using scraper: {}".format(scraper.__name__)) else: - scraper = scrapers.guess_scraper(url) + scraper = scrapers.guess_scraper(source) print("Guessed scraper: {}".format(scraper.__name__)) - scraped = scraper.scrape(url) + scraped = scraper.scrape(source) - out = args.out if args.out else urlparse(url).netloc + out = args.out if args.out else sanitize_title(scraped.title) outformat = args.outformat if args.outformat else "json" print("Outputting to: {}, using {} outputter".format(out, outformat)) outputter = outputters.get_outputter(outformat)