Redo main() so it can work with either the local filesystem or URLs. Rename --url to --in for consistency.
This commit is contained in:
parent
808677b327
commit
c52f472091
@ -2,30 +2,28 @@ import argparse
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from . import scrapers, outputters
|
from . import scrapers, outputters
|
||||||
|
from .util import sanitize_title
|
||||||
|
|
||||||
def main ():
|
def main ():
|
||||||
parser = argparse.ArgumentParser(description="Forum scraper")
|
parser = argparse.ArgumentParser(description="Forum scraper")
|
||||||
parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
|
parser.add_argument("--scraper", dest="scraper", help="Scraper to use; if not specified, tries to guess")
|
||||||
parser.add_argument("--url", dest="url", required=True, help="URL to scrape")
|
parser.add_argument("--in", dest="in", required=True, help="URL or file to scrape")
|
||||||
parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
|
parser.add_argument("--out", dest="out", help="Path to output; if not specified, is the target forum's url")
|
||||||
parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
|
parser.add_argument("--outformat", dest="outformat", help="Format to output data out; if not specified, default (JSON-based) format is used")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
url = args.url
|
source = vars(args)['in']
|
||||||
if not (url.startswith("https://") or url.startswith("http://")):
|
print("Source is: {}".format(source))
|
||||||
url = "http://{}".format(url)
|
|
||||||
|
|
||||||
print("Target URL is: {}".format(url))
|
|
||||||
if args.scraper:
|
if args.scraper:
|
||||||
scraper = scrapers.get_scraper(args.scraper)
|
scraper = scrapers.get_scraper(args.scraper)
|
||||||
print("Using scraper: {}".format(scraper.__name__))
|
print("Using scraper: {}".format(scraper.__name__))
|
||||||
else:
|
else:
|
||||||
scraper = scrapers.guess_scraper(url)
|
scraper = scrapers.guess_scraper(source)
|
||||||
print("Guessed scraper: {}".format(scraper.__name__))
|
print("Guessed scraper: {}".format(scraper.__name__))
|
||||||
|
|
||||||
scraped = scraper.scrape(url)
|
scraped = scraper.scrape(source)
|
||||||
|
|
||||||
out = args.out if args.out else urlparse(url).netloc
|
out = args.out if args.out else sanitize_title(scraped.title)
|
||||||
outformat = args.outformat if args.outformat else "json"
|
outformat = args.outformat if args.outformat else "json"
|
||||||
print("Outputting to: {}, using {} outputter".format(out, outformat))
|
print("Outputting to: {}, using {} outputter".format(out, outformat))
|
||||||
outputter = outputters.get_outputter(outformat)
|
outputter = outputters.get_outputter(outformat)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user