The following 8 code examples, extracted from open source Python projects, illustrate how to use scrapy.exceptions.UsageError().
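Before the extracted examples, here is a minimal, self-contained sketch of the common pattern they all follow: a custom ScrapyCommand validates its command-line input and raises UsageError, either bare (to print the command's help text) or with a message and print_help=False (to report a specific problem). The command class, spider name, and URL check below are hypothetical and not taken from any of the projects listed.

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class FetchTitleCommand(ScrapyCommand):
    """Hypothetical 'scrapy fetch_title <url>' command, for illustration only."""

    requires_project = True

    def syntax(self):
        return "<url>"

    def short_desc(self):
        return "Print the <title> of a single page"

    def run(self, args, opts):
        # Raising UsageError with no message makes Scrapy print this command's
        # usage/help text and exit with a non-zero status.
        if len(args) != 1:
            raise UsageError()
        url = args[0]
        # A message plus print_help=False reports the specific problem without
        # repeating the full help text.
        if not url.startswith(("http://", "https://")):
            raise UsageError("URL must start with http:// or https://",
                             print_help=False)
        self.crawler_process.crawl('title_spider', start_url=url)  # hypothetical spider
        self.crawler_process.start()

A command like this would be exposed to the scrapy CLI by pointing the COMMANDS_MODULE setting at the module that defines it.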
def process_options(self, args, opts):
    try:
        self.settings.setdict(arglist_to_dict(opts.set),
                              priority='cmdline')
    except ValueError:
        raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

    if opts.logfile:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

    if opts.loglevel:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

    if opts.nolog:
        self.settings.set('LOG_ENABLED', False, priority='cmdline')

    if opts.pidfile:
        with open(opts.pidfile, "w") as f:
            f.write(str(os.getpid()) + os.linesep)

    if opts.pdb:
        failure.startDebugMode()
def process_options(self, args, opts):
    ScrapyCommand.process_options(self, args, opts)
    try:
        opts.spargs = arglist_to_dict(opts.spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
    if opts.output:
        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')
        feed_exporters = without_none_values(
            self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()
        if not opts.output_format:
            # Fall back to the output file's extension as the feed format.
            opts.output_format = os.path.splitext(opts.output)[1].replace(".", "")
        if opts.output_format not in valid_output_formats:
            raise UsageError("Unrecognized output format '%s', set one"
                             " using the '-t' switch or as a file extension"
                             " from the supported list %s" % (opts.output_format,
                                                              tuple(valid_output_formats)))
        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    command = args[0]
    arguments = _parse_arguments(opts.args)
    spider = ExternalSpider('StreamingSpider', command, arguments)
    loader = ExternalSpiderLoader.from_settings(self.settings, load_spiders=False)
    loader.crawl(spider)
def run(self, args, opts):
    if len(args) != 4:
        raise UsageError()
    spider_name, url, login, password = args
    crawler = self.crawler_process.create_crawler(spider_name)
    scheduler = Scheduler.from_settings(self.settings)
    spider = crawler.spidercls.from_crawler(crawler)
    scheduler.open(spider)
    add_login(spider, url, login, password, queue=scheduler.queue)
def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    crawler = self.crawler_process.create_crawler(args[0])
    scheduler = Scheduler.from_settings(self.settings)
    spider = crawler.spidercls.from_crawler(crawler)
    scheduler.open(spider)
    stats = scheduler.queue.get_stats()
    print('\nQueue size: {len}, domains: {n_domains}\n'.format(**stats))
    print_top = 10
    printed_count = 0
    queues = stats['queues']
    print('{:<50}\tCount\tScore'.format('Domain'))
    for queue, score, count in queues[:print_top]:
        printed_count += count
        domain = queue.rsplit(':')[-1]
        print('{:<50}\t{}\t{:.0f}'.format(domain, count, score))
    others_count = sum(count for _, _, count in queues) - printed_count
    if others_count:
        print('...')
        print('{:<50}\t{}'.format(
            'other {}:'.format(len(queues) - print_top), others_count))
    print()
    if opts.output:
        with open(opts.output, 'w') as f:
            json.dump(stats, f, ensure_ascii=False, indent=True, sort_keys=True)
        print('Stats dumped to {}'.format(opts.output))
def run(self, args, opts):
    if not args:
        raise UsageError()
    if len(args) == 1 and '*' in args[0]:
        # paths were not expanded (docker)
        filenames = glob.glob(args[0])
    else:
        filenames = args
    del args
    filtered_filenames = [
        f for f in filenames
        if re.match(r'[a-z0-9]{12}\.csv$', os.path.basename(f))]
    filenames = filtered_filenames or filenames
    if not filenames:
        raise UsageError()
    response_logs = []
    for filename in filenames:
        with json_lines.open(filename) as f:
            response_logs.append(pd.DataFrame(f))
    print('Read data from {} files'.format(len(filenames)))
    all_rpms = [rpms for rpms in (
        get_rpms(name, rlog, step=opts.step, smooth=opts.smooth)
        for name, rlog in zip(filenames, response_logs)) if rpms is not None]
    if all_rpms:
        print_rpms(all_rpms, opts)
    print_scores(response_logs, opts)
def set_pages(self, pages):
    if len(pages) == 0:
        begin_page = 1
        end_page = 999999
    else:
        begin_page = pages[0]
        end_page = pages[1]
    if begin_page <= 0:
        raise UsageError("The number of begin page must not be less than 1!")
    if begin_page > end_page:
        raise UsageError("The number of end page must not be less than that of begin page!")
    self.settings.set('BEGIN_PAGE', begin_page, priority='cmdline')
    self.settings.set('END_PAGE', end_page, priority='cmdline')
def run(self, args, opts):
    self.set_pages(opts.pages)
    self.settings.set('GOOD_ONLY', opts.good_only)
    self.settings.set('SEE_LZ', opts.see_lz)
    if opts.filter:
        try:
            opts.filter = eval('filter.' + opts.filter)
        except:
            raise UsageError("Invalid filter function name!")
    self.settings.set("FILTER", opts.filter)
    cfg = config.config()
    if len(args) >= 3:
        raise UsageError("Too many arguments!")
    self.settings.set('MYSQL_HOST', cfg.config['MYSQL_HOST'])
    self.settings.set('MYSQL_USER', cfg.config['MYSQL_USER'])
    self.settings.set('MYSQL_PASSWD', cfg.config['MYSQL_PASSWD'])
    # Resolve the target tieba name: first positional argument, or the configured default.
    tbname = cfg.config['DEFAULT_TIEBA']
    if len(args) >= 1:
        tbname = args[0]
    if isinstance(tbname, unicode):
        tbname = tbname.encode('utf8')
    # Look up the MySQL database mapped to this tieba; a second positional
    # argument overrides (and records) the mapping.
    dbname = None
    for key in cfg.config['MYSQL_DBNAME'].keys():
        if key.encode('utf8') == tbname:
            dbname = cfg.config['MYSQL_DBNAME'][key]
    if len(args) >= 2:
        dbname = args[1]
        cfg.config['MYSQL_DBNAME'][tbname.decode('utf8')] = dbname
    if not dbname:
        raise UsageError("Please input database name!")
    self.settings.set('TIEBA_NAME', tbname, priority='cmdline')
    self.settings.set('MYSQL_DBNAME', dbname, priority='cmdline')
    config.init_database(cfg.config['MYSQL_HOST'], cfg.config['MYSQL_USER'],
                         cfg.config['MYSQL_PASSWD'], dbname)
    log = config.log(tbname, dbname, self.settings['BEGIN_PAGE'],
                     opts.good_only, opts.see_lz)
    self.settings.set('SIMPLE_LOG', log)
    self.crawler_process.crawl('tieba', **opts.spargs)
    self.crawler_process.start()
    cfg.save()