The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.utils.project.get_project_settings().
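Before the project examples, here is a minimal usage sketch of the common pattern. get_project_settings() reads the active project's settings.py, so a crawl started from a plain script still picks up the project's middlewares, pipelines, and other settings. The spider name "quotes" and the LOG_LEVEL override below are illustrative assumptions, not taken from any of the projects that follow.

# Minimal sketch (assumed spider name "quotes"; adjust to your project).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()   # loads settings.py of the current Scrapy project
settings.set('LOG_LEVEL', 'INFO')   # per-run override, applied before the crawl starts

process = CrawlerProcess(settings)
process.crawl('quotes')             # look up the spider by its registered name
process.start()                     # blocks until the crawl finishes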
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last call
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
def start_requests(self):
    settings = get_project_settings()
    city_list = settings["CITY_LIST"]
    if self.city:
        city_cn_name = city_list.get(self.city)
        yield scrapy.FormRequest(
            url=self.base_url + self.city + "_gongyu",
            formdata={"startDate": self.start_date, "endDate": self.end_date},
            callback=self.parse,
            meta={'city_en_name': self.city, "city_cn_name": city_cn_name}
        )
    else:
        for city_en_name, city_cn_name in city_list.items():
            yield scrapy.FormRequest(
                url=self.base_url + city_en_name + "_gongyu",
                formdata={"startDate": self.start_date, "endDate": self.end_date},
                callback=self.parse,
                meta={'city_en_name': city_en_name, "city_cn_name": city_cn_name}
            )
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
def get_feeds_settings(file_=None):
    if file_:
        logger.debug('Parsing configuration file {} ...'.format(file_.name))
        # Parse configuration file and store result under FEEDS_CONFIG of
        # scrapy's settings API.
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
    else:
        config = {}

    settings = get_project_settings()
    settings.set('FEEDS_CONFIG', config)

    # Mapping of feeds config section to setting names.
    for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
        config_value = config.get('feeds', {}).get(config_key)
        if config_value:
            settings.set(settings_key, config_value)

    return settings
def runspider(name, product_id):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {'product_id': product_id}
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))

    logging.info('finish this spider:%s\n\n' % name)
def link_parse(self, response):
    deeps = get_project_settings()['SPIDER_DEEP']  # maximum crawl depth from settings
    links = response.xpath("//li[@class='hideli']/a/@href").extract()
    if len(links) == 0:
        yield self.parse_content(response)
    else:
        for link_item in links:
            yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//a[@class='page_a']/@href").extract()
    print "link_page:", link_page
    for page_item in link_page:
        page_id_list = page_item.split("&p=")
        this_page_list = response.url.split("&p=")
        this_index = 1
        if len(this_page_list) == 2:
            this_index = this_page_list[-1]
        if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print page_item
            yield Request(DOMAIN + "/news" + page_item, callback=self.link_parse)
def link_parse(self, response):
    deeps = get_project_settings()['SPIDER_DEEP']  # maximum crawl depth from settings
    links = response.xpath("//li[@class='pbox clr']/div[@class='word']/a/@href").extract()
    if len(links) > 0:
        for link in links:
            yield Request(DOMAIN + link, callback=self.parse_content)
    page_url = response.url
    page_size = page_url.split("page_")  # a paginated URL splits into two parts around "page_"
    if len(page_size) == 2:
        page_index = page_url.split("page_")[1].replace('.html', '')
        if 1 < int(page_index) < deeps:
            yield Request(page_url, callback=self.link_parse)  # follow the next page
def link_parse(self, response):
    deeps = get_project_settings()['SPIDER_DEEP']  # maximum crawl depth from settings
    links = response.xpath("//article/a/@href").extract()
    if len(links) == 0:
        yield self.parse_content(response)
    else:
        for link_item in links:
            yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//div[@class='pagination']/ul/li/a/@href").extract()
    print "link_page:", link_page
    for page_item in link_page:
        page_id_list = page_item.split("_")
        this_page_list = response.url.split("_")
        this_index = 1
        if len(this_page_list) == 3:
            this_index = this_page_list[-1].replace('.html', '')
        if len(page_id_list) == 3 and int(this_index) < int(page_id_list[-1].replace('.html', '')) < deeps:
            print page_item
            yield Request(DOMAIN + page_item, callback=self.link_parse)
def link_parse(self, response):
    deeps = get_project_settings()['SPIDER_DEEP']  # maximum crawl depth from settings
    links = response.xpath("//li[@class='itm itm_new']/a/@href").extract()
    if len(links) == 0:
        yield self.parse_content(response)
    else:
        for link_item in links:
            yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//li[@class='itm itm_new']/span/a/@href").extract()
    print "link_page:", link_page
    for page_item in link_page:
        page_id_list = page_item.split("pg=")
        this_page_list = response.url.split("pg=")
        this_index = 1
        if len(this_page_list) == 2:
            this_index = this_page_list[-1]
        if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print page_item
            yield Request(page_item, callback=self.link_parse)  # follow the next page
def link_parse(self, response):
    deeps = get_project_settings()['SPIDER_DEEP']  # maximum crawl depth from settings
    links = response.xpath("//article/header/h1/a/@href").extract()
    if len(links) == 0:
        yield self.parse_content(response)
    else:
        for link_item in links:
            yield Request(link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//div[@class='nav-previous']/a/@href").extract()
    print "link_page:", link_page
    for page_item in link_page:
        page_id_list = page_item.split("page/")
        this_page_list = response.url.split("page/")
        this_index = 1
        if len(this_page_list) == 2:
            this_index = this_page_list[-1]
        if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print page_item
            yield Request(page_item, callback=self.link_parse)  # follow the next page
def get_proxy(self):
    if get_project_settings().get('IS_USE_PROXY', True):
        if len(self.proxys) <= 10:
            self.update_proxy()
        if len(self.proxys) > 0:
            self.index = self.index + 1
            self.index = self.index % len(self.proxys)
            proxy = 'http://%s:%s' % (self.proxys[self.index].get('ip'),
                                      self.proxys[self.index].get('port'))
            utils.log('++++++++++proxy:%s++++++++++' % proxy)
            return proxy
        return None
    else:
        return None
def ProcessRun():
    process = CrawlerProcess(get_project_settings())

    # run a single spider by name
    process.crawl("news")
    # process.crawl("favorite_spider")

    # run every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print spider_name
        process.crawl(spider_name)

    process.start()
def __init__(self):
    settings = get_project_settings()
    self.__class__.postfix = settings.get('POSTFIX')
def __init__(self):
    # settings = get_project_settings()
    # self.__class__.sqlite_name = settings.get('sqlite_name')
    # self.conn = sqlite3.connect(str(self.__class__.sqlite_name))
    self.conn = sqlite3.connect('sample.db')
def connectSQLite():
    # settings = get_project_settings()
    # sqlite_name = settings.get('sqlite_name')
    # conn = sqlite3.connect(str(sqlite_name))
    conn = sqlite3.connect('sample.db')
    return conn
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
def get_settings():
    settings = get_project_settings()
    LOG_PATH = settings['LOG']
    if not os.path.exists(LOG_PATH):
        os.makedirs(LOG_PATH)
    LOG_FILE = os.path.join(LOG_PATH, str(date.today()))
    if not os.path.exists(LOG_FILE):
        f = open(LOG_FILE, 'w')
        f.close()
    settings.set('LOG_FILE', LOG_FILE)
    return settings
def start_requests(self):
    settings = get_project_settings()
    city_list = settings["CITY_LIST"]
    if self.city:
        city_cn_name = city_list.get(self.city)
        yield scrapy.Request(
            url=self.format_url(city_cn_name, '0'),
            callback=self.parse,
            meta={
                'city_en_name': self.city,
                "city_cn_name": city_cn_name,
                "current_offset": '0',
                "handle_httpstatus_list": [400, 500, 404]
            },
        )
    else:
        for city_en_name, city_cn_name in city_list.items():
            yield scrapy.Request(
                url=self.format_url(city_cn_name, '0'),
                callback=self.parse,
                meta={
                    'city_en_name': city_en_name,
                    "city_cn_name": city_cn_name,
                    "current_offset": '0',
                    "handle_httpstatus_list": [400, 500, 404]
                },
            )
def crawl(spider_name, results_dir):
    """
    Run one or more spiders
    """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
def __init__(self, keyword, oneof=u'', exclude=u'', max_page=0, save_star=500,
             save_thumbs=True, save_dir=u'big', *args, **kwargs):
    super(ImageCrawler, self).__init__(*args, **kwargs)
    settings = get_project_settings()
    self.pixiv_id = settings['PIXIV_ID']
    self.pixiv_pass = settings['PIXIV_PASS']
    self.max_page = int(max_page)
    print keyword
    if platform.system() == 'Windows':
        self.keyword = keyword.decode('gbk').replace('##', ' ')
        if oneof is not None and oneof != u'':
            self.keyword += u' (' + oneof.decode('gbk').replace('##', ' OR ') + u')'
        if exclude is not None and exclude != u'':
            excludes = exclude.split('##')
            for excl in excludes:
                self.keyword += u' -' + excl.decode('gbk')
        self.img_save_dir = save_dir.decode('gbk')
    else:
        self.keyword = keyword.replace('##', ' ')
        if oneof is not None and oneof != u'':
            self.keyword += u' (' + oneof.replace('##', ' OR ') + u')'
        if exclude is not None and exclude != u'':
            excludes = exclude.split('##')
            for excl in excludes:
                self.keyword += u' -' + excl
        self.img_save_dir = save_dir
    self.save_star = int(save_star)
    self.save_thumbs = save_thumbs == 'True' or save_thumbs == True
    print self.keyword, self.max_page, self.save_star, self.save_thumbs
def create_table(self):
    conn = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
    conn.text_factory = str
    try:
        conn.execute(
            """CREATE TABLE pixiv_item(
                id INTEGER PRIMARY KEY,
                title TEXT,
                link TEXT,
                star INTEGER,
                multi INTEGER,
                keyword TEXT,
                publish TIMESTAMP)""")
        conn.commit()
    except:
        pass
    return conn
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
def read_star():
    db = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
    db.row_factory = dict_factory
    cursor = db.cursor()
    cursor.execute('select * from pixiv_item where star is not null ORDER BY -star')
    global star_array, unstar
    star_array = cursor.fetchall()
def __init__(self, config):
    self.settings = get_project_settings()
    self.settings.set('DOWNLOAD_MAXSIZE',
                      config.get('max_file_size', 1024 * 1024 * 2))
    self.downloader = HTTPDownloadHandler(self.settings)
    self.proxies = {}
    self.valid_extensions = config.getlist('file_valid_extensions', "jpg, png")
    _proxies = config.items('proxy', ())
    for proxy_type, proxy in _proxies:
        self.proxies[proxy_type] = get_proxy(proxy, proxy_type)
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
def run(cls):
    runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def deferred_crawl():
        for spider, args, kwargs in cls.queue:
            try:
                yield runner.crawl(spider, *args, **kwargs)
            except KeyError as err:
                # Log a warning if the scraper name is invalid instead of
                # causing the job to fail.
                # NOTE: If there is any other type of error, the job will
                # fail, and all the jobs that depend on it will fail as
                # well.
                logger.warning(err.args[0])

        # XXX: If all the names fail, then trying to run `reactor.stop()`
        # will give an "Unhandled error in Deferred" complaint and hang.
        # It will also hang in general if no spiders have been run. I
        # assume there's some twisted-way to handle this, but for now,
        # just log an error.
        if reactor.running:
            reactor.stop()
        else:
            logger.critical("LocalQueue: No valid scraper names found.")

    deferred_crawl()
    reactor.run()
def __init__(self):
    self.path = self._script_path()
    try:
        self.settings = project.get_project_settings()  # get settings
        self.configPath = self.settings.get("RESOURCE_DIR")
    except:
        pass
    if 'configPath' in self.__dict__:
        self.path = self.configPath
def __init__(self):
    self.settings = project.get_project_settings()  # get settings
    self.MYSQL_HOST = self.settings.get('MYSQL_HOST')
    self.MYSQL_PORT = self.settings.getint('MYSQL_PORT')
    self.MYSQL_USER = self.settings.get('MYSQL_USER')
    self.MYSQL_PASSWD = self.settings.get('MYSQL_PASSWD')
    self.MYSQL_DB = self.settings.get('MYSQL_DB')
    self._conn()
def __init__(self, collection_name):
    self.settings = project.get_project_settings()  # get settings
    self.MONGO_URL = self.settings.get("MONGO_URL", "localhost")
    self.client = MongoClient(host=self.MONGO_URL, tz_aware=True)
    self.db = self.client['crawl_db']
    self.posts = self.db[collection_name]
def __init__(self):
    print 'preparing ------------------'
    self.start_page = 1
    self.modelUrl = 'https://www.lagou.com/jobs/'
    self.DBK = get_project_settings().getdict('DBK')  # database config dict (DBK) from settings
    hp.NEWHTTPS()                                     # refresh the proxy IP pool
    self.oldPages = self.getOldpages()                # pages that have already been crawled
def __init__(self):
    self.DBK = get_project_settings().getdict('DBK')
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
def run(self, args, opts):
    conn = redis.Redis(decode_responses=True)
    runner = CrawlerRunner(get_project_settings())
    try:
        rules = Rule.loads()
        if not rules:
            raise ValueError
    except ValueError:
        print('Error in loading Redis rules, fallback to CSV rules')
        rules = Rule.loads('csv')

    for rule in rules:
        rule.save()
        if rule.name in self.excludes:
            continue
        if conn.hget('Rule:' + rule.name, 'status') == 'started':
            d = runner.crawl(ProxySpider, rule)
            # Set status to stopped if crawler finished
            d.addBoth(lambda _: conn.hset(
                'Rule:' + rule.name, 'status', 'finished'))

    rule_maintainer = RuleMaintainer(conn, runner)
    proxy_maintainer = ProxyMaintainer(conn)
    schedule_maintainer = ScheduleMaintainer(conn)
    lc = task.LoopingCall(rule_maintainer)
    lc.start(1)
    lc = task.LoopingCall(proxy_maintainer)
    lc.start(0.5)
    lc = task.LoopingCall(schedule_maintainer)
    lc.start(10)
    reactor.run()
def __init__(self):
    self.settings = get_project_settings()  # read database connection info from the project settings
    self.host = self.settings['MYSQL_HOST']
    self.port = self.settings['MYSQL_PORT']
    self.user = self.settings['MYSQL_USER']
    self.passwd = self.settings['MYSQL_PASSWD']
    self.db = self.settings['MYSQL_DBNAME']
def __init__(self):
    self.is_running = False
    dispatcher.connect(self.pause_crawler, signals.engine_stopped)
    self.setting = get_project_settings()
    self.process = None
def getReviewCount(url):
    # Get the number of reviews
    process = CrawlerProcess(get_project_settings())
    process.crawl(review_count_spider, start_url=url)
    process.start()
def __new__(cls, *args, **kwargs):
    start_list = ['http://www.coindesk.com/category/news/']
    i = 2
    deeps = get_project_settings()['SPIDER_DEEP']
    while i < deeps:
        start_list.append('http://www.coindesk.com/category/news/page/' + bytes(i) + "/")
        i += 1
    CoinDesk.start_urls = start_list
    print CoinDesk.start_urls
    return super(CoinDesk, cls).__new__(cls, *args, **kwargs)
def __init__(self):
    self.settings = get_project_settings()
    self.host = self.settings['MYSQL_HOST']
    self.port = self.settings['MYSQL_PORT']
    self.user = self.settings['MYSQL_USER']
    self.passwd = self.settings['MYSQL_PASSWD']
    self.db = self.settings['MYSQL_DBNAME']
def import_settings(self):
    settings = get_project_settings()
    self.password = settings['AUTH_PASSWORD']
    self.http_proxy = settings['HTTP_PROXY']
    self.control_port = settings['CONTROL_PORT']
    self.max_req_per_ip = settings['MAX_REQ_PER_IP']
def runTest(self):
    settings = get_project_settings()
    settings.set('SPIDER_MODULES', ['classes.spiders'])
    try:
        sys.path.append(scrapy_path)
        runner = CrawlerRunner(settings)
        spiders = runner.spider_loader.list()
        self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
    except:
        pass
def __init__(self):
    # getting the settings of the project (settings.py)
    self.settings = get_project_settings()
    # processing input arguments
    self.process_args()
    # meeting the arguments with the settings
    self.change_settings()
    # open mongo here just to check if mongod service is running
    # if it isn't, might as well not start crawling
    if self.args.file == None:
        self.open_mongo()
        self.dump_collection()
    # running the spiders
    self.run_crawler()
    if self.args.file:
        self.sort_file()
    else:
        if self.args.server == False:
            # working with the mongo db
            self.sort()
        # close mongo
        self.close_mongo()
def __init__(self):
    scrapy.spiders.Spider.__init__(self)
    self.global_settings = get_project_settings()
    if self.global_settings['PLATFORM'] in ['win', 'mac']:
        self.driver = webdriver.PhantomJS(
            executable_path=self.global_settings['PHANTOMJS_PATH'])
    elif self.global_settings['PLATFORM'] in ['linux']:
        self.driver = webdriver.PhantomJS()
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)
    self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
    self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
    self.url_template = self.global_settings['CRAWLER']['url_template']
def _run_spiders(ticker_list, start_date, end_date):
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    spider_dict = {
        'symbols': ticker_list,
        'start_date': start_date,
        'end_date': end_date
    }
    runner.crawl(EdgarSpider, **spider_dict)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""
        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
def __init__(self, **kwargs):
    settings = get_project_settings()
    self.create_display()
    self.load_proxy_list()
    self.get_item_seen(settings)