Python scrapy.utils.project module, get_project_settings() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.utils.project.get_project_settings().
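
Before the project examples, here is a minimal generic sketch (not taken from any of the projects below) of the typical pattern: get_project_settings() loads the project's settings.py, and the resulting Settings object is handed to a CrawlerProcess. The spider name 'myspider' is a placeholder.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider():
    # get_project_settings() expects to run inside a Scrapy project
    # (or with SCRAPY_SETTINGS_MODULE pointing at a settings module)
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('myspider')  # placeholder spider name
    process.start()  # blocks until crawling finishes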

Project: PythonScrapyBasicSetup    Author: matejbasic    | project source | file source
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run() # block until the last call
Project: rental    Author: meihuanyu    | project source | file source
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: hotel_crawler    Author: popwalker    | project source | file source
def start_requests(self):
        settings = get_project_settings()
        city_list = settings["CITY_LIST"]

        if self.city:
            city_cn_name = city_list.get(self.city)
            yield scrapy.FormRequest(
                url=self.base_url + self.city + "_gongyu",
                formdata={"startDate": self.start_date, "endDate": self.end_date},
                callback=self.parse,
                meta={'city_en_name': self.city, "city_cn_name": city_cn_name}
            )
        else:
            for city_en_name, city_cn_name in city_list.items():
                yield scrapy.FormRequest(
                    url=self.base_url + city_en_name + "_gongyu",
                    formdata={"startDate": self.start_date, "endDate": self.end_date},
                    callback=self.parse,
                    meta={'city_en_name': city_en_name, "city_cn_name": city_cn_name}
                )
Project: osp-scraper    Author: opensyllabus    | project source | file source
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
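
A side note on the ROBOTSTXT_OBEY override above: reaching into settings.attributes works, but the same effect is usually achieved through the public Settings.set() API. The sketch below is a hypothetical variant of the helper, not code from the osp-scraper project; the 'cmdline' priority simply ensures the override wins over the project default.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def crawl_ignoring_robots(spider, *args, **kwargs):
    settings = get_project_settings()
    # pop the flag so it is not forwarded to the spider as an argument
    if kwargs.pop('ignore_robots_txt', False):
        settings.set('ROBOTSTXT_OBEY', False, priority='cmdline')
    proc = CrawlerProcess(settings)
    proc.crawl(spider, *args, **kwargs)
    proc.start()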
Project: feeds    Author: nblock    | project source | file source
def get_feeds_settings(file_=None):
    if file_:
        logger.debug('Parsing configuration file {} ...'.format(file_.name))
        # Parse configuration file and store result under FEEDS_CONFIG of
        # scrapy's settings API.
        parser = configparser.ConfigParser()
        parser.read_file(file_)
        config = {s: dict(parser.items(s)) for s in parser.sections()}
    else:
        config = {}

    settings = get_project_settings()
    settings.set('FEEDS_CONFIG', config)

    # Mapping of feeds config section to setting names.
    for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
        config_value = config.get('feeds', {}).get(config_key)
        if config_value:
            settings.set(settings_key, config_value)

    return settings
Project: jd_comment    Author: awolfly9    | project source | file source
def runspider(name, product_id):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {
            'product_id': product_id
        }
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))

    logging.info('finish this spider:%s\n\n' % name)
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links from the list items on this page
        links = response.xpath("//li[@class='hideli']/a/@href").extract()
        if len(links) == 0:
            yield self.parse_content(response)
        else:
            for link_item in links:
                yield Request(DOMAIN + link_item, callback=self.parse_content)
        # collect pagination links
        link_page = response.xpath("//a[@class='page_a']/@href").extract()
        print "link_page:", link_page
        for page_item in link_page:
            page_id_list = page_item.split("&p=")
            this_page_list = response.url.split("&p=")
            this_index = 1
            if len(this_page_list) == 2:
                this_index = this_page_list[-1]
            if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
                print page_item
                yield Request(DOMAIN + "/news" + page_item, callback=self.link_parse)
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links on this list page
        links = response.xpath("//li[@class='pbox clr']/div[@class='word']/a/@href").extract()
        if len(links) > 0:
            for link in links:
                yield Request(DOMAIN + link, callback=self.parse_content)
        page_url = response.url
        page_size = page_url.split("page_")
        # only paginate when the URL actually contains a page index (the split yields two parts)
        if len(page_size) == 2:
            page_index = page_url.split("page_")[1].replace('.html', '')
            if 1 < int(page_index) < deeps:
                yield Request(page_url, callback=self.link_parse)

    # parse the article detail page
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links from the list items on this page
        links = response.xpath("//li[@class='hideli']/a/@href").extract()
        if len(links) == 0:
            yield self.parse_content(response)
        else:
            for link_item in links:
                yield Request(DOMAIN + link_item, callback=self.parse_content)
        # collect pagination links
        link_page = response.xpath("//a[@class='page_a']/@href").extract()
        print "link_page:", link_page
        for page_item in link_page:
            page_id_list = page_item.split("&p=")
            this_page_list = response.url.split("&p=")
            this_index = 1
            if len(this_page_list) == 2:
                this_index = this_page_list[-1]
            if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
                print page_item
                yield Request(DOMAIN + "/news" + page_item, callback=self.link_parse)
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links from the article elements
        links = response.xpath("//article/a/@href").extract()
        if len(links) == 0:
            yield self.parse_content(response)
        else:
            for link_item in links:
                yield Request(DOMAIN + link_item, callback=self.parse_content)
        # collect pagination links
        link_page = response.xpath("//div[@class='pagination']/ul/li/a/@href").extract()
        print "link_page:", link_page
        for page_item in link_page:
            page_id_list = page_item.split("_")
            this_page_list = response.url.split("_")
            this_index = 1
            if len(this_page_list) == 3:
                this_index = this_page_list[-1].replace('.html', '')
            if len(page_id_list) == 3 and int(this_index) < int(page_id_list[-1].replace('.html', '')) < deeps:
                print page_item
                yield Request(DOMAIN + page_item, callback=self.link_parse)
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links from the list items
        links = response.xpath("//li[@class='itm itm_new']/a/@href").extract()
        if len(links) == 0:
            yield self.parse_content(response)
        else:
            for link_item in links:
                yield Request(DOMAIN + link_item, callback=self.parse_content)
        # collect pagination links
        link_page = response.xpath("//li[@class='itm itm_new']/span/a/@href").extract()
        print "link_page:", link_page
        for page_item in link_page:
            page_id_list = page_item.split("pg=")
            this_page_list = response.url.split("pg=")
            this_index = 1
            if len(this_page_list) == 2:
                this_index = this_page_list[-1]
            if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
                print page_item
                yield Request(page_item, callback=self.link_parse)

    # parse the article detail page
Project: finTech    Author: keepCodingDream    | project source | file source
def link_parse(self, response):
        deeps = get_project_settings()['SPIDER_DEEP']
        # collect article detail links from the article headers
        links = response.xpath("//article/header/h1/a/@href").extract()
        if len(links) == 0:
            yield self.parse_content(response)
        else:
            for link_item in links:
                yield Request(link_item, callback=self.parse_content)
        # collect the pagination link
        link_page = response.xpath("//div[@class='nav-previous']/a/@href").extract()
        print "link_page:", link_page
        for page_item in link_page:
            page_id_list = page_item.split("page/")
            this_page_list = response.url.split("page/")
            this_index = 1
            if len(this_page_list) == 2:
                this_index = this_page_list[-1]
            if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
                print page_item
                yield Request(page_item, callback=self.link_parse)
Project: IPProxyTool    Author: awolfly9    | project source | file source
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Project: unity    Author: awolfly9    | project source | file source
def get_proxy(self):
        if get_project_settings().get('IS_USE_PROXY', True):
            if len(self.proxys) <= 10:
                self.update_proxy()

            if len(self.proxys) > 0:
                self.index = self.index + 1
                self.index = self.index % len(self.proxys)

                proxy = 'http://%s:%s' % (self.proxys[self.index].get('ip'), self.proxys[self.index].get('port'))
                utils.log('++++++++++proxy:%s++++++++++' % proxy)
                return proxy

            return None
        else:
            return None
Project: Python_Stock_Github    Author: DavidFnck    | project source | file source
def ProcessRun():

    process = CrawlerProcess(get_project_settings())
    # run a single spider by name
    process.crawl("news")
    # process.crawl("favorite_spider")
    # run every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print spider_name
        process.crawl(spider_name)
    process.start()
Project: Broad_Crawler    Author: rafacheng    | project source | file source
def __init__(self):
        settings = get_project_settings()
        self.__class__.postfix = settings.get('POSTFIX')
Project: Broad_Crawler    Author: rafacheng    | project source | file source
def __init__(self):
        # settings = get_project_settings()
        # self.__class__.sqlite_name = settings.get('sqlite_name')
        # self.conn = sqlite3.connect(str(self.__class__.sqlite_name))
        self.conn = sqlite3.connect('sample.db')
Project: Broad_Crawler    Author: rafacheng    | project source | file source
def connectSQLite():
    # settings = get_project_settings()
    # sqlite_name = settings.get('sqlite_name')
    # conn = sqlite3.connect(str(sqlite_name))
    conn = sqlite3.connect('sample.db')
    return conn
Project: caoliuscrapy    Author: leyle    | project source | file source
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
Project: ArticlePusher    Author: aforwardz    | project source | file source
def get_settings():
    settings = get_project_settings()
    LOG_PATH = settings['LOG']
    if not os.path.exists(LOG_PATH):
        os.makedirs(LOG_PATH)

    LOG_FILE = os.path.join(LOG_PATH, str(date.today()))
    if not os.path.exists(LOG_FILE):
        f = open(LOG_FILE, 'w')
        f.close()
    settings.set('LOG_FILE', LOG_FILE)
    return settings
Project: hotel_crawler    Author: popwalker    | project source | file source
def start_requests(self):
        settings = get_project_settings()
        city_list = settings["CITY_LIST"]
        if self.city:
            city_cn_name = city_list.get(self.city)
            yield scrapy.Request(
                url=self.format_url(city_cn_name, '0'),
                callback=self.parse,
                meta={
                    'city_en_name': self.city,
                    "city_cn_name": city_cn_name,
                    "current_offset": '0',
                    "handle_httpstatus_list": [400, 500, 404]
                },
            )
        else:
            for city_en_name, city_cn_name in city_list.items():
                yield scrapy.Request(
                    url=self.format_url(city_cn_name, '0'),
                    callback=self.parse,
                    meta={
                        'city_en_name': city_en_name,
                        "city_cn_name": city_cn_name,
                        "current_offset": '0',
                        "handle_httpstatus_list": [400, 500, 404]
                    },
                )
Project: StrepHit    Author: Wikidata    | project source | file source
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Project: Pixiv-Spider    Author: cathor01    | project source | file source
def __init__(self, keyword, oneof=u'', exclude=u'', max_page=0, save_star=500, save_thumbs=True, save_dir=u'big', *args, **kwargs):
        super(ImageCrawler, self).__init__(*args, **kwargs)
        settings = get_project_settings()

        self.pixiv_id = settings['PIXIV_ID']
        self.pixiv_pass = settings['PIXIV_PASS']
        self.max_page = int(max_page)
        print keyword
        if platform.system() == 'Windows':
            self.keyword = keyword.decode('gbk').replace('##', ' ')
            if oneof is not None and oneof != u'':
                self.keyword += u' (' + oneof.decode('gbk').replace('##', ' OR ') + u')'
            if exclude is not None and exclude != u'':
                excludes = exclude.split('##')
                for excl in excludes:
                    self.keyword += u' -' + excl.decode('gbk')
            self.img_save_dir = save_dir.decode('gbk')
        else:
            self.keyword = keyword.replace('##', ' ')
            if oneof is not None and oneof != u'':
                self.keyword += u' (' + oneof.replace('##', ' OR ') + u')'
            if exclude is not None and exclude != u'':
                excludes = exclude.split('##')
                for excl in excludes:
                    self.keyword += u' -' + excl
            self.img_save_dir = save_dir
        self.save_star = int(save_star)
        self.save_thumbs = save_thumbs == 'True' or save_thumbs == True
        print self.keyword, self.max_page, self.save_star, self.save_thumbs
Project: Pixiv-Spider    Author: cathor01    | project source | file source
def create_table(self):
        conn = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
        conn.text_factory = str
        try:
            conn.execute(
                """CREATE TABLE pixiv_item(
                       id INTEGER PRIMARY KEY,
                       title TEXT, link TEXT,
                       star INTEGER, multi INTEGER,
                       keyword TEXT, publish TIMESTAMP)""")
            conn.commit()
        except:
            pass
        return conn
Project: Pixiv-Spider    Author: cathor01    | project source | file source
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
Project: Pixiv-Spider    Author: cathor01    | project source | file source
def read_star():
    db = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
    db.row_factory = dict_factory
    cursor = db.cursor()
    cursor.execute('select * from pixiv_item where star is not null ORDER BY -star')
    global star_array, unstar
    star_array = cursor.fetchall()
Project: flowder    Author: amir-khakshour    | project source | file source
def __init__(self, config):
        self.settings = get_project_settings()
        self.settings.set('DOWNLOAD_MAXSIZE', config.get('max_file_size', 1024 * 1024 * 2))

        self.downloader = HTTPDownloadHandler(self.settings)
        self.proxies = {}
        self.valid_extensions = config.getlist('file_valid_extensions', "jpg, png")
        _proxies = config.items('proxy', ())
        for proxy_type, proxy in _proxies:
            self.proxies[proxy_type] = get_proxy(proxy, proxy_type)
Project: wechat-crawler    Author: DMGbupt    | project source | file source
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
Project: osp-scraper    Author: opensyllabus    | project source | file source
def run(cls):
        runner = CrawlerRunner(get_project_settings())

        @defer.inlineCallbacks
        def deferred_crawl():
            for spider, args, kwargs in cls.queue:
                try:
                    yield runner.crawl(spider, *args, **kwargs)
                except KeyError as err:
                    # Log a warning if the scraper name is invalid instead of
                    # causing the job to fail.
                    # NOTE: If there is any other type of error, the job will
                    # fail, and all the jobs that depend on it will fail as
                    # well.
                    logger.warning(err.args[0])

            # XXX: If all the names fail, then trying to run
            # `reactor.stop()` will give an "Unhandled error in
            # Deferred" complaint and hang.  It will also hang in
            # general if no spiders have been run.  I assume there's
            # some twisted-way to handle this, but for now, just log an
            # error.
            if reactor.running:
                reactor.stop()
            else:
                logger.critical("LocalQueue: No valid scraper names found.")

        deferred_crawl()
        reactor.run()
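
The deferred_crawl() pattern above stops the reactor by hand and notes the hang that occurs when no spider actually runs. A commonly shown alternative (the py-investment example further down uses it too) chains CrawlerRunner.join() so the reactor is stopped whether the crawls succeed or fail. This is a minimal sketch assuming a non-empty queue of (spider, args, kwargs) tuples like cls.queue above; it omits the KeyError handling for invalid spider names.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

def run_queue(queue):
    runner = CrawlerRunner(get_project_settings())
    for spider, args, kwargs in queue:
        runner.crawl(spider, *args, **kwargs)
    d = runner.join()  # fires once every scheduled crawl has finished
    d.addBoth(lambda _: reactor.stop())  # stop the reactor on success or failure
    reactor.run()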
Project: scrappy    Author: DormyMo    | project source | file source
def __init__(self):
        self.path = self._script_path()
        try:
            self.settings = project.get_project_settings()  # get settings
            self.configPath = self.settings.get("RESOURCE_DIR")
        except:
            pass
        if 'configPath' in self.__dict__:
            self.path = self.configPath
Project: scrappy    Author: DormyMo    | project source | file source
def __init__(self):
        self.settings = project.get_project_settings()  # get settings
        self.MYSQL_HOST = self.settings.get('MYSQL_HOST')
        self.MYSQL_PORT = self.settings.getint('MYSQL_PORT')
        self.MYSQL_USER = self.settings.get('MYSQL_USER')
        self.MYSQL_PASSWD = self.settings.get('MYSQL_PASSWD')
        self.MYSQL_DB = self.settings.get('MYSQL_DB')
        self._conn()
Project: scrappy    Author: DormyMo    | project source | file source
def __init__(self, collection_name):
        self.settings = project.get_project_settings()  # get settings
        self.MONGO_URL = self.settings.get("MONGO_URL", "localhost")
        self.client = MongoClient(
            host=self.MONGO_URL, tz_aware=True)
        self.db = self.client['crawl_db']
        self.posts = self.db[collection_name]
Project: lagouwang    Author: whaike    | project source | file source
def __init__(self):
        print 'preparing ------------------'
        self.start_page = 1
        self.modelUrl = 'https://www.lagou.com/jobs/'
        self.DBK = get_project_settings().getdict('DBK')  # read the DBK database config dict from settings
        hp.NEWHTTPS()  # refresh the HTTPS proxy IP pool
        self.oldPages = self.getOldpages()  # load the pages that were already crawled
Project: lagouwang    Author: whaike    | project source | file source
def __init__(self):
        self.DBK = get_project_settings().getdict('DBK')
Project: feeds    Author: nblock    | project source | file source
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Project: ProxyPool    Author: Time1ess    | project source | file source
def run(self, args, opts):
        conn = redis.Redis(decode_responses=True)
        runner = CrawlerRunner(get_project_settings())
        try:
            rules = Rule.loads()
            if not rules:
                raise ValueError
        except ValueError:
            print('Error in loading Redis rules, fallback to CSV rules')
            rules = Rule.loads('csv')
        for rule in rules:
            rule.save()
            if rule.name in self.excludes:
                continue
            if conn.hget('Rule:' + rule.name, 'status') == 'started':
                d = runner.crawl(ProxySpider, rule)
                # mark the rule as finished once its crawler completes
                # (bind rule.name now to avoid the late-binding closure pitfall)
                d.addBoth(lambda _, name=rule.name: conn.hset(
                    'Rule:' + name, 'status', 'finished'))
        rule_maintainer = RuleMaintainer(conn, runner)
        proxy_maintainer = ProxyMaintainer(conn)
        schedule_maintainer = ScheduleMaintainer(conn)
        lc = task.LoopingCall(rule_maintainer)
        lc.start(1)
        lc = task.LoopingCall(proxy_maintainer)
        lc.start(0.5)
        lc = task.LoopingCall(schedule_maintainer)
        lc.start(10)
        reactor.run()
Project: dianping    Author: bsns    | project source | file source
def __init__(self):
        self.settings = get_project_settings()  # load project settings to read the MySQL connection config

        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']
Project: decoration-design-crawler    Author: imflyn    | project source | file source
def __init__(self):
        self.is_running = False
        dispatcher.connect(self.pause_crawler, signals.engine_stopped)
        self.setting = get_project_settings()
        self.process = None
Project: decoration-design-crawler    Author: imflyn    | project source | file source
def __init__(self):
        self.is_running = False
        dispatcher.connect(self.pause_crawler, signals.engine_stopped)
        self.setting = get_project_settings()
        self.process = None
Project: Get-Positive    Author: M-shin    | project source | file source
def getReviewCount(url):
  # Get the number of reviews
  process = CrawlerProcess(get_project_settings())
  process.crawl(review_count_spider, start_url=url)
  process.start()
Project: finTech    Author: keepCodingDream    | project source | file source
def __new__(cls, *args, **kwargs):
        start_list = ['http://www.coindesk.com/category/news/']
        i = 2
        deeps = get_project_settings()['SPIDER_DEEP']
        while i < deeps:
            start_list.append('http://www.coindesk.com/category/news/page/' + bytes(i) + "/")
            i += 1
        CoinDesk.start_urls = start_list
        print CoinDesk.start_urls
        return super(CoinDesk, cls).__new__(cls, *args, **kwargs)
Project: finTech    Author: keepCodingDream    | project source | file source
def __init__(self):
        self.settings = get_project_settings()

        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']
Project: PythonScrapyBasicSetup    Author: matejbasic    | project source | file source
def import_settings(self):
        settings = get_project_settings()
        self.password = settings['AUTH_PASSWORD']
        self.http_proxy = settings['HTTP_PROXY']
        self.control_port = settings['CONTROL_PORT']
        self.max_req_per_ip = settings['MAX_REQ_PER_IP']
Project: dancedeets-monorepo    Author: mikelambert    | project source | file source
def runTest(self):
        settings = get_project_settings()
        settings.set('SPIDER_MODULES', ['classes.spiders'])
        try:
            sys.path.append(scrapy_path)
            runner = CrawlerRunner(settings)
            spiders = runner.spider_loader.list()
            self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
        except:
            pass
Project: tobber    Author: fchamicapereira    | project source | file source
def __init__(self):

        # getting the settings of the project (settings.py)
        self.settings = get_project_settings()

        # processing input arguments
        self.process_args()

        # meeting the arguments with the settings
        self.change_settings()

        # open mongo here just to check if mongod service is running
        # if it isn't, might as well not start crawling
        if self.args.file == None:
            self.open_mongo()
            self.dump_collection()

        # running the spiders
        self.run_crawler()

        if self.args.file:
            self.sort_file()

        else:

            if self.args.server == False:

                # working with the mongo db
                self.sort()

            # close mongo
            self.close_mongo()
Project: video_url_crawler_demo    Author: czs0x55aa    | project source | file source
def __init__(self):
        scrapy.spiders.Spider.__init__(self)

        self.global_settings = get_project_settings()
        if self.global_settings['PLATFORM'] in ['win', 'mac']:
            self.driver = webdriver.PhantomJS(executable_path= self.global_settings['PHANTOMJS_PATH'])
        elif self.global_settings['PLATFORM'] in ['linux']:
            self.driver = webdriver.PhantomJS()
        self.driver.set_page_load_timeout(30)
        self.driver.implicitly_wait(10)

        self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
        self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
        self.url_template = self.global_settings['CRAWLER']['url_template']
Project: py-investment    Author: kprestel    | project source | file source
def _run_spiders(ticker_list, start_date, end_date):
        configure_logging()
        runner = CrawlerRunner(settings=get_project_settings())

        spider_dict = {
            'symbols': ticker_list,
            'start_date': start_date,
            'end_date': end_date
        }
        runner.crawl(EdgarSpider, **spider_dict)
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Project: Pysearch2.0    Author: Pysearch    | project source | file source
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""

        start_urls = [url]
    process.crawl(ThisSpider)
    process.start()
Project: Pysearch2.0    Author: Pysearch    | project source | file source
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
Project: sogou_weixin    Author: xiaodaguan    | project source | file source
def __init__(self, **kwargs):

        settings = get_project_settings()

        self.create_display()

        self.load_proxy_list()

        self.get_item_seen(settings)