Python scrapy.exceptions module: DropItem() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.exceptions.DropItem().
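
Every example below follows the same contract: an item pipeline's process_item() either returns the item, passing it on to the next pipeline, or raises DropItem to discard it. Here is a minimal sketch of that pattern (the DuplicatesPipeline class name and the 'id' field are illustrative assumptions, not taken from any particular project below):

from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    """Illustrative pipeline: drop any item whose 'id' was seen before."""

    def __init__(self):
        self.ids_seen = set()  # ids of all items seen so far (assumed 'id' field)

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            # Raising DropItem stops this item from reaching later pipelines.
            raise DropItem('Duplicate item found: %s' % item)
        self.ids_seen.add(item['id'])
        return item  # returning the item passes it on

A pipeline only takes effect once registered in the project's settings, e.g. ITEM_PIPELINES = {'myproject.pipelines.DuplicatesPipeline': 300}, where the module path is a placeholder.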

Project: news-please | Author: fhamborg
def process_item(self, item, spider):
        if spider.name == 'RssCrawler':
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                    pymysql.IntegrityError, TypeError) as error:
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            if old_version is not None:
                # Compare the two download dates. index 3 of old_version
                #   corresponds to the download_date attribute in the DB
                if (datetime.datetime.strptime(
                        item['download_date'], "%y-%m-%d %H:%M:%S") -
                        old_version[3]) \
                        < datetime.timedelta(hours=self.delta_time):
                    raise DropItem("Article in DB too recent. Not saving.")

        return item
Project: job_scraper | Author: wlabatey
def process_item(self, item, spider):
        keywords = spider.search_terms
        title = item['title'].lower()
        #####
        # We can pass in excluded words the same way as keywords later. Commented out for now.
        #
        # excluded_words = ['asp.net', 'java', 'c#', 'web developer', 'c++',
        #                   'windows', 'qa', 'support', '.net', 'manager', 'sales',
        #                   'marketing', 'senior', 'snr', 'salesforce', 'crm']
        #
        # if any(keyword in title for keyword in excluded_words):
        #     raise DropItem("Job title contained excluded word")
        #####
        if any(keyword in title for keyword in keywords):
            return item
        else:
            raise DropItem("Job title doesn't contain our search terms")
Project: crepriceSpider | Author: zhousenbiao
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {0}!'.format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)

        return item

    # def testdb(self):
    #     # Connect to MongoHQ
    #     con = pymongo.Connection("paulo.mongohq.com",10042)
    #     db = con.mytest
    #     db.authenticate("root", "sa123")
    #     db.urllist.drop()
Project: ssp-transparencia | Author: eltermann
def process_item(self, item, spider):
        if isinstance(item, SsptransparenciaBO):
            key = 'bos'
            _id = item['id']
        elif isinstance(item, SsptransparenciaVitima):
            key = 'vitimas'
            _id = '%s::%s' % (item['bo_id'], item['count'])
        elif isinstance(item, SsptransparenciaNatureza):
            key = 'naturezas'
            _id = '%s::%s' % (item['bo_id'], item['count'])

        if _id in self.ids_seen[key]:
            raise DropItem('Duplicate item found: %s' % item)
        else:
            self.ids_seen[key].add(_id)
            return item
Project: housebot | Author: jbkopecky
def process_item(self, item, domain):
        now = arrow.now()
        seen = self.check_seen_before(item)
        if len(seen) > 0:
            last_seen = max(seen)
            time_limit = now.replace(**self.time_scale).timestamp
            if last_seen < time_limit:
                self.insert_item_price(item, now.timestamp)
            raise DropItem("Already seen %s, %s" % (item['url'], arrow.get(last_seen).humanize()))
        else:
            self.insert_item_price(item, now.timestamp)
            self.insert_item_main(item)
            self.insert_item_tag_list(item)
            self.insert_item_description(item)
            self.conn.commit()
            return item
Project: jd_spider | Author: samrayleung
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                # key = {}
                # key['sku_id'] = item['sku_id']
                # self.db[item['item_name']].update(key, dict(item), upsert=True)
                self.db[item['item_name']].insert(dict(item))
                logging.debug("add {}".format(item['item_name']))
            except (pymongo.errors.WriteError, KeyError) as err:
                raise DropItem("Duplicated Item: {}".format(item['name']))
        return item
Project: mensa-tracker | Author: annyanich
def process_item(self, item, spider):

        db_matches = db.session.query(DBMenuEntry).filter_by(
            category=item['category'],
            mensa=item['mensa'],
            description=item['description'],
            date_valid=item['date_valid'],
            allergens=item['allergens'],
            price=item['price']
        ).all()

        if db_matches:
            # If there is more than one matching entry in the database, we probably
            # already saved a duplicate by accident.  I really hope that doesn't happen.
            assert(len(db_matches) == 1)

            spider.crawler.stats.inc_value('items_already_in_db')
            raise DropItem(
                "Menu item already found in database.\n"
                "Previously scraped on: {previous_scrape_time}".format(
                    previous_scrape_time=str(db_matches[0].time_scraped)))
        else:
            return item
Project: jiandan_2 | Author: qiyeboy
def item_completed(self, results, item, info):
        '''Called once all image requests for a single item have completed
        (finished downloading or failed). item_completed() must return the
        item, or raise DropItem to discard it.

        :param results: list of (success, image_info_or_failure) tuples
        :param item: the scraped item the images belong to
        :param info: pipeline state info
        :return: the item
        '''
        spiderName = self.spiderinfo.spider.name
        if spiderName == 'jiandan':
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            return item
Project: Newscrawler | Author: JBH168
def process_item(self, item, spider):
        if spider.name == 'RssCrawler':
            # Search the CurrentVersion table for a version of the article
            try:
                self.cursor.execute(self.compare_versions, (item['url'],))
            except mysql.connector.Error as error:
                self.log.error("Something went wrong in rss query: %s", error)

            # Save the result of the query. Must be done before the add,
            #   otherwise the result will be overwritten in the buffer
            old_version = self.cursor.fetchone()

            if old_version is not None:
                # Compare the two download dates. index 3 of old_version
                #   corresponds to the download_date attribute in the DB
                if (datetime.datetime.strptime(
                        item['download_date'], "%y-%m-%d %H:%M:%S") -
                        old_version[3]) \
                        < datetime.timedelta(hours=self.delta_time):
                    raise DropItem("Article in DB too recent. Not saving.")

        return item
Project: feeds | Author: nblock
def process_item(self, item, spider):
        def raise_if_missing(name, item):
            if name not in item:
                raise DropItem('The required field "{}" is missing in: {}.'.
                               format(name, item))

        # Required fields for all items
        for required in ('id', 'title', 'link'):
            raise_if_missing(required, item)

        # Required fields for FeedEntryItems
        if isinstance(item, FeedEntryItem):
            for required in ('updated',):
                raise_if_missing(required, item)

        return item
Project: ProxyPool | Author: Time1ess
def process_item(self, item, spider):
        if not isinstance(item, ProxyItem):
            return item
        if not item.get('ip', None) or not item.get('port', None):
            raise DropItem('Bad ProxyItem')
        item.setdefault('addr', 'Unknown')
        item.setdefault('mode', 'Unknown')
        item.setdefault('protocol', 'http')
        item.setdefault('validation_time', 'Unknown')
        proxy = '{}://{}'.format(item['protocol'], item['proxy'])
        if self.conn.sismember('rookie_proxies', proxy) or\
                self.conn.sismember('available_proxies', proxy) or\
                self.conn.sismember('lost_proxies', proxy) or\
                self.conn.sismember('dead_proxies', proxy):
            raise DropItem('Already in the waiting list')
        key = 'proxy_info:'+item['proxy']
        pipe = self.conn.pipeline(False)
        pipe.sadd('rookie_proxies', proxy)
        pipe.zadd('rookies_checking', item['proxy'], time.time())
        pipe.hmset(key, dict(item))
        pipe.hset(key, 'failed_times', 0)
        pipe.execute()
        return item
Project: rojak | Author: pyk
def process_item(self, item, spider):
        title = item.get('title', 'title_not_set')
        if title == 'title_not_set':
            err_msg = 'Missing title in: %s' % item.get('url')
            raise DropItem(err_msg)

        raw_content = item.get('raw_content', 'raw_content_not_set')
        if raw_content == 'raw_content_not_set':
            err_msg = 'Missing raw_content in: %s' % item.get('url')
            raise DropItem(err_msg)

        published_at = item.get('published_at', 'published_at_not_set')
        if published_at == 'published_at_not_set':
            err_msg = 'Missing published_at in: %s' % item.get('url')
            raise DropItem(err_msg)

        # Pass item to the next pipeline, if any
        return item
Project: dazdp | Author: guapier
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                # key = {}
                # key['sku_id'] = item['sku_id']
                # self.db[item['item_name']].update(key, dict(item), upsert=True)
                self.db[item['item_name']].insert(dict(item))
                logging.debug("add {}".format(item['item_name']))
            except (pymongo.errors.WriteError, KeyError) as err:
                raise DropItem("Duplicated Item: {}".format(item['name']))
        return item
Project: multimedia_crawler | Author: JFluo2011
def process_item(self, item, spider):
        try:
            data = {
                'url': item['url'],
                'file_name': item['file_name'],
                'media_type': item['media_type'],
                'host': item['host'],
                'file_dir': item['file_dir'],
                'download': item['download'],
                'extract': item['extract'],
                'info': item['info'],
                'stack': item['stack'],
                'media_urls': item['media_urls'],
            }
            self.col.update({'url': item['url']}, data, upsert=True)
            # self.col.update({'url': item['url']}, {'$set': {'info': item['info']}})
            # self.col.insert(data)
        except Exception as err:
            logging.error(str(err))
            raise DropItem(str(err))
        return item
Project: multimedia_crawler | Author: JFluo2011
def __insert_item(self, item=None):
        item, self.items = self.items, item
        item.pop('index', None)
        try:
            data = {
                'url': item['url'],
                'file_name': item['file_name'],
                'media_type': item['media_type'],
                'host': item['host'],
                'file_dir': item['file_dir'],
                'download': item['download'],
                'extract': item['extract'],
                'info': item['info'],
                'stack': item['stack'],
                'media_urls': item['media_urls'],
            }
            self.col.update({'url': item['url']}, data, upsert=True)
            # self.col.insert(data)
        except Exception as err:
            logging.error(str(err))
            raise DropItem(str(err))
        return item
Project: scrapy_redis_mongodb | Author: smilemilk1992
def process_item(self, item, spider):
        if self.site_item_exist(item):
            self.MG_table.insert(dict(item))
            logging.debug("Question added to MongoDB database!")
            # log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
            '''
            Scrapy provides 5 logging levels:
            CRITICAL - critical errors
            ERROR    - regular errors
            WARNING  - warning messages
            INFO     - informational messages
            DEBUG    - debugging messages (the default level is DEBUG)
            '''
        else:
            raise DropItem("{} is exist".format(item['url']))
        return item
Project: hoaxy-backend | Author: IUNetSci
def process_item(self, item, spider):
        """Main function that process URL item (first phase)."""
        # validate URL length
        if len(item['raw']) > MAX_URL_LEN:
            item['raw'] = item['raw'][:MAX_URL_LEN]
            logger.error('Raw URL too long, truncating it! %r', item['raw'])
        # parse raw URL
        purl = get_parsed_url(item['raw'])
        if purl is None or purl.hostname is None:
            raise DropItem('Invalid URL')
        site_id = belongs_to_site(purl.hostname, self.site_tuples)
        if site_id is None:
            raise DropItem('Offsite domain: %s' % item)
        item['site_id'] = site_id
        # insert URL into table
        try:
            get_or_create_murl(spider.session, item, spider.platform_id)
        except SQLAlchemyError as e:
            logger.error(e)
            spider.session.rollback()
            raise DropItem('Failed to insert URL into database: %s' % item)
        return item
Project: remotor | Author: jamiebull1
def process_item(self, item, spider):
        """Check if we need to store the item and decide whether to notify.
        """
        # check if already in the database
        stored = self.jobs_collection.find_one({'url': item['url']})
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            if stored:
                item = stored
                item['times_seen'] += 1
                self.jobs_collection.update(
                    {'_id': item['_id']}, dict(item), False)
            else:
                # if not (and if not already set), add date to item
                if not item.get('date_added', False):
                    item['date_added'] = datetime.now().isoformat()
                if not item.get('date_posted', False):
                    item['date_posted'] = datetime.now().isoformat()
                item['times_seen'] = 0
                self.jobs_collection.insert(item)
        return item
Project: pydata_webscraping | Author: jmortega
def _convert(self, item, spider):
        image_paths = [im['path'] for im in item['images']]

        datapath = spider.crawler.settings['FILES_STORE']
        image_files = [datapath + path for path in image_paths]

        item['pdf_file'] = '%s.pdf' % item['id']
        dest = '{root}/{spider}/{file}'.format(
            root=datapath,
            spider=item['spider'],
            file=item['pdf_file'],
        )
        print "file:"+dest
        # Use convert command from ImageMagick.
        cmd = ['convert'] + image_files + [dest]
        try:
            # TODO: capture errors
            subprocess.check_call(cmd, stdout=subprocess.PIPE)
        except subprocess.CalledProcessError as detail:
            print(detail)
            raise DropItem("failed to generate PDF")

        return item
Project: crawl_web | Author: hanxlinsist
def process_item(self, item, spider):
        # Join the non-blank bookinfo elements into a comma-separated string.
        joined = ""
        for e in item["bookinfo"]:
            if re.search(r'^\s*$', e):
                print("drop this element")
            else:
                joined = joined + e + ","
        item["bookinfo"] = joined[:-1]

        if item['name'] and item['author']:
            return item
        raise DropItem("Missing name or author in %s" % item)
Project: NewsScrapy | Author: yinzishao
def process_item(self, item, spider):
        item_keywords = judge_key_words(item)  # extract the keywords that appear in this item
        if item_keywords:   # keep only items that contain at least one keyword
            item["keywords"] = item_keywords
            return item
        else:
            logger = logging.getLogger(spider.name)
            logger.info("No keyword in %s" % item["news_url"])
            raise DropItem("No keyword in %s" % item["news_url"])
Project: scrapy_projects | Author: morefreeze
def process_item(self, item, spider):
        """check item weather in item_seen
        """
        if item['hash'] in self.item_seen:
            raise DropItem('Duplicate item found: %s' % item)
        else:
            self.item_seen.add(item['hash'])
            return item
Project: scrapy_projects | Author: morefreeze
def process_item(self, item, spider):
        """return ip is duplicate or not

        :item: crawl item including host port
        :returns: return item or DropItem
        """
        if 'ip' not in item:
            raise DropItem('')
        port = item.get('port', 80)
        host = '%s:%s' % (item['ip'], port)
        if self.conn.sismember(settings.HOST_S, host) or self.dup_in_queue(host):
            raise DropItem('%s, cause duplicate' % (host))
        else:
            return item
Project: scrapy_projects | Author: morefreeze
def process_item(self, item, spider):
        """save to redis and return item

        :item: crawl item including host port
        :returns: return item or DropItem
        """
        if 'ip' not in item:
            raise DropItem('')
        port = item.get('port', 80)
        host = '%s:%s' % (item['ip'], port)
        self.conn.sadd(self.host_s, host)
        return item
Project: job_scraper | Author: wlabatey
def process_item(self, item, spider):
        job_title_company = item['title'] + item['company']
        if job_title_company in self.title_company:
            raise DropItem("Duplicate item found: %s" % (item))
        else: 
            self.title_company.add(job_title_company)
            return item
Project: rental | Author: meihuanyu
def process_item(self, item, spider):
        valid = True
        print('--' * 40)
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                self.collection.insert(dict(item))
                log.msg("Question added to MongoDB database!",
                        level=log.DEBUG, spider=spider)
            except Exception as err:
                print('Failed to insert item into MongoDB: %s' % err)
        return item
Project: crepriceSpider | Author: zhousenbiao
def process_item(self, item, spider):
        print("------")
        if len(item.keys()) >= 5:
            if item in self.has:
                raise DropItem("Duplicate item found: %s" % item)
            else:
                self.has.add(item)
                return item

# MongoDB pipeline
Project: amazon-crawler | Author: ahmedezzeldin93
def process_item(self, item, spider):
        if item['pid'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['pid'])
            return item
Project: web_crawler | Author: NearXdu
def process_item(self, item, spider):
        if item['link'] in self.seen:
            raise DropItem('Duplicate link %s' % item['link'])
        self.seen.add(item['link'])
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item
Project: web_crawler | Author: NearXdu
def process_item(self, item, spider):
        if not re.match(r'.*comment.*', item['link']):
            if re.match(r'^http.*qq.com.*\.s?html?$', item['link']):
                if item['link'] in self.seen:
                    raise DropItem('Duplicate link %s' % item['link'])
                self.seen.add(item['link'])
                line = json.dumps(dict(item), ensure_ascii=False) + '\n'
                self.file.write(line)
                return item
Project: web_crawler | Author: NearXdu
def process_item(self, item, spider):
        if item['link'] in self.seen:
            raise DropItem('Duplicate link %s' % item['link'])
        self.seen.add(item['link'])
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item
Project: web_crawler | Author: NearXdu
def process_item(self, item, spider):
        if redis_db.hexists(redis_data_dict, item['link']):
            raise DropItem("Duplicate item found: %s" % item)
        else:
#            print item['link']

            cur=self.conn.cursor()
            add_url = """insert into sohuurl(url) VALUES (%s)"""
            data_url=(str(item['link']),)
            cur.execute(add_url,data_url)
            self.conn.commit()
            cur.close()
            return item
Project: web_crawler | Author: NearXdu
def process_item(self, item, spider):
        if item['link'] in self.seen:
            raise DropItem('Duplicate Link %s' % item['link'])
        self.seen.add(item['link'])
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item
Project: multithread-crawler | Author: SaberAlexander
def process_item(self, jd_item, JDspider):
        # if not jd_item['flag']:
        #     raise DropItem("item dropped found: %s" % jd_item)
        # else:
        str_line1 = json.dumps(dict(jd_item)) + "\n"
        self.file1.write(str_line1)
        str_line2 = json.dumps(dict(jd_item)) + ',' + '\n'
        self.file2.write(str_line2)

        return jd_item
Project: Jobs-search | Author: Hopetree
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {}!'.format(data))
        if valid:
            self.coll.insert(dict(item))
            log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)

        return item
Project: FreeFoodCalendar | Author: Yuliang-Zou
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Event added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
Project: FreeFoodCalendar | Author: Yuliang-Zou
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Event added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
Project: bigdata_data | Author: htzy
def process_item(self, item, spider):
        if item['link'] in self.ids_seen:
            raise DropItem("Duplicate item found:%s" % item)
        else:
            self.ids_seen.add(item['link'])
            return item
Project: bigdata_data | Author: htzy
def process_item(self, item, spider):
        if re.search(u'window|??|??|????', item['title'], re.I):
            print "ignore this item"
            raise DropItem("Contains word that you don't want: %s" % item['title'])
        elif re.search(u'window|??|??|????', item['abstract'], re.I):
            print "ignore this item"
            raise DropItem("Contains word that you don't want: %s" % item['abstract'])
        else:
            return item
Project: SinaWeiboSpider | Author: wen-fei
def process_item(self, item, spider):
        collection_name = item.__class__.__name__
        try:
            self.db[collection_name].insert(dict(item))
        except DuplicateKeyError:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            return item
Project: airbnb_scraper | Author: bashedev
def process_item(self, item, spider):
        """Drop items not fitting parameters. Open in browser if specified. Return accepted items."""

        if self._skip_list and str(item['id']) in self._skip_list:
            raise DropItem('Item in skip list: {}'.format(item['id']))

        if self._minimum_monthly_discount and 'monthly_discount' in item:
            if item['monthly_discount'] < self._minimum_monthly_discount:
                raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))

        if self._minimum_weekly_discount and 'weekly_discount' in item:
            if item['weekly_discount'] < self._minimum_weekly_discount:
                raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))

        # check regexes
        if self._cannot_have_regex:
            for f in self._fields_to_check:
                v = str(item[f].encode('ASCII', 'replace'))
                if self._cannot_have_regex.search(v):
                    raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))

        if self._must_have_regex:
            has_must_haves = False
            for f in self._fields_to_check:
                v = str(item[f].encode('ASCII', 'replace'))
                if self._must_have_regex.search(v):
                    has_must_haves = True
                    break

            if not has_must_haves:
                raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))

        # open in browser
        if self._web_browser:
            webbrowser.get(self._web_browser).open(item['url'])

        return item
Project: pythonStudy | Author: jeikerxiao
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        #item['image_paths'] = image_paths
        return item
Project: pythonStudy | Author: jeikerxiao
def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
Project: jd_spider | Author: samrayleung
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                key = {}
                self.db[item['item_name']].insert(dict(item))
                logging.debug("add {}".format(item['item_name']))
            except (pymongo.errors.WriteError, KeyError) as err:
                raise DropItem(
                    "Duplicated comment Item: {}".format(item['good_name']))
        return item
Project: crawler | Author: brantou
def process_item(self, item, spider):
        if item['pid'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['pid'])
            return item
Project: scrapy-image | Author: lamphp
def item_completed(self, results, item, info):
        if info.spider.name == 'sisy':
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            item['image_paths'] = image_paths
            return item
Project: wechat-crawler | Author: DMGbupt
def process_item(self, item, spider):
        """??????
        """

        # ????
        if isinstance(item, WeChat):
            if self.is_duplicate_wechat(item):
                return DropItem("Duplicate news found: %s" % item['article_addr'])
            else:
                return item
Project: ip_proxy_pool | Author: leeyis
def process_item(self, item, spider):
        if Redis.exists('ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port'])):
            raise DropItem("Duplicate item found: %s" % item)
        else:
            Redis.set('ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port']), 1)
            return item
Project: autoinjection | Author: ChengWiLL
def __getValue(self, url):
        isHaveManyQueryInUrl = False
        for value in self.valuedict:
            div_by_value = url.split(value.rstrip('\n'))
            mm = div_by_value[0]
            if mm in self.seen:
                raise DropItem('Duplicate link %s' % url)
            elif len(div_by_value) > 1 and not isHaveManyQueryInUrl:
                self.seen.add(mm)
                isHaveManyQueryInUrl = True
                line = url+'\n'
                print(url)
                self.file.write(line)
Project: Newscrawler | Author: JBH168
def process_item(self, item, spider):
        # For the case where something goes wrong
        if item['spider_response'].status != 200:
            # Item is no longer processed in the pipeline
            raise DropItem("%s: Non-200 response" % item['url'])
        else:
            return item