The following 50 code examples, extracted from open source Python projects, illustrate how to use scrapy.exceptions.DropItem().
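Before the extracted examples, here is a minimal, self-contained sketch of the pattern they all share: an item pipeline's process_item() either returns the item to pass it on, or raises DropItem to discard it, after which Scrapy logs the drop and skips any remaining pipelines for that item. The pipeline name, the fields ('price', 'url') and the threshold below are illustrative assumptions, not taken from any of the quoted projects.

# Minimal sketch of DropItem usage; names and threshold are hypothetical.
from scrapy.exceptions import DropItem


class PriceValidationPipeline(object):
    """Drop items that are missing a price or whose price is below a threshold."""

    min_price = 1.0  # illustrative threshold

    def process_item(self, item, spider):
        # Raising DropItem stops processing of this item: later pipelines are
        # skipped and Scrapy logs the item as dropped, with the given reason.
        if not item.get('price'):
            raise DropItem("Missing price in %s" % item.get('url'))
        if item['price'] < self.min_price:
            raise DropItem("Price too low (%s) in %s" % (item['price'], item.get('url')))
        # Returning the item passes it on to the next enabled pipeline.
        return item

Such a pipeline would be enabled via ITEM_PIPELINES in the project settings; every example below is a variation of the same process_item() / item_completed() contract.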
def process_item(self, item, spider):
    if spider.name == 'RssCrawler':
        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except (pymysql.err.OperationalError, pymysql.ProgrammingError,
                pymysql.InternalError, pymysql.IntegrityError, TypeError) as error:
            self.log.error("Something went wrong in rss query: %s", error)

        # Save the result of the query. Must be done before the add,
        # otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()

        if old_version is not None:
            # Compare the two download dates. index 3 of old_version
            # corresponds to the download_date attribute in the DB
            if (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                    old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                raise DropItem("Article in DB too recent. Not saving.")

    return item

def process_item(self, item, spider):
    keywords = spider.search_terms
    title = item['title'].lower()
    #####
    # We can pass in excluded words the same way as keywords later. Commented out for now.
    # excluded_words = ['asp.net', 'java', 'c#', 'web developer', 'c++',
    #                   'windows', 'qa', 'support', '.net', 'manager', 'sales',
    #                   'marketing', 'senior', 'snr', 'salesforce', 'crm']
    #####
    #####
    # if any(keyword in title for keyword in excluded_words):
    #     raise DropItem("Job title contained excluded word")
    #####
    if any(keyword in title for keyword in keywords):
        return item
    else:
        raise DropItem("Job title doesn't contain our search terms")

def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem('Missing {0}!'.format(data))
    if valid:
        self.collection.insert(dict(item))
        log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)
    return item

# def testdb(self):
#     # Connect to MongoHQ
#     con = pymongo.Connection("paulo.mongohq.com", 10042)
#     db = con.mytest
#     db.authenticate("root", "sa123")
#     db.urllist.drop()

def process_item(self, item, spider):
    if isinstance(item, SsptransparenciaBO):
        key = 'bos'
        _id = item['id']
    elif isinstance(item, SsptransparenciaVitima):
        key = 'vitimas'
        _id = '%s::%s' % (item['bo_id'], item['count'])
    elif isinstance(item, SsptransparenciaNatureza):
        key = 'naturezas'
        _id = '%s::%s' % (item['bo_id'], item['count'])
    if _id in self.ids_seen[key]:
        raise DropItem('Duplicate item found: %s' % item)
    else:
        self.ids_seen[key].add(_id)
        return item

def process_item(self, item, domain):
    now = arrow.now()
    seen = self.check_seen_before(item)
    if len(seen) > 0:
        last_seen = max(seen)
        time_limit = now.replace(**self.time_scale).timestamp
        if last_seen < time_limit:
            self.insert_item_price(item, now.timestamp)
        raise DropItem("Already seen %s, %s" % (item['url'],
                                                arrow.get(last_seen).humanize()))
    else:
        self.insert_item_price(item, now.timestamp)
        self.insert_item_main(item)
        self.insert_item_tag_list(item)
        self.insert_item_description(item)
    self.conn.commit()
    return item

def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            # key = {}
            # key['sku_id'] = item['sku_id']
            # self.db[item['item_name']].update(key, dict(item), upsert=True)
            self.db[item['item_name']].insert(dict(item))
            logging.debug("add {}".format(item['item_name']))
        except (pymongo.errors.WriteError, KeyError) as err:
            raise DropItem("Duplicated Item: {}".format(item['name']))
    return item

def process_item(self, item, spider):
    db_matches = db.session.query(DBMenuEntry).filter_by(
        category=item['category'],
        mensa=item['mensa'],
        description=item['description'],
        date_valid=item['date_valid'],
        allergens=item['allergens'],
        price=item['price']
    ).all()
    if db_matches:
        # If there is more than one matching entry in the database, we probably
        # already saved a duplicate by accident. I really hope that doesn't happen.
        assert(len(db_matches) == 1)
        spider.crawler.stats.inc_value('items_already_in_db')
        raise DropItem(
            "Menu item already found in database.\n"
            "Previously scraped on: {previous_scrape_time}".format(
                previous_scrape_time=str(db_matches[0].time_scraped)))
    else:
        return item

def item_completed(self, results, item, info):
    '''
    :param results:
    :param item:
    :param info:
    :return: once all image requests for a single item have finished
        (downloaded or failed), item_completed() is called; items for which
        no image was downloaded successfully are dropped here.
    '''
    spiderName = self.spiderinfo.spider.name
    if spiderName == 'jiandan':
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
    return item

def process_item(self, item, spider):
    if spider.name == 'RssCrawler':
        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except mysql.connector.Error as error:
            self.log.error("Something went wrong in rss query: %s", error)

        # Save the result of the query. Must be done before the add,
        # otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()

        if old_version is not None:
            # Compare the two download dates. index 3 of old_version
            # corresponds to the download_date attribute in the DB
            if (datetime.datetime.strptime(
                    item['download_date'], "%y-%m-%d %H:%M:%S") -
                    old_version[3]) \
                    < datetime.timedelta(hours=self.delta_time):
                raise DropItem("Article in DB too recent. Not saving.")

    return item

def process_item(self, item, spider):
    def raise_if_missing(name, item):
        if name not in item:
            raise DropItem('The required field "{}" is missing in: {}.'.
                           format(name, item))

    # Required fields for all items
    for required in ('id', 'title', 'link'):
        raise_if_missing(required, item)

    # Required fields for FeedEntryItems
    if isinstance(item, FeedEntryItem):
        for required in ('updated',):
            raise_if_missing(required, item)

    return item

def process_item(self, item, spider):
    if not isinstance(item, ProxyItem):
        return item
    if not item.get('ip', None) or not item.get('port', None):
        raise DropItem('Bad ProxyItem')
    item.setdefault('addr', 'Unknown')
    item.setdefault('mode', 'Unknown')
    item.setdefault('protocol', 'http')
    item.setdefault('validation_time', 'Unknown')
    proxy = '{}://{}'.format(item['protocol'], item['proxy'])
    if self.conn.sismember('rookie_proxies', proxy) or\
       self.conn.sismember('available_proxies', proxy) or\
       self.conn.sismember('lost_proxies', proxy) or\
       self.conn.sismember('dead_proxies', proxy):
        raise DropItem('Already in the waiting list')
    key = 'proxy_info:' + item['proxy']
    pipe = self.conn.pipeline(False)
    pipe.sadd('rookie_proxies', proxy)
    pipe.zadd('rookies_checking', item['proxy'], time.time())
    pipe.hmset(key, dict(item))
    pipe.hset(key, 'failed_times', 0)
    pipe.execute()
    return item

def process_item(self, item, spider):
    title = item.get('title', 'title_not_set')
    if title == 'title_not_set':
        err_msg = 'Missing title in: %s' % item.get('url')
        raise DropItem(err_msg)

    raw_content = item.get('raw_content', 'raw_content_not_set')
    if raw_content == 'raw_content_not_set':
        err_msg = 'Missing raw_content in: %s' % item.get('url')
        raise DropItem(err_msg)

    published_at = item.get('published_at', 'published_at_not_set')
    if published_at == 'published_at_not_set':
        err_msg = 'Missing published_at in: %s' % item.get('url')
        raise DropItem(err_msg)

    # Pass item to the next pipeline, if any
    return item

def process_item(self, item, spider):
    try:
        data = {
            'url': item['url'],
            'file_name': item['file_name'],
            'media_type': item['media_type'],
            'host': item['host'],
            'file_dir': item['file_dir'],
            'download': item['download'],
            'extract': item['extract'],
            'info': item['info'],
            'stack': item['stack'],
            'media_urls': item['media_urls'],
        }
        self.col.update({'url': item['url']}, data, upsert=True)
        # self.col.update({'url': item['url']}, {'$set': {'info': item['info']}})
        # self.col.insert(data)
    except Exception as err:
        logging.error(str(err))
        raise DropItem(str(err))
    return item

def __insert_item(self, item=None):
    item, self.items = self.items, item
    item.pop('index', None)
    try:
        data = {
            'url': item['url'],
            'file_name': item['file_name'],
            'media_type': item['media_type'],
            'host': item['host'],
            'file_dir': item['file_dir'],
            'download': item['download'],
            'extract': item['extract'],
            'info': item['info'],
            'stack': item['stack'],
            'media_urls': item['media_urls'],
        }
        self.col.update({'url': item['url']}, data, upsert=True)
        # self.col.insert(data)
    except Exception as err:
        logging.error(str(err))
        raise DropItem(str(err))
    return item

def process_item(self, item, spider):
    if self.site_item_exist(item):
        self.MG_table.insert(dict(item))
        logging.debug("Question added to MongoDB database!")
        # log.msg("Question added to MongoDB database!", level=log.DEBUG, spider=spider)
        '''
        Scrapy provides five logging levels:
        CRITICAL - critical errors
        ERROR    - regular errors
        WARNING  - warning messages
        INFO     - informational messages
        DEBUG    - debugging messages
        The default level is DEBUG.
        '''
    else:
        raise DropItem("{} already exists".format(item['url']))
    return item

def process_item(self, item, spider):
    """Main function that processes the URL item (first phase)."""
    # validate URL length
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncate it! %r', item['raw'])
    # parse raw URL
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        raise DropItem('Offsite domain: %s', item)
    item['site_id'] = site_id
    # insert URL into table
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Fail to insert database of url: %s', item)
    return item

def process_item(self, item, spider):
    """Check if we need to store the item and decide whether to notify."""
    # check if already in the database
    stored = self.jobs_collection.find_one({'url': item['url']})
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        if stored:
            item = stored
            item['times_seen'] += 1
            self.jobs_collection.update(
                {'_id': item['_id']}, dict(item), False)
        else:
            # if not (and if not already set), add date to item
            if not item.get('date_added', False):
                item['date_added'] = datetime.now().isoformat()
            if not item.get('date_posted', False):
                item['date_posted'] = datetime.now().isoformat()
            item['times_seen'] = 0
            self.jobs_collection.insert(item)
    return item

def _convert(self, item, spider):
    image_paths = [im['path'] for im in item['images']]
    datapath = spider.crawler.settings['FILES_STORE']
    image_files = [datapath + path for path in image_paths]
    item['pdf_file'] = '%s.pdf' % item['id']
    dest = '{root}/{spider}/{file}'.format(
        root=datapath,
        spider=item['spider'],
        file=item['pdf_file'],
    )
    print("file:" + dest)
    # Use convert command from ImageMagick.
    cmd = ['convert'] + image_files + [dest]
    try:
        # TODO: capture errors
        subprocess.check_call(cmd, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as detail:
        print(detail)
        raise DropItem("failed to generate PDF")
    return item

def process_item(self, item, spider):
    str = ""
    for e in item["bookinfo"]:
        if re.search(r'^\s*$', e):
            print("drop this element")
        else:
            str = str + e + ","
    item["bookinfo"] = str[:-1]
    if item['name']:
        if item['author']:
            return item
        else:
            raise DropItem("Missing name or author in %s" % item)

def process_item(self, item, spider):
    item_keywords = judge_key_words(item)  # extract the keywords contained in the item
    if item_keywords:  # only keep items that contain keywords
        item["keywords"] = item_keywords
        return item
    else:
        logger = logging.getLogger(spider.name)
        logger.info("No keyword in %s" % item["news_url"])
        raise DropItem("No keyword in %s" % item["news_url"])

def process_item(self, item, spider):
    """Check whether the item's hash is already in item_seen."""
    if item['hash'] in self.item_seen:
        raise DropItem('Duplicate item found: %s' % item)
    else:
        self.item_seen.add(item['hash'])
        return item

def process_item(self, item, spider):
    """Return the item, or drop it if the ip:port is a duplicate.

    :item: crawl item including host port
    :returns: return item or DropItem
    """
    if 'ip' not in item:
        raise DropItem('')
    port = item.get('port', 80)
    host = '%s:%s' % (item['ip'], port)
    if self.conn.sismember(settings.HOST_S, host) or self.dup_in_queue(host):
        raise DropItem('%s, cause duplicate' % (host))
    else:
        return item

def process_item(self, item, spider):
    """Save the host to redis and return the item.

    :item: crawl item including host port
    :returns: return item or DropItem
    """
    if 'ip' not in item:
        raise DropItem('')
    port = item.get('port', 80)
    host = '%s:%s' % (item['ip'], port)
    self.conn.sadd(self.host_s, host)
    return item

def process_item(self, item, spider):
    job_title_company = item['title'] + item['company']
    if job_title_company in self.title_company:
        raise DropItem("Duplicate item found: %s" % (item))
    else:
        self.title_company.add(job_title_company)
        return item

def process_item(self, item, spider):
    valid = True
    print('--' * 40)
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        except:
            print('ggggg' * 40)
    return item

def process_item(self, item, spider):
    print("------")
    if len(item.keys()) >= 5:
        if item in self.has:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.has.add(item)
            return item

# mongodb

def process_item(self, item, spider):
    if item['pid'] in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.ids_seen.add(item['pid'])
        return item

def process_item(self, item, spider):
    if item['link'] in self.seen:
        raise DropItem('Duplicate link %s' % item['link'])
    self.seen.add(item['link'])
    line = json.dumps(dict(item), ensure_ascii=False) + '\n'
    self.file.write(line)
    return item

def process_item(self, item, spider):
    if not re.match('.*comment.*', item['link']):
        if re.match(r'^http.*qq.com.*\.s?html?$', item['link']):
            if item['link'] in self.seen:
                raise DropItem('Duplicate link %s' % item['link'])
            self.seen.add(item['link'])
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            self.file.write(line)
    return item

def process_item(self, item, spider):
    if redis_db.hexists(redis_data_dict, item['link']):
        raise DropItem("Duplicate item found: %s" % item)
    else:
        # print item['link']
        cur = self.conn.cursor()
        add_url = """insert into sohuurl(url) VALUES (%s)"""
        data_url = (str(item['link']),)
        cur.execute(add_url, data_url)
        self.conn.commit()
        cur.close()
        return item

def process_item(self, item, spider):
    if item['link'] in self.seen:
        raise DropItem('Duplicate Link %s' % item['link'])
    self.seen.add(item['link'])
    line = json.dumps(dict(item), ensure_ascii=False) + '\n'
    self.file.write(line)
    return item

def process_item(self, jd_item, JDspider):
    # if not jd_item['flag']:
    #     raise DropItem("item dropped found: %s" % jd_item)
    # else:
    str_line1 = json.dumps(dict(jd_item)) + "\n"
    self.file1.write(str_line1)
    str_line2 = json.dumps(dict(jd_item)) + ',' + '\n'
    self.file2.write(str_line2)
    return jd_item

def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem('Missing {}!'.format(data))
    if valid:
        self.coll.insert(dict(item))
        log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)
    return item

def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        self.collection.insert(dict(item))
        log.msg("Event added to MongoDB database!",
                level=log.DEBUG, spider=spider)
    return item

def process_item(self, item, spider):
    if item['link'] in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.ids_seen.add(item['link'])
        return item

def process_item(self, item, spider):
    if re.search(u'window|??|??|????', item['title'], re.I):
        print("ignore this item")
        raise DropItem("Contains word that you don't want: %s" % item['title'])
    elif re.search(u'window|??|??|????', item['abstract'], re.I):
        print("ignore this item")
        raise DropItem("Contains word that you don't want: %s" % item['abstract'])
    else:
        return item

def process_item(self, item, spider):
    collection_name = item.__class__.__name__
    try:
        self.db[collection_name].insert(dict(item))
    except DuplicateKeyError:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        return item

def process_item(self, item, spider):
    """Drop items not fitting parameters. Open in browser if specified.
    Return accepted items."""
    if self._skip_list and str(item['id']) in self._skip_list:
        raise DropItem('Item in skip list: {}'.format(item['id']))

    if self._minimum_monthly_discount and 'monthly_discount' in item:
        if item['monthly_discount'] < self._minimum_monthly_discount:
            raise DropItem('Monthly discount too low: {}'.format(item['monthly_discount']))

    if self._minimum_weekly_discount and 'weekly_discount' in item:
        if item['weekly_discount'] < self._minimum_weekly_discount:
            raise DropItem('Weekly discount too low: {}'.format(item['weekly_discount']))

    # check regexes
    if self._cannot_have_regex:
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._cannot_have_regex.search(v):
                raise DropItem('Found: {}'.format(self._cannot_have_regex.pattern))

    if self._must_have_regex:
        has_must_haves = False
        for f in self._fields_to_check:
            v = str(item[f].encode('ASCII', 'replace'))
            if self._must_have_regex.search(v):
                has_must_haves = True
                break
        if not has_must_haves:
            raise DropItem('Not Found: {}'.format(self._must_have_regex.pattern))

    # open in browser
    if self._web_browser:
        webbrowser.get(self._web_browser).open(item['url'])

    return item

def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    # item['image_paths'] = image_paths
    return item

def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    return item

def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    if valid:
        try:
            key = {}
            self.db[item['item_name']].insert(dict(item))
            logging.debug("add {}".format(item['item_name']))
        except (pymongo.errors.WriteError, KeyError) as err:
            raise DropItem(
                "Duplicated comment Item: {}".format(item['good_name']))
    return item

def process_item(self, item, spider):
    if item['pid'] in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.ids_seen.add(item['pid'])
        return item

def item_completed(self, results, item, info):
    if info.spider.name == 'sisy':
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
    return item

def process_item(self, item, spider):
    """Drop duplicate items."""
    # WeChat articles
    if isinstance(item, WeChat):
        if self.is_duplicate_wechat(item):
            raise DropItem("Duplicate news found: %s" % item['article_addr'])
        else:
            return item

def process_item(self, item, spider):
    if Redis.exists('ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port'])):
        raise DropItem("Duplicate item found: %s" % item)
    else:
        Redis.set('ip_port:%s:%s' % (datetime.now().strftime("%Y%m%d"), item['ip_port']), 1)
        return item

def __getValue(self, url):
    isHaveManyQueryInUrl = False
    for value in self.valuedict:
        div_by_value = url.split(value.rstrip('\n'))
        mm = div_by_value[0]
        if mm in self.seen:
            raise DropItem('Duplicate link %s' % url)
        elif len(div_by_value) > 1 and not isHaveManyQueryInUrl:
            self.seen.add(mm)
            isHaveManyQueryInUrl = True
            line = url + '\n'
            print(url)
            self.file.write(line)

def process_item(self, item, spider):
    # For the case where something goes wrong
    if item['spider_response'].status != 200:
        # Item is no longer processed in the pipeline
        raise DropItem("%s: Non-200 response" % item['url'])
    else:
        return item