The following 22 code examples, extracted from open-source Python projects, illustrate how to use scrapy.log.INFO.
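Before the project-specific examples, here is a minimal sketch of the call pattern they share (the helper name, message text and url variable are illustrative placeholders, not taken from any project below): log.msg() emits the message, and log.INFO is the level constant passed through the level keyword. Note that the scrapy.log module was deprecated in Scrapy 1.0 in favour of Python's standard logging module, so these snippets target pre-1.0 Scrapy.

from scrapy import log

def log_visit(url):
    # Emit an informational message through Scrapy's pre-1.0 logging facility;
    # log.INFO is the level constant understood by log.msg().
    log.msg("Visited %s" % url, level=log.INFO)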
def parse_xpath(self, response, xpath):
    appItemList = []
    sel = Selector(response)
    for url in sel.xpath(xpath).extract():
        url = urljoin(response.url, url)
        log.msg("Catch an application: %s" % url, level=log.INFO)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList

#def parse_anzhi(self, response, xpath):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for script in hxs.select(xpath).extract():
#        id = re.search(r"\d+", script).group()
#        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList
def process_item(self, item, spider):
    m = hashlib.md5()
    m.update(item['url'])
    url_MD5 = m.hexdigest()
    content_simhash = Simhash(self.get_features(item['content'])).value
    language = 'en'
    query_json = ('{"fields":["url_MD5","content_simhash"],'
                  '"query":{"filtered":{"filter":{"term":{"url_MD5":"' + url_MD5 + '"}}}}}')
    es = Elasticsearch(host='192.168.1.14', port=9200, timeout=1000)
    res = es.search(index="hiddenwebs", body=query_json)
    if res['hits']['total'] == 0:
        es.index(index="hiddenwebs", doc_type="hiddenwebpages",
                 body={"url": item['url'], "content": item['content'],
                       "create_time": item['create_time'], "domain_name": item['domain_name'],
                       "url_MD5": url_MD5, "title": item['title'],
                       "content_simhash": content_simhash, "language": language})
    else:
        flag = 0
        for hit in res['hits']['hits']:
            #print content_simhash
            #print hit["fields"]["content_simhash"][0]
            if int(hit["fields"]["content_simhash"][0]) == int(content_simhash):
                log.msg('The similar pages in es %s' % (item['url']), level=log.INFO)
                flag = 1
                es.index(index="hiddenwebs", doc_type="hiddenwebpages", id=hit['_id'],
                         body={"create_time": item['create_time']})
                break
        if flag == 0:
            es.index(index="hiddenwebs", doc_type="hiddenwebpages",
                     body={"url": item['url'], "content": item['content'],
                           "create_time": item['create_time'], "domain_name": item['domain_name'],
                           "url_MD5": url_MD5, "title": item['title'],
                           "content_simhash": content_simhash, "language": language})
def parse(self, response):
    for build in foreigh_7:
        item = SightItem()
        log.msg('build: ' + build, level=log.INFO)
        # Geocode the landmark once and fall back to (1, 1) when nothing comes back.
        coords = baidu_geo_api(build.encode('utf-8'))
        if coords is not None:
            lng, lat = coords
        else:
            lng, lat = 1, 1
        item['lng'] = lng
        item['lat'] = lat
        item['id_num'] = self.id_num
        self.id_num += 1
        item['category'] = u'??????'
        item['title'] = build.encode('utf-8')
        pinyin = lazy_pinyin(build)
        item['pinyin'] = ''.join(pinyin).upper()
        if lng == 1 or lat == 1:
            log.msg('no landmark found: ' + 'at line 36,' + build, level=log.INFO)
            continue
        baike_url = 'https://baike.baidu.com/item/%s' % build
        yield scrapy.Request(baike_url, meta={'item': item}, callback=self.content_parse)
def content_parse(self, response):
    log.msg('run into content_parse at line 40', level=log.INFO)
    item = response.meta['item']
    # Extract the summary paragraph of the Baidu Baike page and strip its HTML tags.
    result = response.xpath(
        '//div[@class="main-content"]/div[@class="lemma-summary"]/div[@class="para"]').extract()
    if len(result) != 0:
        pattern = re.compile(r'<[^>]+>', re.S)
        description = pattern.sub('', result[0]).encode('utf-8')
    else:
        description = 'description_null'
    item['description'] = description
    picture_url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8'
                   '&word=%s&ic=0&width=0&height=0') % item['title'].decode('utf-8')
    log.msg('picture_url: ' + picture_url, level=log.INFO)
    log.msg('run out content_parse at line 51', level=log.INFO)
    yield scrapy.Request(picture_url,
                         meta={'item': item,
                               'splash': {'endpoint': 'render.html',
                                          'args': {'wait': 0.5}}},
                         callback=self.picture_parse)
def baidu_geo_api(sight_name):
    sight_name = sight_name.decode('utf-8')
    ak = 'qsQB3G3zIR1SvZ01bEIAMBHGbCCUhTgm'
    url = 'http://api.map.baidu.com/geocoder/v2/?output=json&address=%s&ak=%s' % (sight_name, ak)
    log.msg('run into baidu_geo_api at line 123, url: ' + url, log.INFO)
    try:
        response = urllib2.urlopen(url.encode('utf-8'))
        result = response.read()
        json_text = json.loads(result)
        if json_text.get('status') != 1:
            lng = json_text.get('result').get('location').get('lng')
            lng = float('%.2f' % lng)
            lat = json_text.get('result').get('location').get('lat')
            lat = float('%.2f' % lat)
            print 'lng: %d, lat: %d' % (lng, lat)
            return lng, lat
        else:
            log.msg('response status is 1 at line 132,' + sight_name, level=log.INFO)
            return 1, 1
    except urllib2.HTTPError as e:
        print 'HttpError in baidu_geo_api at line 40 %s' % e
    except TypeError as e:
        print 'TypeError in baidu_geo_api at line 53 %s' % e
def readIds(self):
    names = filter(lambda x: 'model' in x and 'json' in x,
                   os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
    print names
    if not names:
        log.msg('[spec]no model data file in data dir.', log.ERROR)
        return
    model_file_name = names[-1]
    f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
    ids = [line['id'] for line in json.loads(f.read())]
    log.msg(len(ids), log.INFO)
    return ids
def readIds(self):
    names = filter(lambda x: 'model' in x and 'json' in x,
                   os.listdir('/home/king/code/python_job/autohome_spider/data'))
    print names
    if not names:
        log.msg('[spec]no model data file in data dir.', log.ERROR)
        return
    model_file_name = names[-1]
    f = codecs.open('/home/king/code/python_job/autohome_spider/data/%s' % model_file_name, 'r')
    ids = [line['id'] for line in json.loads(f.read())]
    log.msg(len(ids), log.INFO)
    return ids
def process_request(self, request, spider):
    user_agent = UserAgent()
    ua = user_agent.random
    if ua:
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)
def process_item(self, item, spider):
    log.msg("Catch an AppItem", level=log.INFO)
    return item
def scan(html):
    alerts = list()
    matches = HTMLClassifier.yara_rules.match(data=html)
    if not len(matches) > 0:
        return alerts
    for match in matches['html']:
        print match
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara HTML Classification Match: " + alert_reason, level=log.INFO)
    return alerts
def scan(uri):
    alerts = list()
    matches = URLClassifier.yara_rules.match(data=uri.encode('ascii', 'ignore'))
    if not len(matches) > 0:
        return alerts
    for match in matches['urls']:
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara URL Classification Match: " + alert_reason, level=log.INFO)
    return alerts
def scan(js):
    alerts = list()
    matches = JSClassifier.yara_rules.match(data=js.encode('ascii', 'ignore'))
    if not len(matches) > 0:
        return alerts
    for match in matches['js']:
        alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
        alert_data = "\n".join([s['data'] for s in match['strings']])
        alerts.append((alert_reason, alert_data))
        log.msg("Yara JS Classification Match: " + alert_reason, level=log.INFO)
    return alerts
def parse_response(self, response):
    page_id = ObjectId()
    analyzer = Analyzer(response)
    alerts = analyzer.inspect_response()
    elems = analyzer.get_resource_elems()
    page = analyzer.get_page_info()
    for alert in alerts:
        alert['org_id'] = self.org
        yield alert
    for elem in elems:
        elem['page_id'] = page_id
        elem['org_id'] = self.org
        yield elem
    page['page_id'] = page_id
    page['org_id'] = self.org
    yield page

    # limit page depth
    if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
        return
    for link in LxmlLinkExtractor(unique=True, deny_extensions=list(),
                                  allow_domains=self.allowed_domains).extract_links(response):
        if link.url not in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
            self.already_crawled.add(link.url)
            self.pages_crawled = self.pages_crawled + 1
            log.msg("Yielding request for " + link.url, level=log.INFO)
            yield WebdriverRequest(link.url, callback=self.parse_response)
        elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            log.msg("Reached max crawl depth: " + str(settings.PAGES_PER_DOMAIN), level=log.INFO)
            return
        else:
            log.msg("avoiding duplicate request for: " + link.url, level=log.INFO)
def info(msg):
    log.msg(str(msg), level=log.INFO)
def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Spider closed: %s" % spider, level=log.INFO)
    RUNNING_CRAWLERS.remove(spider)
    if not RUNNING_CRAWLERS:
        reactor.stop()
def open(self, spider):
    super(RecorderScheduler, self).open(spider)
    self.stats_manager = StatsManager(spider.crawler.stats)
    settings = spider.crawler.settings
    self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED)
    if not self.recorder_enabled:
        log.msg('Recorder disabled!', log.WARNING)
        return
    log.msg('Starting recorder', log.INFO)
    recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None)
    if not recorder_storage:
        self.recorder_enabled = False
        log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING)
        return
    self.graph = graphs.Manager(
        engine=recorder_storage,
        drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES',
                                         DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES),
        clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT',
                                       DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT))
def close(self, reason):
    super(RecorderScheduler, self).close(reason)
    if self.recorder_enabled:
        log.msg('Finishing recorder (%s)' % reason, log.INFO)
        pages = self.graph.session.query(graphs.Page).filter_by(status=None).all()
        for page in pages:
            n_deleted_links = self.graph.session.query(graphs.Relation).filter_by(child_id=page.id).delete()
            if n_deleted_links:
                self.stats_manager.remove_links(n_deleted_links)
        n_deleted_pages = self.graph.session.query(graphs.Page).filter_by(status=None).delete()
        if n_deleted_pages:
            self.stats_manager.remove_pages(n_deleted_pages)
        self.graph.save()
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)
    if ua:
        # randomly chosen User-Agent for this request
        print "********Current UserAgent: %s************" % ua
        log.msg('Current UserAgent: ' + ua, log.INFO)
        request.headers.setdefault('User-Agent', ua)

# The default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape strings.
# For more user-agent strings, see http://www.useragentstring.com/pages/useragentstring.php
def process_request(self, request, spider):
    ua = random.choice(self.user_agent_list)
    if ua:
        # randomly chosen User-Agent for this request
        # log.INFO("********Current UserAgent:%s************".format(ua))  # wrong: log.INFO is a level constant, not a callable
        log.msg('Current UserAgent: ' + ua, level=log.INFO)
        request.headers.setdefault('User-Agent', ua)

# The default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape strings.
# For more user-agent strings, see http://www.useragentstring.com/pages/useragentstring.php
def parse(self, response):
    response_domain = urlparse(response.url).netloc
    appItemList = []
    cookie = {}
    xpath_rule = self.scrape_rules['xpath']
    for key in xpath_rule.keys():
        if key in response_domain:
            appItemList.extend(self.parse_xpath(response, xpath_rule[key]))
            break
    custom_parser_rule = self.scrape_rules['custom_parser']
    for key in custom_parser_rule.keys():
        if key in response_domain:
            appItemList.extend(getattr(custom_parser, custom_parser_rule[key])(response))
            break
    #if "appchina" in response_domain:
    #    xpath = "//a[@id='pc-download' and @class='free']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "hiapk" in response_domain:
    #    xpath = "//a[@class='linkbtn d1']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "android.d.cn" in response_domain:
    #    xpath = "//a[@class='down']/@href"
    #    appItemList.extend(self.parse_xpath(response, xpath))
    #elif "anzhi" in response_domain:
    #    xpath = "//div[@id='btn']/a/@onclick"
    #    appItemList.extend(self.parse_anzhi(response, xpath))
    #else:
    #    pass
    sel = Selector(response)
    for url in sel.xpath('//a/@href').extract():
        url = urljoin(response.url, url)
        yield Request(url, meta=cookie, callback=self.parse)
    for item in appItemList:
        yield item

#def parse_appchina(self, response):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for url in hxs.select(
#            "//a[@id='pc-download' and @class='free']/@href").extract():
#        url = urljoin(response.url, url)
#        log.msg("Catch an application: %s" % url, level=log.INFO)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList
def picture_parse(self, response):
    log.msg('run into picture_parse at line 66', level=log.INFO)
    item = response.meta['item']
    host_address = 'http://image.baidu.com'
    path = response.xpath('//*[@id="page"]/a[10]/@href').extract_first()
    url = host_address.encode('utf-8') + path
    page_num = response.xpath('//*[@id="page"]/strong/span/text()').extract_first()
    log.msg('page_num is %s' % page_num, level=log.INFO)
    for option in response.xpath('//div[@id="imgid"]/ul[@class="imglist"]/li[@class="imgitem"]'):
        item_final = SightItem()
        item_final['title'] = item['title']
        item_final['lng'] = item['lng']
        item_final['lat'] = item['lat']
        item_final['description'] = item['description']
        item_final['category'] = item['category']
        img_src = option.xpath('a/@href').extract_first()
        result = re.search(r'.*objurl=(http.*?)&.*', img_src).groups()[0]
        img_src = urllib.unquote(result).encode('utf-8')
        item['url'] = img_src
        print 'img_src: %s ========================****==============' % img_src
        img_url = jpg_test(img_url=img_src)
        print 'function jpg_test img_url is: %s ****************************' % img_url
        # if img_url is not None:
        try:
            print 'id_num: %s' % item['id_num']
            save_img(img_url=img_url, id_num=item['id_num'])
        except TypeError as e:
            log.msg('img url is NoneType in function picture_parse at line 103: {0}'.format(e), level=log.INFO)
        if img_src is None or len(img_src) == 0:
            item['url'] = 'url_null'
            log.msg('img_src is null==============' + img_src, level=log.INFO)
        item_final['url'] = item['url']
        log.msg('img_src in line 61***********' + img_src + '; type: %s ' % type(img_src), log.INFO)
        log.msg('run out picture_parse at line 92', level=log.INFO)
        yield item
    if path and page_num < PAGE_NUM:
        log.msg('***************path**************\r\n' + path, level=log.INFO)
        yield scrapy.Request(url,
                             meta={'item': item,
                                   'splash': {'endpoint': 'render.html',
                                              'args': {'wait': 0.5}}},
                             callback=self.picture_parse)

# def next_page_parse(self, response):