Python scrapy.log module: msg() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use scrapy.log.msg().
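
For orientation, here is a minimal sketch of the call pattern shared by the snippets below. The pipeline class and message text are illustrative only, and note that scrapy.log (and with it log.msg()) was deprecated in Scrapy 1.0 in favour of Python's standard logging module, so these examples apply to older Scrapy releases:

from scrapy import log
from scrapy.exceptions import DropItem

class ExamplePipeline(object):
    """Hypothetical pipeline, used only to illustrate log.msg()."""

    def process_item(self, item, spider):
        if not item.get('id'):
            # severity constants (DEBUG/INFO/WARNING/ERROR) live on scrapy.log
            log.msg('Dropping item without id', level=log.WARNING, spider=spider)
            raise DropItem('Missing id!')
        # message text, a level, and the spider emitting the message
        log.msg('Item %s processed' % item['id'], level=log.INFO, spider=spider)
        return item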

Project: crepriceSpider    Author: zhousenbiao
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {0}!'.format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg('Item added to MongoDB database!', level=log.DEBUG, spider=spider)

        return item

    # def testdb(self):
    #     # connect to MongoHQ
    #     con = pymongo.Connection("paulo.mongohq.com",10042)
    #     db = con.mytest
    #     db.authenticate("root", "sa123")
    #     db.urllist.drop()
Project: ip_proxy_pool    Author: leeyis
def process_item(self, item, spider):
        if len(item['ip_port']):
            a = Proxy(
                ip_port=item['ip_port'],
                type=item['type'],
                level=item['level'],
                location=item['location'],
                speed=item['speed'],
                lifetime=item['lifetime'],
                lastcheck=item['lastcheck'],
                rule_id=item['rule_id'],
                source=item['source']
            )
            session = loadSession()
            try:
                session.merge(a)
                session.commit()
            except MySQLdb.IntegrityError as e:
                log.msg("MySQL Error: %s" % str(e), _level=logging.WARNING)
            return item
        else:
            log.msg("ip_port is invalid!",_level=logging.WARNING)
Project: MonkeyKing_crawler_recommender    Author: BitTigerInst
def process_item(self, item, spider):
        #import pudb; pu.db
        #val = "{0}\t{1}\t{2}\t{3}\t".format(item['appid'], item['title'], item['recommended'], item['intro'])
        #self.file.write('--------------------------------------------\n')
        #self.file.write(val)
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("new app added to MongoDB database!",
                    level=log.DEBUG, spider=spider)

        return item
Project: codePool    Author: LittleLory
def parse_model_selled(self, response):
        log.msg('[parse_selled] %s' % response.url)
        series_id = response.meta['series_id']
        data = json.loads(response.body_as_unicode())
        models = data['Spec']
        count = 0
        for model in models:
            model_id = model['Id']
            model_name = model['Name']
            group = model['GroupName']
            price = model['Price']

            model = ModelItem()
            model['id'] = model_id
            model['name'] = model_name
            model['series_id'] = series_id
            model['group'] = group
            model['price'] = price
            yield model
            count += 1
        log.msg('[parse_selled] model count is %d' % count)
Project: Android-Repackaged-App-Detection-System    Author: M157q
def parse_xpath(self, response, xpath):
        appItemList = []
        sel = Selector(response)
        for url in sel.xpath(xpath).extract():
            url = urljoin(response.url, url)
            log.msg("Catch an application: %s" % url, level=log.INFO)
            appItem = AppItem()
            appItem['url'] = url
            appItemList.append(appItem)
        return appItemList

    #def parse_anzhi(self, response, xpath):
    #    appItemList = []
    #    hxs = HtmlXPathSelector(response)
    #    for script in hxs.select(xpath).extract():
    #        id = re.search(r"\d+", script).group()
    #        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
    #        appItem = AppItem()
    #        appItem['url'] = url
    #        appItemList.append(appItem)
    #    return appItemList
Project: wallstreetcnScrapy    Author: jianzhichun
def _do_upinsert(self, conn, item, spider):
        conn.execute("""SELECT EXISTS(
            SELECT 1 FROM wstable WHERE id = %s
        )""", (item['id'],))
        ret = conn.fetchone()[0]
        uri, title, author, time, description, content, images, view, id1 = self._parseItem(item)
        if ret:
            conn.execute("""
                update wstable set uri = %s, title = %s, author = %s, time1 = %s, description = %s, content = %s, images = %s, view1 = %s where id = %s    
            """, (uri,title,author,time,description,content,images,view,id1))
#             log.msg("""
#                 update wstable set uri = %s, title = %s, author = %s, time1 = %s, description = %s, content = %s, images = %s, view1 = %s where id = %s    
#                 """ % (uri,title,author,time,description,content,images,view,id1))
        else:
#             log.msg("""
#             insert into wstable(id, uri, title, author, time1, description, content, images, view1) 
#             values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
#             """ % (id1,uri,title,author,time,description,content,images,view))
            conn.execute("""
            insert into wstable(id, uri, title, author, time1, description, content, images, view1) 
            values(%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """, (id1,uri,title,author,time,description,content,images,view))
#             log.msg('finished item %s' % item['id'])
            print 'finished item %s' % item['id']
Project: stockSpider    Author: mizhdi
def process_item(self, item, spider):
        if spider.name == 'baiduTopStockSpider':
            collection = self.db[settings['stock']]
            d = dict(item)
            cursor = list(collection.find({'num': d["num"], 'source': d["source"]}))

            if cursor:
                collection.update({'_id': cursor[0]['_id']}, d)
            else:
                collection.insert(d)
            log.msg("stock added to MongoDB database!", level=log.DEBUG, spider=spider)
        elif spider.name == 'xueqiuPostSpider':
            collection = self.db['post']
            collection.save(dict(item))
            log.msg("post added to MongoDB database!", level=log.DEBUG, spider=spider)

        return item
Project: spiders    Author: poodarchu
def parse(self, response):
        """
        default parse method, rule is not useful now
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile
Project: spiders    Author: poodarchu
def determine_level(self, response):
        """
        determine the index level of current response, so we can decide wether to continue crawl or not.
        level 1: people/[a-z].html
        level 2: people/[A-Z][\d+].html
        level 3: people/[a-zA-Z0-9-]+.html
        level 4: search page, pub/dir/.+
        level 5: profile page
        """
        import re
        url = response.url
        if re.match(".+/[a-z]\.html", url):
            return 1
        elif re.match(".+/[A-Z]\d+.html", url):
            return 2
        elif re.match(".+/people-[a-zA-Z0-9-]+", url):
            return 3
        elif re.match(".+/pub/dir/.+", url):
            return 4
        elif re.match(".+/search/._", url):
            return 4
        elif re.match(".+/pub/.+", url):
            return 5
        log.msg("Crawl cannot determine the url's level: " + url)
        return None
Project: Crawlers    Author: mi-minus
def get_last_time(self):
        try:
            self.cu.execute('CREATE TABLE history (time TEXT,result TEXT,spider_name TEXT primary key)')
            last_time="2015-1-1 00:00:00"
        except:
            try:
                self.cu.execute('SELECT time FROM history where spider_name="'+self.spider_name+'"')
                last_time = self.cu.fetchone()[0]
                log.msg('************* '+last_time,level=log.WARNING)
            except:
                last_time="2015-5-1 00:00:00"
                log.msg('************* '+last_time,level=log.WARNING)

        last_time = time.strptime(last_time, '%Y-%m-%d %H:%M:%S')  
        last_time = time.mktime(last_time)     
        return last_time
Project: Crawlers    Author: mi-minus
def insert_new_time(self): 
        if time.mktime(time.strptime(self.item_max_time, '%Y-%m-%d %H:%M:%S')) < time.time():
            if self.sqlite_flag:
                try:
                    log.msg('delete from history where spider_name='+self.spider_name,level=log.WARNING)
                    self.cu.execute('delete from history where spider_name="'+self.spider_name+'"')
                    self.sx.commit() 
                except sqlite3.OperationalError as e:
                    log.msg('__________',level=log.WARNING)
                    pass

                sql = "insert into history values(?,?,?)"
                params = (self.item_max_time,self.item_max_id,self.spider_name)
                self.cu.execute(sql,params)    
                self.sx.commit() 
        self.close_sqlite()
Project: scrapy-cluster    Author: WalnutATiie
def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        if retries <= self.max_retry_times:
            log.msg(format="Retrying %(request)s " \
                            "(failed %(retries)d times): %(reason)s",
                    level=log.DEBUG, spider=spider, request=request,
                    retries=retries, reason=reason)
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            # our priority setup is different from super
            retryreq.meta['priority'] = retryreq.meta['priority'] - 10

            return retryreq
        else:
            log.msg(format="Gave up retrying %(request)s "\
                            "(failed %(retries)d times): %(reason)s",
                    level=log.DEBUG, spider=spider, request=request,
                    retries=retries, reason=reason)
Project: scrapy-cluster    Author: WalnutATiie
def process_item(self,item,spider):
    m = hashlib.md5()
    m.update(item['url'])
    url_MD5 = m.hexdigest()
    content_simhash = Simhash(self.get_features(item['content'])).value
    language = 'en'
    query_json='{"fields":["url_MD5","content_simhash"],"query":{"filtered":{"filter":{"term":{"url_MD5":"'+url_MD5+'"}}}}}'
    es = Elasticsearch(host='192.168.1.14',port=9200,timeout=1000)
    res = es.search(index="hiddenwebs", body=query_json)
    if res['hits']['total'] == 0:
        es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})    
    else:   
        flag = 0
        for hit in res['hits']['hits']:
            #print content_simhash
            #print hit["fields"]["content_simhash"][0]
            if int(hit["fields"]["content_simhash"][0]) == int(content_simhash):
                log.msg('The similar pages in es %s'%(item['url']),level=log.INFO)
                flag = 1
                es.index(index="hiddenwebs", doc_type="hiddenwebpages", id=hit['_id'], body={"create_time":item['create_time']})
                break
        if flag == 0 :
            es.index(index="hiddenwebs", doc_type="hiddenwebpages",body={"url":item['url'],"content":item['content'],"create_time":item['create_time'],"domain_name":item['domain_name'],"url_MD5":url_MD5,"title":item['title'],"content_simhash":content_simhash,"language":language})
Project: findtrip    Author: fankcoder
def process_item(self, item, spider):
        if item['site'] == 'Qua':
            if item['company']:
                item['company'] = wash(item['company'])
            if item['flight_time']:
                item['flight_time'] = wash(item['flight_time'])
            if item['airports']:
                item['airports'] = wash(item['airports'])
            if item['passtime']:
                item['passtime'] = wash(item['passtime'])
            if item['price']:
                item['price'] = wash(item['price'])        
            for data in item:
                if not data:
                    raise DropItem("Missing data!")
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        elif item['site'] == 'Ctrip':
            self.collection.insert(dict(item))
            log.msg("Question added to MongoDB database!",
                    level=log.DEBUG, spider=spider)

        return item
Project: scrapy_sight    Author: wankaiss
def parse(self, response):
        for build in foreigh_7:
            item = SightItem()
            log.msg('build: ' + build, level=log.INFO)
            if baidu_geo_api(build.encode('utf-8')) is not None:
                lng, lat = baidu_geo_api(build.encode('utf-8'))
            else:
                lng, lat = 1, 1
            item['lng'] = lng
            item['lat'] = lat
            item['id_num'] = self.id_num
            self.id_num += 1L
            item['category'] = u'??????'
            item['title'] = build.encode('utf-8')
            pinyin = lazy_pinyin(build)
            item['pinyin'] = ''.join(pinyin).upper()
            if lng == 1 or lat == 1:
                log.msg('no landmark found: ' + 'at line 36,' + build, level=log.INFO)
                continue
            baike_url = 'https://baike.baidu.com/item/%s' % build
            yield scrapy.Request(baike_url, meta={'item': item}, callback=self.content_parse)
Project: scrapy_sight    Author: wankaiss
def content_parse(self, response):
        log.msg('run into content_parse at line 40', level=log.INFO)
        item = response.meta['item']
        result = response.xpath(
            '//div[@class="main-content"]/div[@class="lemma-summary"]/div[@class="para"]').extract()  # lemma summary paragraph
        if len(result) != 0:
            pattern = re.compile(r'<[^>]+>', re.S)
            description = pattern.sub('', result[0]).encode('utf-8')
        else:
            description = 'description_null'
        item['description'] = description
        picture_url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&ic=0&width=0&height=0' % item[
            'title'].decode('utf-8')
        log.msg('picture_url: ' + picture_url, level=log.INFO)
        log.msg('run out content_parse at line 51', level=log.INFO)
        yield scrapy.Request(picture_url, meta={'item': item,
                                                'splash': {
                                                    'endpoint': 'render.html',
                                                    'args': {'wait': 0.5}
                                                }
                                                }, callback=self.picture_parse)
Project: scrapy_sight    Author: wankaiss
def google_geo_api(sight_name):
    sight_name = sight_name.decode('utf-8')
    key = "AIzaSyDJtV9r7rAr9EBwlQ8Rbxvo6e7CkJsLn4k"
    url = "https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=AIzaSyAw-IJpHf6CYtb4OVgrj2MB7pmXlbSs7aY%s" % (sight_name, key)
    print 'url: %s' % url
    response = urllib2.urlopen(url.encode('utf-8'))
    result = response.read()
    json_loads = json.loads(result)
    if json_loads.get('status') == 'OK':
        location = json_loads.get('results')[0].get('geometry').get('location')
        lat = location.get('lat')
        lat = float('%.2f' % lat)
        lng = location.get('lng')
        lng = float('%.2f' % lng)
        print ('lat: %s\r\n lng %s' % (lat, lng))
        return lng, lat
    else:
        log.msg('There is no result about lat and lng')
        return 1, 1
        # json_text = json.loads(result)
        # lng = json_text.get('geometry')
        # print ('lng: %s' % lng)
Project: crawl_web    Author: hanxlinsist
def process_exception(self, request, exception, spider):
        proxy = request.meta['proxy']
        log.msg('Removing failed proxy <%s>, %d proxies left' % (
                    proxy, len(self.proxies)))
        try:
            del self.proxies[proxy]
        except ValueError:
            pass
Project: rental    Author: meihuanyu
def process_item(self, item, spider):
        valid = True
        print '--'*40
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            try:
                self.collection.insert(dict(item))
                log.msg("Question added to MongoDB database!",
                        level=log.DEBUG, spider=spider)
            except:
                print 'ggggg'*40
        return item
Project: Jobs-search    Author: Hopetree
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem('Missing {}!'.format(data))
        if valid:
            self.coll.insert(dict(item))
            log.msg('item added to mongodb database !',level=log.DEBUG,spider=spider)

        return item
Project: FreeFoodCalendar    Author: Yuliang-Zou
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Event added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
Project: FreeFoodCalendar    Author: Yuliang-Zou
def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Event added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
Project: jd_spider    Author: samrayleung
def close_spider(self, spider, reason):
        if self._dump:
            log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()),
                    spider=spider)
        self._persist_stats(self.get_stats(), spider)
Project: ip_proxy_pool    Author: leeyis
def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # randomly pick a user agent from the list
            #print "********Current UserAgent:%s************" %ua
            # log it
            log.msg('Current UserAgent: '+ua, _level=logging.INFO)
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape entries
    # more user agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
Project: ip_proxy_pool    Author: leeyis
def process_request(self, request, spider):
        # Set the location of the proxy
        pro_adr = random.choice(self.proxyList)
        log.msg("Current Proxy <%s>" % pro_adr,_level=logging.INFO)
        request.meta['proxy'] = "http://" + pro_adr
Project: doubanBook    Author: YangShuqing
def process_exception(self, request, exception, spider):
        proxy = request.meta['proxy']
        log.msg('Removing failed proxy <%s>, %d proxies left' % (
                    proxy, len(self.proxies)))
        try:
            del self.proxies[proxy]
        except ValueError:
            pass
Project: scrapy-itemloader    Author: scrapy
def parse_datetime(value):
    try:
        d = parse(value)
    except ValueError:
        log.msg('Unable to parse %s' % value, level=log.WARNING)
        return value
    else:
        return d.isoformat()
Project: scrapy-itemloader    Author: scrapy
def parse_date(value):
    try:
        d = parse(value)
    except ValueError:
        log.msg('Unable to parse %s' % value, level=log.WARNING)
        return value
    else:
        return d.strftime("%Y-%m-%d")
Project: scrappy    Author: DormyMo
def start_listening(self):
        self.port = listen_tcp(self.portrange, self.host, self)
        h = self.port.getHost()
        log.msg(format='Web service listening on %(host)s:%(port)d',
                level=log.DEBUG, host=h.host, port=h.port)
Project: codePool    Author: LittleLory
def parse(self, response):
        book_id = response.url.strip('/').split('/')[-1]
        log.msg('book_id[%s].' % book_id)
        book_name = response.xpath('//title/text()')[0].extract().strip(' (??)')
        bean = BookName()
        bean['book_id'] = book_id
        bean['book_name'] = book_name
        yield bean
Project: codePool    Author: LittleLory
def parse(self, response):
        url = response.url
        log.msg('[url]%s' % url)
        body = response.body
        soup = BeautifulSoup(body, 'lxml').select('.cardetail-infor')[0]
        text = str(self.gettextonly(soup)).decode('utf-8')
        m = re.findall(ur'(????|????|?????|????|????|? ? ?|? ? ?|????|??????)?\n?(.+)\n', text, re.M | re.U)
        map = dict([(d[0], d[1]) for d in m])
        result = SpecItem()
        result['id'] = url.split('/')[-1]
        result['spec'] = map
        yield result
Project: codePool    Author: LittleLory
def readIds(self):

        names = filter(lambda x: 'model' in x and 'json' in x,
                       os.listdir('/Users/king/Work/code/codePool/python/autohome_spider/data'))
        print names
        if not names:
            log.msg('[spec]no model data file in data dir.', log.ERROR)
            return
        model_file_name = names[-1]
        f = codecs.open('/Users/king/Work/code/codePool/python/autohome_spider/data/%s' % model_file_name, 'r')
        ids = [line['id'] for line in json.loads(f.read())]
        log.msg(len(ids), log.INFO)
        return ids
Project: codePool    Author: LittleLory
def parse(self, response):
        log.msg('[parse] %s' % response.url)

        # extract the series IDs, build each series page URL and yield a request for it
        for seriesId in response.xpath('body/dl').re(r'id="s(\d+)"'):
            series_page_url = "http://www.autohome.com.cn/" + seriesId
            log.msg('series_page_url:%s' % series_page_url)
            request = scrapy.Request(url=series_page_url, callback=self.parse_model_selling, dont_filter=True)
            request.meta['series_id'] = seriesId
            yield request

    # parse models currently on sale
Project: taobao    Author: laogewen
def process_item(self, item, spider):
        valid=True
        for data in item:
            if not data:
                valid=False
                raise DropItem('Missing{0}!'.format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg('question added to mongodb database!',
                    level=log.DEBUG,spider=spider)
        return item
Project: crawlBugs    Author: gnahznib
def process_item(self, item, spider):
        for data in item:
            if not data:
                raise DropItem("Missing data!")
        #self.collection.update({'url': item['url']}, dict(item), upsert=True)
        self.collection.insert(dict(item))
        log.msg("Question added to MongoDB database!",
                level=log.DEBUG, spider=spider)
        return None
Project: Douban_Crawler    Author: rafacheng
def process_request(self,request,spider):
        user_agent = UserAgent()
        ua = user_agent.random
        if ua:
            log.msg('Current UserAgent: '+ua, level=log.INFO) 
            request.headers.setdefault('User-Agent', ua)
Project: Android-Repackaged-App-Detection-System    Author: M157q
def process_item(self, item, spider):
        log.msg("Catch an AppItem", level=log.INFO)
        return item
Project: Android-Repackaged-App-Detection-System    Author: M157q
def process_item(self, item, spider):
        try:
            self.conn.execute('insert into apps(url) values(?)',
                        (item['url'],)
                    )
            self.conn.commit()
            log.msg("Inserting into database");
        except sqlite3.IntegrityError:
            print "Duplicated"
        return item
Project: wallstreetcnScrapy    Author: jianzhichun
def process_item(self, item, spider):
        for field in self.required_fields:

            if not item[field]:
#                 log.msg("Field '%s' missing" % (field))
                print "Field '%s' missing" % (field)
                raise DropItem("Field '%s' missing: %r" % (field, item))
        return item
Project: wallstreetcnScrapy    Author: jianzhichun
def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            abpath = '%s/%s/%s/%s' % (spider.name, item['id'][0],item['id'][1],item['id'])
            dir_path = '%s/%s' % (settings['IMAGES_STORE'], abpath)
            if not os.path.exists(dir_path) and len(item['image_urls'])>0:
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                name = image_url.split('/')[-1]
                _i = name.rfind('!')
                if _i > 4:
                    name = name[:_i]
                name = re.sub('\\\|/|:|\*|\?|"|<|>','_',name)
                image_file_name = name[-100:]
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append((image_url, file_path))
                if os.path.exists(file_path):
                    continue
                with open(file_path, 'wb') as handle:
                    try:
                        response = requests.get(image_url, stream=True)
                        for block in response.iter_content(1024):
                            if not block:
                                break
                            handle.write(block)
    #                     log.msg("download img to %s" % file_path)
                    except:
                        continue
            item['images'] = images
            if not images:
                pass
            else:
                _ = images[0][1]
                item['firstimage'] = '%s/%s' % (abpath, _[_.rfind('/')+1:])
                print item['firstimage']
        return item
Project: malspider    Author: ciscocsirt
def scan(html):
        alerts = list()
        matches = HTMLClassifier.yara_rules.match(data=html)
        if not len(matches) > 0:
            return alerts

        for match in matches['html']:
           print match
           alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
           alert_data = "\n".join([s['data'] for s in match['strings']])
           alerts.append((alert_reason, alert_data))
           log.msg("Yara HTML Classification Match: " + alert_reason, level=log.INFO)
        return alerts
Project: malspider    Author: ciscocsirt
def scan(uri):
        alerts = list()
        matches = URLClassifier.yara_rules.match(data=uri.encode('ascii', 'ignore'))
        if not len(matches) > 0:
            return alerts

        for match in matches['urls']:
           alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
           alert_data = "\n".join([s['data'] for s in match['strings']])
           alerts.append((alert_reason, alert_data))
           log.msg("Yara URL Classification Match: " + alert_reason, level=log.INFO)
        return alerts
Project: malspider    Author: ciscocsirt
def scan(js):
        alerts = list()
        matches = JSClassifier.yara_rules.match(data=js.encode('ascii', 'ignore'))
        if not len(matches) > 0:
            return alerts

        for match in matches['js']:
           alert_reason = ", ".join([" ".join(t.split('_')) for t in match['tags']])
           alert_data = "\n".join([s['data'] for s in match['strings']])
           alerts.append((alert_reason, alert_data))
           log.msg("Yara JS Classification Match: " + alert_reason, level=log.INFO)
        return alerts
Project: malspider    Author: ciscocsirt
def process_item(self, item, spider):
        if not type(item) == Alert:
            return item

        uri = item['uri']

        if not uri:
            raise DropItem("Not a valid alert URI: ", uri)

        if spider.custom_whitelist:
            for (pattern) in spider.custom_whitelist:
                if pattern[0] in uri:
                    raise DropItem("Whitelisted domain found in Alert: ", uri)

        if spider.alexa_whitelist:
            try:
                parsed_uri = urlparse(uri)
                parsed_domain = '{uri.netloc}'.format(uri=parsed_uri)
                domain = get_tld(uri)
                for alexa_domain in spider.alexa_whitelist:
                    if domain.endswith(alexa_domain):
                        raise DropItem("Alert domain found in Alexa Whitelist: ", domain)
            except (TldIOError,TldDomainNotFound,TldBadUrl) as e:
                log.msg("Error parsing TLD. Still allowing alert for " + uri, level=log.WARNING)
            except:
                raise

        return item
Project: malspider    Author: ciscocsirt
def spider_opened(self, spider):
        self.conn = MySQLdb.connect(host=settings.MYSQL_HOST, db=settings.MYSQL_DB, user=settings.MYSQL_USER, passwd=settings.MYSQL_PASSWORD, charset='utf8', use_unicode=True)
        cursor = self.conn.cursor()
        sql_str = "SELECT pattern from whitelist"
        cursor.execute(sql_str)
        self.custom_whitelist = cursor.fetchall()
        try:
            alexa_whitelist_file = pkgutil.get_data("malspider", "resources/alexa-1k-whitelist.csv").decode('ascii')
            self.alexa_whitelist = alexa_whitelist_file.splitlines()
        except:
            log.msg("Error loading alexa whitelist...", level=log.ERROR)
Project: malspider    Author: ciscocsirt
def parse_response(self, response):
        page_id = ObjectId()

        analyzer = Analyzer(response)
        alerts = analyzer.inspect_response()
        elems = analyzer.get_resource_elems()
        page = analyzer.get_page_info()

        for alert in alerts:
            alert['org_id'] = self.org
            yield alert

        for elem in elems:
            elem['page_id'] = page_id
            elem['org_id'] = self.org
            yield elem

        page['page_id'] = page_id
        page['org_id'] = self.org
        yield page

        #limit page depth
        if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            return

        for link in LxmlLinkExtractor(unique=True, deny_extensions=list(), allow_domains=self.allowed_domains).extract_links(response):
            if not link.url in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
                self.already_crawled.add(link.url)
                self.pages_crawled = self.pages_crawled + 1
                log.msg("Yielding request for " + link.url, level=log.INFO)
                yield WebdriverRequest(link.url, callback=self.parse_response)
            elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
                log.msg("Reached max crawl depth: " + str(settings.PAGES_PER_DOMAIN), level=log.INFO)
                return
            else:
                log.msg("avoiding duplicate request for: " + link.url, level=log.INFO)
Project: malspider    Author: ciscocsirt
def _download_request(self, request, spider):
        """Download a request URL using webdriver."""
        log.msg('Downloading %s with webdriver' % request.url, level=log.DEBUG)
        request.manager.webdriver.get(request.url)
        #time.sleep(5)
        take_screenshot = getattr(settings, 'TAKE_SCREENSHOT', None)
        screenshot_loc = getattr(settings, 'SCREENSHOT_LOCATION', None)
        if take_screenshot and screenshot_loc:
          screenshot_location = screenshot_loc + str(randint(10000,10000000)) + '.png'
          request.manager.webdriver.save_screenshot(screenshot_location)
          request.meta['screenshot'] = screenshot_location

        request.meta['User-Agent'] = request.headers.get('User-Agent')
        request.meta['Referer'] = request.headers.get('Referer')
        return WebdriverResponse(request.url, request.manager.webdriver)
Project: malspider    Author: ciscocsirt
def _do_action_request(self, request, spider):
        """Perform an action on a previously webdriver-loaded page."""
        log.msg('Running webdriver actions %s' % request.url, level=log.DEBUG)
        request.actions.perform()
        return WebdriverResponse(request.url, request.manager.webdriver)
Project: RealSpider    Author: RealSanqian
def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # randomly pick a user agent from the list
            print "********Current UserAgent:%s************" %ua

            # log it
            log.msg('Current UserAgent: '+ua, level=1)
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape entries
    # more user agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
Project: dytt8project    Author: WiseWolfs
def process_item(self,item,spider):
        for data in item:
            if not data:
                raise DropItem("Missing data!")
        self.collection.update({'url':item['url']},dict(item),upsert=True)
        log.msg("Question added to MongoDB !",level=log.DEBUG,spider=spider)
        return item