Python scrapy.http module: HtmlResponse() usage examples

The following code examples, collected from open-source Python projects, illustrate how to use scrapy.http.HtmlResponse().
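
Before the project examples, here is a minimal sketch of the pattern they all share: wrapping a raw HTML string (or a browser's page_source) in an HtmlResponse so that Scrapy's XPath/CSS selectors can run on it. The url and markup below are placeholders.

from scrapy.http import HtmlResponse

# Wrap a raw HTML string in a response object; nothing is fetched over the network.
response = HtmlResponse(
    url='http://example.com',  # only stored as metadata on the response
    body='<html><body><a href="/a">first link</a></body></html>',
    encoding='utf-8',
)
print(response.xpath('//a/@href').extract())    # ['/a']
print(response.css('a::text').extract_first())  # 'first link'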

Project: ArticleSpider    Author: mtianyan    | Project source | File source
def process_request(self, request, spider):
        if spider.name == "jobbole":
            self.browser.get(request.url)
            import time
            time.sleep(3)
            print ("??:{0}".format(request.url))

            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8", request=request)

# On Linux, run the browser inside a virtual display (headless):

# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(800, 600))
# display.start()
#
# browser = webdriver.Chrome()
# browser.get()
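
For a Selenium middleware like the one above to take effect, it has to be enabled in the project's settings.py. A minimal sketch; the module path and priority below are assumptions, not taken from the project:

# settings.py (module path and priority are hypothetical)
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSPageMiddleware': 543,
}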
Project: Scrapy-BenchCLI    Author: Parth-Vader    | Project source | File source
def main():
    start = timer()

    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    for filename in glob.glob('sites/*'):
        with io.open(filename, "r", encoding="utf-8") as f:
            html = f.read()

        r3 = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(r3)
        total = total + len(links)
    end = timer()
    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(end - start))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / (end - start))), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / (end - start)))))
Project: landchina-spider    Author: sundiontheway    | Project source | File source
def process_request(self, request, spider):
        if 'PhantomJS' in request.meta:
            log.debug('PhantomJS Requesting: %s' % request.url)
            ua = None
            try:
                ua = UserAgent().random
            except Exception:
                ua = 'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'

            webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = ua

            try:
                self.driver.get(request.url)
                content = self.driver.page_source.encode('utf-8')
                url = self.driver.current_url.encode('utf-8')
            except Exception:
                return HtmlResponse(request.url, encoding='utf-8', status=503, body='')

            if content == '<html><head></head><body></body></html>':
                return HtmlResponse(request.url, encoding='utf-8', status=503, body='')
            else:
                return HtmlResponse(url, encoding='utf-8', status=200, body=content)

        else:
            log.debug('Common Requesting: %s' % request.url)
Project: PyCrawler    Author: KillersDeath    | Project source | File source
def goodsUrlList(home_url):
    '''
    Collect the detail-page URLs of all goods under a category home page.
    :param home_url: e.g. http://www.vipmro.com/search/?&categoryId=501110
    :return: list of goods detail-page URLs
    '''
    # Enumerate every combination of the optional filter attributes
    all_group_list = parseOptional(home_url)
    # Collect the detail-page url of each goods item
    url_list = []
    for url in all_group_list:
        # url = 'http://www.vipmro.com/search/?ram=0.9551325197768372&categoryId=501110&attrValueIds=509805,509801,509806,509807'
        # Fetch the JS-rendered HTML
        home_page = getHtmlFromJs(url)['content'].encode('utf-8')
        html = HtmlResponse(url=url, body=str(home_page))
        urls = html.selector.xpath('/html/body/div[7]/div[1]/ul/li/div[2]/a/@href').extract()
        url_list.extend(urls)
    #     print(len(urls))
    #     print(urls)
    #     exit()
    # print(len(url_list))
    # print(url_list)
    return url_list
Project: PyCrawler    Author: KillersDeath    | Project source | File source
def parseOptional(url):
    '''
    Expand a category url into the urls for every combination of its optional filter attributes.
    :param url: http://www.vipmro.com/search/?&categoryId=501110
    :return:['http://www.vipmro.com/search/?categoryId=501110&attrValueIds=509801,512680,509807,509823']
    '''
    # Fetch the JS-rendered HTML
    home_page = getHtmlFromJs(url)['content'].encode('utf-8')
    html = HtmlResponse(url=url, body=str(home_page))
    # Series (xi_lie) filter values
    xi_lie = html.selector.xpath('/html/body/div[5]/div[6]/ul/li/a/@href').re(r'ValueIds=(\d+)')
    # Breaking-capacity (fen_duan) filter values
    fen_duan = html.selector.xpath('/html/body/div[5]/div[10]/ul/li/a/@href').re(r'ValueIds=(\d+)')
    # Trip-unit (tuo_kou_qi) filter values
    tuo_kou_qi = html.selector.xpath('/html/body/div[5]/div[14]/ul/li/a/@href').re(r'ValueIds=(\d+)')
    # Mounting-type (an_zhuang) filter values
    an_zhuang = html.selector.xpath('/html/body/div[5]/div[12]/ul/li/a/@href').re(r'ValueIds=(\d+)')
    # Cartesian product of all filter groups
    all_group = list(itertools.product(xi_lie,fen_duan,tuo_kou_qi,an_zhuang))
    _url = url + '&attrValueIds='
    url_list = map(lambda x:_url+','.join(list(x)),all_group)

    return url_list
Project: EasyCrawler    Author: playwolf719    | Project source | File source
def process_request(self, request, spider):
        try:
            driver = webdriver.PhantomJS()  # launch a headless PhantomJS browser
            # driver = webdriver.Firefox()
            print "---"+str(request.meta["page"])+"-----js url start-------"
            print datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            driver.get(self.pc_index_url+"&page="+str(request.meta["page"]) )
            # time.sleep(1)
            tmp=driver.find_element_by_id('sf-item-list-data').get_attribute("innerHTML")
            print "---"+str(request.meta["page"])+"-----js url end-------"
            print datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            body = tmp
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        except Exception,e:
            print "-------------------"
            print e.__doc__
            print e.message
            print "-------------------"
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def intohotel(self,Links):

        url = "http://hotels.ctrip.com/" + Links
        self.driver.get(url)
        self.driver.maximize_window()
        self.driver.implicitly_wait(80)
        time.sleep(3)
        response = HtmlResponse(url="my HTML string",body=self.driver.page_source,encoding="utf-8")
        # Crawl the comment details (currently disabled)
        # self.crawlcommentinfo(commentnum)
        # Crawl the hotel info
        try:
            items = self.crawlhotelinfo(response,url)
        except:
            items = self.crawlhotelinfo2(response,url)
        # Persist the hotel comments
        self.xiechengDao.savehotelComment(items)



    # Crawl the links (lianjie) on the hotel search-result page
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def __crawllianjie(self,page_sourse):
        response = HtmlResponse(url="my HTML string",body=page_sourse,encoding="utf-8")
        hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
        for hotel in hotel_list:
            url = hotel.xpath("li[@class='searchresult_info_name']/h2/a/@href").extract()[0]
            address = hotel.xpath("li[@class='searchresult_info_name']/p[@class='searchresult_htladdress']/text()").extract()[0]
            commnum = hotel.xpath("li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()").extract()
            if len(commnum):
                commnum = re.sub('\D','',commnum[0])
                commnum = commnum if len(commnum)>0 else 0
            else:
                commnum = 0
            name = hotel.xpath("li[@class='searchresult_info_name']/h2/a/text()").extract()[0]
            self.listPageInfo.append({
                "guid": uuid.uuid1(),
                "url": url,
                "hotel_name": name,
                "OTA": self.__ota_info,
                "comm_num": int(commnum),
                "address": address
            })
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def __parseUrls(self, page_source):
        response = HtmlResponse(url="my HTML string",body=page_source,encoding="utf-8")
        # Collect each hotel's url and metadata into listPageInfo
        url_list = response.xpath("//a[@class='name']/@href").extract()
        comment_number_list = response.xpath("//div[@class='comment']/a/span/text()").extract()
        name_list = response.xpath("//a[@class='name']/text()").extract()
        address_list = response.xpath("//span[@class='address']/text()").extract()
        if len(url_list) == len(comment_number_list) == len(name_list) == len(address_list):
            for i in range(0, len(url_list)):
                self.listPageInfo.append({
                    "guid": uuid.uuid1(),
                    "url": url_list[i],
                    "hotel_name": name_list[i],
                    "OTA": "??",
                    "comm_num": int(comment_number_list[i]),
                    "address": address_list[i]
                })
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def __parseUrls(self,page_source):
        response = HtmlResponse(url="My HTML String",body=page_source,encoding="utf-8")
        hotel_list = response.xpath("//div[@class='h_list']/div[@class='h_item']")
        for hotel in hotel_list:
            url = hotel.xpath(".//p[@class='h_info_b1']/a/@href").extract()[0]
            name = hotel.xpath(".//p[@class='h_info_b1']/a/@title").extract()[0]
            address = hotel.xpath(".//p[@class='h_info_b2']/text()").extract()[1]
            commnum = hotel.xpath(".//div[@class='h_info_comt']/a/span[@class='c555 block mt5']/b/text()").extract()
            if len(commnum) == 0:
                commnum = 0
            else:
                commnum = commnum[0]
            self.listPageInfo.append({
                "guid": uuid.uuid1(),
                "url": url,
                "hotel_name": name,
                "OTA": self.__ota_info,
                "comm_num": commnum,
                "address": address
            })
Project: Spider    Author: poluo    | Project source | File source
def process_request(self, request, spider):
        try:
            selenium_enable = request.meta.get('selenium')
        except Exception as e:
            log.info(e)
            selenium_enable = False
        if selenium_enable:
            self.driver.get(request.url)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#js-fans-rank > div > div.f-con > div.f-cn.cur > ul > li> a"))
            )
            body = self.driver.page_source
            response = HtmlResponse(url=self.driver.current_url, body=body, request=request, encoding='utf8')
            return response
        else:
            request.headers[
                'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
            request.headers[
                'Accept'] = '*/*'
            request.headers['Accept-Encoding'] = 'gzip, deflate, sdch, br'
            request.headers['Accept-Language'] = 'zh-CN,zh;q=0.8,zh-TW;q=0.6'
            request.headers['Connection'] = 'keep-alive'
            request.headers['Host'] = 'www.douyu.com'
            request.headers['Upgrade-Insecure-Requests'] = 1

            try:
                cookies_enable = request.meta.get('cookies')
            except Exception as e:
                log.info(e)
                cookies_enable = False
            if cookies_enable:
                del request.headers['Upgrade-Insecure-Requests']
                request.headers['DNT'] = '1'
                request.headers['X-Requested-With'] = 'XMLHttpRequest'
                request.headers['referer'] = request.meta['referer']
                self.cookies['_dys_lastPageCode'] = request.meta.get('_dys_lastPageCode')
                self.cookies['_dys_refer_action_code'] = request.meta.get('_dys_refer_action_code')
                request.cookies = self.cookies
Project: Spider    Author: poluo    | Project source | File source
def process_request(self, request, spider):
        if self.use_selenium(request.url):
            if self.use_proxy():
                if self._count > 20:
                    self.update_driver()
                    self._count = 0
                    log.info('update driver')
            # process_request must *return* the Response; with yield this method
            # becomes a generator and Scrapy would ignore its value.
            return HtmlResponse(request.url, encoding='utf-8', body=self.driver.page_source.encode('utf8'))
Project: tianyancha    Author: Range0122    | Project source | File source
def process_request(self, request, spider):
        # url[26] == 'c' marks a company-detail page (http://www.tianyancha.com/company/...)
        if request.url[26] == 'c':
            ua = random.choice(self.user_agent_list)
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = ua
            dcap["phantomjs.page.settings.loadImages"] = False
            driver = webdriver.PhantomJS(executable_path=r'E:\Webdriver\phantomjs-2.1.1-windows\bin\phantomjs.exe',
                                         desired_capabilities=dcap)
            driver.get(request.url)
            sleep_time = random.randint(15, 22)
            time.sleep(sleep_time)
            try:
                detail = driver.find_element_by_xpath('//a[@ng-click="showDetail = btnOnClick(showDetail)"]')
                detail.click()
            except Exception:
                pass
            body = driver.page_source
            url = driver.current_url
            driver.quit()
            return HtmlResponse(url=url, body=body, request=request, encoding='utf-8')
Project: fintech_spider    Author: hee0624    | Project source | File source
def process_request(self, request, spider):
        if spider.name == "gsxt":
            # print("PhantomJS is starting...")
            # driver = webdriver.PhantomJS(r"/home/lxw/Downloads/phantomjs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs")   # OK
            driver = webdriver.Chrome(r"/home/lxw/Software/chromedirver_selenium/chromedriver") # OK

            """
            # Using IP Proxies:
            # Chrome has to be given the proxy IP before the browser session starts,
            # otherwise the setting has no effect. DesiredCapabilities carries the
            # proxy config; start_session() then routes subsequent urls through it.
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            req = requests.get("http://datazhiyuan.com:60001/plain", timeout=10)
            print("Get an IP proxy:", req.text)

            if req.text:
                proxy.http_proxy = req.text  # "1.9.171.51:800"
            # Apply the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
            """

            driver.get(request.url) # Load the target page (e.g. http://roll.news.qq.com/)
            time.sleep(2)
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)   # ???js????????????????????
            time.sleep(3)
            body = driver.page_source
            print("??" + request.url)
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        else:
            return
Project: scrapy-cluster    Author: WalnutATiie    | Project source | File source
def do_test(self, meta_object,
                            text, expected_raw, expected_requests):
        request = Request(url='http://www.drudgereport.com',
                        meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request)

        raw_item_count = 0
        request_count = 0

        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count = raw_item_count + 1
            elif isinstance(x, Request):
                request_count = request_count + 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests)
Project: GuShiWen    Author: zhouzhaoxin    | Project source | File source
def detail_translate_note(self, all_url, itemi):
        for url in all_url:
            url = self.site_domain + url
            print('detail_translate_note url %s' % url)
            html_requests = requests.get(url).text.encode('utf-8')
            html_response = HtmlResponse(url=url, body=html_requests, headers={'Connection': 'close'})
            html_all = Selector(html_response)
            itemi['detail_translate_note_text_title'] = html_all.xpath(
                '//div[@class="main3"]/div[@class="shileft"]/div[@class="son1"]/h1/text()').extract()
            itemi['detail_translate_text'] = html_all.xpath(
                '//div[@class="main3"]/div[@class="shileft"]/div[@class="shangxicont"]/p[not(@style)]/descendant-or-self::text()').extract()
            item_list_temp = []
            for item_list in itemi['detail_translate_text']:
                temp = item_list.encode('utf-8')
                temp = re.sub(r'\"', "“", temp)
                item_list_temp.append(temp)
            itemi['detail_translate_text'] = item_list_temp
        pass

Project: collectors    Author: opentrials    | Project source | File source
def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
        meta = {
            'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
            'original_cookies': {
                'foo': 'bar',
            },
        }
        mock_response = HtmlResponse(url=url)
        mock_response.request = Request(url, meta=meta)

        with mock.patch('random.random', return_value='random_cookiejar'):
            spider = Spider()
            request = spider.parse_drug_details_or_overview(mock_response)

        assert request.url == meta['original_url']
        assert request.cookies == meta['original_cookies']
        assert request.dont_filter
        assert request.callback == spider.parse_drug_details_or_overview
        assert request.meta['cookiejar'] == 'random_cookiejar'
Project: collectors    Author: opentrials    | Project source | File source
def get_url(betamax_session):
    def _get_url(url, request_kwargs={}):
        '''Returns a scrapy.http.HtmlResponse with the contents of the received
        url.

        Note that the session is kept intact among multiple calls to this
        method (i.e. cookies are passed over).

        We also don't verify SSL certificates, because Takeda's certificate is
        invalid. If they become valid, we can resume verifying the
        certificates.
        '''
        response = betamax_session.get(url, verify=False)
        scrapy_response = HtmlResponse(
            url=str(response.url),
            body=response.content,
        )
        scrapy_response.request = Request(url, **request_kwargs)

        return scrapy_response
    return _get_url
Project: badoo_scrapy_splash_redis    Author: Supe2015    | Project source | File source
def test_form_request_from_response():
    # Copied from scrapy tests (test_from_response_submit_not_first_clickable)
    def _buildresponse(body, **kwargs):
        kwargs.setdefault('body', body)
        kwargs.setdefault('url', 'http://example.com')
        kwargs.setdefault('encoding', 'utf-8')
        return HtmlResponse(**kwargs)
    response = _buildresponse(
        """<form action="get.php" method="GET">
        <input type="submit" name="clickable1" value="clicked1">
        <input type="hidden" name="one" value="1">
        <input type="hidden" name="two" value="3">
        <input type="submit" name="clickable2" value="clicked2">
        </form>""")
    req = SplashFormRequest.from_response(
        response, formdata={'two': '2'}, clickdata={'name': 'clickable2'})
    assert req.method == 'GET'
    assert req.meta['splash']['args']['url'] == req.url
    fs = cgi.parse_qs(req.url.partition('?')[2], True)
    assert fs['clickable2'] == ['clicked2']
    assert 'clickable1' not in fs
    assert fs['one'] == ['1']
    assert fs['two'] == ['2']
Project: Broad_Crawler    Author: rafacheng    | Project source | File source
def extractLinks(self, response):
        retv = []
        link_extractor = LinkExtractor()
        if isinstance(response, HtmlResponse):
            links = link_extractor.extract_links(response)
            for link in links:
                if self.postfix in link.url:
                    retv.append(link.url)
        return retv
Project: PyCrawler    Author: KillersDeath    | Project source | File source
def goodsDetail(detail_url):
    '''
    Extract the goods details from a detail page with XPath.
    :param detail_url: goods detail-page url
    :return: goods data as a dict
    '''
    goods_data = defaultdict()
    # Source url
    goods_data['source_url'] = detail_url
    # Fetch the html body and coerce it to str
    body = getHtmlFromJs(detail_url)['content'].encode('utf-8')
    html = HtmlResponse(url=detail_url, body=str(body))
    # Name
    goods_data['name'] = html.xpath('/html/body/div[7]/div[2]/h1/text()').extract()[0]
    # Price
    goods_data['price'] = html.selector.xpath('/html/body/div[7]/div[2]/div[2]/ul/li[1]/label[1]/text()').extract()[0]
    # Type
    goods_data['type'] = html.selector.xpath('/html/body/div[7]/div[2]/div[2]/ul/li[3]/label/text()').extract()[0]
    # Detail table
    goods_data['detail'] = html.selector.xpath('/html/body/div[9]/div[2]/div[2]/table').extract()[0]
    # Pictures
    pics = []
    for pic in html.selector.xpath('/html/body/div[7]/div[1]/div[2]/div[2]/ul/li/img'):
        # Strip the thumbnail suffix to get the full-size image
        pics.append(pic.xpath('@src').extract()[0].replace('!240240',''))
    goods_data['pics'] = '|'.join(pics)
    goods_data['storage'] = ''
    goods_data['lack_period'] = ''
    goods_data['created'] = int(time.time())
    goods_data['updated'] = int(time.time())

    # print(goods_data['detail'])
    return goods_data
Project: open-pension-crawler    Author: nirgn975    | Project source | File source
def process_request(self, request, spider):
        # driver = webdriver.Firefox(executable_path="/Users/roysegall/geckodriver")
        driver = webdriver.PhantomJS(executable_path='/Users/roysegall/phantomjs')
        driver.get(request.url)
        return HtmlResponse(request.url, encoding='utf-8', body=driver.page_source.encode('utf-8'))
Project: scrapy-training    Author: scrapinghub    | Project source | File source
def process_request(self, request, spider):
        if request.meta.get('nojs'):
            # disable js rendering in a per-request basis
            return
        self.driver.get(request.url)
        content = self.driver.page_source
        return HtmlResponse(request.url, body=content, encoding='utf-8')
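A hedged sketch of how a spider would use the middleware above to skip JS rendering for a single request; the callback name is hypothetical:

# Opt out of JS rendering on a per-request basis via request.meta
yield scrapy.Request(url, callback=self.parse_static, meta={'nojs': True})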
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def pageHandler_comment(self,page_source,pageNum,userID,weiboID):
        response = HtmlResponse(url="my HTML string",body=page_source,encoding="utf-8")
        if pageNum==1:
            pass
        items = self.__getCommentItems(response,pageNum,userID,weiboID)
        if len(items)>0:
            self.weiboDao.saveWeiboComment(items)

    # Parse the hotel comments on one page
Project: ugc.aggregator    Author: Dreamcatcher-GIS    | Project source | File source
def __parseHotelComment(self, page_source, hotel_id, comm_type):
        response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
        remarkDom = response.xpath("//div[@class='user_remark_datail']")
        remarkDomLen = len(response.xpath("//div[@class='user_remark_datail']/div"))
        # Count comments already collected, to detect a page of duplicates
        same_num = 0
        for i in range(1, remarkDomLen+1):
            id = uuid.uuid1()
            # Username
            username = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b2']/text()"%i).extract()
            username = username[0] if len(username) > 0 else ""
            # Comment text
            remarkText = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b2']/p/text()"%i).extract()
            remark = ""
            for str in remarkText:
                remark = remark + re.sub("\s+", "", str)
            # Comment time
            comm_time = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b4']/div[@style='float: right;']/text()"%i).extract()[0]
            # User type, plus NLP-derived sentiment and viewpoint
            user_type = ""
            senti_value = None
            viewpoint = None
            try:
                user_type = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b3']/text()"%i).extract()[0]
                senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
                viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"),decoding="utf-8"))
            except:
                traceback.print_exc()
            comm = {"guid":id, "username":username, "remark":remark, "comm_time":comm_time, "user_type":user_type, "hotel_id":hotel_id, "comm_type":comm_type, "senti_value":senti_value, "viewpoint":viewpoint}
            if self.__is_exist_in_comment_list(comm):
                same_num += 1
            else:
                self.commList.append(comm)
        if same_num == remarkDomLen:
            return False
        else:
            return True
Project: Scrapy-BenchCLI    Author: Parth-Vader    | Project source | File source
def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r
Project: feeds    Author: nblock    | Project source | File source
def parse(self, response):
        # Wiener Linien returns HTML with an XML content type which creates an
        # XmlResponse.
        response = HtmlResponse(url=response.url, body=response.body)
        for item in response.css('.block-news-item'):
            il = FeedEntryItemLoader(response=response,
                                     timezone=self._timezone,
                                     base_url='http://{}'.format(self.name))
            link = response.urljoin(item.css('a::attr(href)').extract_first())
            il.add_value('link', link)
            il.add_value('title', item.css('h3::text').extract_first())
            il.add_value('updated', item.css('.date::text').extract_first())
            yield scrapy.Request(link, self.parse_item, meta={'il': il})
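A slightly more idiomatic way to re-type the response above is Response.replace, which keeps headers and other attributes intact; a minimal sketch:

# Re-wrap the XmlResponse as an HtmlResponse without copying fields by hand
response = response.replace(cls=HtmlResponse)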
Project: decoration-design-crawler    Author: imflyn    | Project source | File source
def process_request(self, request, spider):
        if JAVASCRIPT in request.meta and request.meta[JAVASCRIPT] is True:
            driver = self.phantomjs_opened()
            try:
                driver.get(request.url)
                body = driver.page_source
                return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)
            finally:
                self.phantomjs_closed(driver)
Project: scrapyweixi    Author: Felix-P-Code    | Project source | File source
def process_request(self, request, spider):

        if 'how' in request.meta:

            if 'isscreen' in request.meta:
                print(1)
                true_page = selenium_request(request.url,True)
            else:
                true_page = selenium_request(request.url)

            return HtmlResponse(request.url, body=true_page, encoding='utf-8', request=request)
Project: django-scrapy-lcv_search    Author: Albino1995    | Project source | File source
def process_request(self, request, spider):
        if spider.name == "jobbole":
            spider.browser.get(request.url)
            import time
            # time.sleep(3)
            print("???{0}".format(request.url))
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8"
                                , request=request)
Project: scraper    Author: firmadyne    | Project source | File source
def parse_kb(self, response):
        # initial html tokenization to find regions segmented by e.g. "======"
        # or "------"
        filtered = response.xpath(
            "//div[@class='sfdc_richtext']").extract()[0].split("=-")

        for entry in [x and x.strip() for x in filtered]:
            resp = HtmlResponse(url=response.url, body=entry,
                                encoding=response.encoding)

            for link in resp.xpath("//a"):
                href = link.xpath("@href").extract()[0]
                if "cache-www" in href:
                    text = resp.xpath("//text()").extract()
                    text_next = link.xpath("following::text()").extract()

                    item = FirmwareLoader(item=FirmwareImage(),
                                          response=response,
                                          date_fmt=["%b %d, %Y", "%B %d, %Y",
                                                    "%m/%d/%Y"])

                    version = FirmwareLoader.find_version_period(text_next)
                    if not version:
                        version = FirmwareLoader.find_version_period(text)

                    item.add_value("version", version)
                    item.add_value("date", item.find_date(text))
                    item.add_value("url", href)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    yield item.load_item()
Project: scraper    Author: firmadyne    | Project source | File source
def parse_kb(self, response):
        mib = None

        # need to perform some nasty segmentation because different firmware versions are not clearly separated
        # reverse order to get MIB before firmware items
        for entry in reversed(response.xpath(
                "//div[@id='support-article-downloads']/div/p")):
            for segment in reversed(entry.extract().split("<br><br>")):
                resp = HtmlResponse(
                    url=response.url, body=segment, encoding=response.encoding)
                for href in resp.xpath("//a/@href").extract():
                    text = resp.xpath("//text()").extract()

                    if "MIBs" in href:
                        mib = href

                    elif "firmware" in href:
                        text = resp.xpath("//text()").extract()

                        item = FirmwareLoader(
                            item=FirmwareImage(), response=resp, date_fmt=["%m/%d/%Y"])
                        item.add_value("date", item.find_date(text))
                        item.add_xpath("url", "//a/@href")
                        item.add_value("mib", mib)
                        item.add_value("product", response.meta["product"])
                        item.add_value("vendor", self.name)
                        item.add_value(
                            "version", FirmwareLoader.find_version_period(text))
                        yield item.load_item()
Project: quant    Author: yutiansut    | Project source | File source
def process_request(self, request, spider):
        print("Using process_request")
        true_page = selenium_request(request.url)
        return HtmlResponse(request.url, body=true_page, encoding='utf-8', request=request)
Project: FirstSpider    Author: yipwinghong    | Project source | File source
def process_request(self, request, spider):
        if spider.name == 'jobbole':
            spider.browser.get(request.url)
            time.sleep(3)
            print("??: {0}".format(request.url))
            return HtmlResponse(
                url=spider.browser.current_url,
                body=spider.browser.page_source,
                encoding="utf-8",
                request=request
            )
Project: frontoxy    Author: fabienvauchelles    | Project source | File source
def read(self, source):
        source_filename = os.path.basename(source)

        with zipfile.ZipFile(source) as zf:
            filenames = sorted(set([zipinfo.filename[:10] for zipinfo in zf.infolist()]))
            for filename in filenames:
                source_path = u'{0}/{1}'.format(source_filename, filename)

                # Read info
                desc = zf.read(self.INFO_FORMAT.format(filename))
                info = json.loads(desc)

                url = info['url'].encode('utf8')
                info.pop('url', None)

                headers = info['headers']
                info.pop('headers', None)

                status = info['status']
                info.pop('status', None)

                info_meta = info['meta']
                info_meta['source_path'] = source_path

                # Read content
                content = zf.read(self.BODY_FORMAT.format(filename))
                request = Request(
                    url=url,
                    meta=info_meta
                )

                response = HtmlResponse(
                    url=url,
                    headers=headers,
                    status=status,
                    body=content,
                    request=request,
                )

                yield response
Project: GuShiWen    Author: zhouzhaoxin    | Project source | File source
def handle_detail(self, response, itemi):
        print(response)
        # Despite its name, `response` here is a url string
        response = response.strip()
        # requests.adapters.DEFAULT_RETRIES = 10
        # s = requests.session()
        # s.config['keep_alive'] = False
        html_requests_item = requests.get(response)
        html_requests = html_requests_item.text.encode('utf-8')
        # html_requests_item.connection.close()

        html_response = HtmlResponse(url=response, body=html_requests, headers={'Connection': 'close'})
        html_all = Selector(html_response)
        html = html_all.xpath('//div[@class="main3"]/div[@class="shileft"]')
        itemi['detail_dynasty'] = html.xpath(
            u'div[@class="son2"]/p/span[contains(text(),"朝代:")]/parent::p/text()').extract()[0]
        itemi['detail_translate_note_url'] = html.xpath(
            u'div[@class="son5"]//u[contains(text(),"译文及注释")]/parent::a/@href').extract()

        itemi['detail_appreciation_url'] = html.xpath(
            u'div[@class="son5"]//u[contains(text(),"赏")]/parent::a/@href').extract()

        itemi['detail_background_url'] = html.xpath(
            u'div[@class="son5"]//u[contains(text(),"创作背景") or contains(text(),"写作背景")]/parent::a/@href').extract()
        itemi['detail_author'] = html.xpath(
            u'div[@class="son2"]/p/span[contains(text(),"朝代:")]/parent::p/a/text()').extract()

        itemi['detail_text'] = "".join(html.xpath('div[@class="son2"]/text()').extract()).strip().encode('utf-8')
        # itemi['detail_text'] = re.sub(r'?',"“",itemi['detail_text'])
        # itemi['detail_text'] = re.sub(r'\(.*?\)',"",itemi['detail_text'])
        itemi['detail_text'] = re.sub(r'\r?\n\t?.*?\)', "", itemi['detail_text'])

        if itemi['detail_background_url']:
            self.detail_background(itemi['detail_background_url'], itemi)
            pass
        else:
            pass

        self.detail_translate_note(itemi['detail_translate_note_url'], itemi)
        self.detail_appreciation(itemi['detail_appreciation_url'], itemi)

    # Crawl the creation background
Project: GuShiWen    Author: zhouzhaoxin    | Project source | File source
def detail_background(self, all_url, itemi):
        detail_appreciation_container = []
        for url in all_url:
            url = self.site_domain + url
            print('detail_background_text url : %s' % url)
            html_requests = requests.get(url).text.encode('utf-8')
            html_response = HtmlResponse(url=url, body=html_requests, headers={'Connection': 'close'})
            html_all = Selector(html_response)
            temp = ''.join(html_all.xpath(
                u'//div[@class="main3"]/div[@class="shileft"]/div[@class="shangxicont"]/p[not(@style or contains(text(),"?????"))]').extract())
            temp = temp.encode('utf-8')
            temp = re.sub(r'<p>', '', temp)
            temp = re.sub(r'</p>', '', temp)
            temp = re.sub(r'</a>', '', temp)
            temp = re.sub(r'(<a\s+href=\s*\".*?\">)', '', temp)
            alt = re.search(r'\s+alt=\s*\"(.*?)\"\s+', temp)
            # print(alt.group(1))
            if alt is not None:
                temp = re.sub(r'<img.*\s*>', alt.group(1), temp)
            else:
                print('%s has no img' % url)
            temp = re.sub(r'\"', "“", temp)

            detail_appreciation_container.append(temp)
        itemi['detail_background_text'] = detail_appreciation_container

    # Crawl the appreciation text
Project: GuShiWen    Author: zhouzhaoxin    | Project source | File source
def detail_appreciation(self, all_url, itemi):
        detail_appreciation_container = []
        for url in all_url:
            url = self.site_domain + url
            print('detail_appreciation url : %s' % url)
            html_requests = requests.get(url).text.encode('utf-8')
            html_response = HtmlResponse(url=url, body=html_requests, headers={'Connection': 'close'})
            html_all = Selector(html_response)
            temp = ''.join(html_all.xpath(
                u'//div[@class="main3"]/div[@class="shileft"]/div[@class="shangxicont"]/p[not(@style or contains(text(),"?????"))]').extract())
            temp = temp.encode('utf-8')
            temp = re.sub(r'<p>', '', temp)
            temp = re.sub(r'</p>', '', temp)
            temp = re.sub(r'</a>', '', temp)
            temp = re.sub(r'(<a\s+href=\s*\".*?\">)', '', temp)
            alt = re.search(r'\s+alt=\s*\"(.*?)\"\s+', temp)
            # print(alt.group(1))
            if alt is not None:
                temp = re.sub(r'<img.*\s*>', alt.group(1), temp)
            else:
                print('%s has no img in appreciation' % url)
            temp = re.sub(r'\"', "“", temp)
            # if self.site_domain + '/shangxi_4618.aspx' == url:
            # print(temp)
            detail_appreciation_container.append(temp)
        itemi['detail_appreciation_text'] = detail_appreciation_container
        pass
Project: Charlotte    Author: LiZoRN    | Project source | File source
def process_request(self, request, spider):
        if spider.name == "jobbole":
            # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
            spider.browser.get(request.url)
            import time
            time.sleep(3)
            print ("??:{0}".format(request.url))

            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)
Project: collectors    Author: opentrials    | Project source | File source
def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_details(self):
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.DrugDetails'
        mock_response = HtmlResponse(url=url)
        expected_result = 'expected_result'

        with mock.patch.object(Spider,
                               'parse_drug_details',
                               return_value=expected_result) as mock_method:
            spider = Spider()
            result = spider.parse_drug_details_or_overview(mock_response)

        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result
Project: collectors    Author: opentrials    | Project source | File source
def test_parse_drug_details_or_overview_delegates_to_parse_drug_overview_when_response_in_drug_overview(self):
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Overview&DrugName=E-BASE'
        mock_response = HtmlResponse(url=url)
        expected_result = 'expected_result'

        with mock.patch.object(Spider,
                               'parse_drug_overview',
                               return_value=expected_result) as mock_method:
            spider = Spider()
            result = spider.parse_drug_details_or_overview(mock_response)

        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result
Project: collectors    Author: opentrials    | Project source | File source
def test_parse_drug_details_or_overview_raises_exception_for_unknown_pages(self):
        url = 'http://www.accessdata.fda.gov/'
        mock_response = HtmlResponse(url=url)

        with pytest.raises(Exception):
            spider = Spider()
            spider.parse_drug_details_or_overview(mock_response)
Project: alltheplaces    Author: alltheplaces    | Project source | File source
def parse(self, response):
        marker_txt = re.findall(re.compile(r"markerData.*\}", re.MULTILINE), response.body_as_unicode())
        if not len(marker_txt):
            return
        markers_json = "{\"" + marker_txt[0]
        markers = list(json.loads(markers_json).values())[0]

        if not len(markers):
            return
        for marker in markers:
            marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
            hours = re.findall(r"\{\"label.*\}", marker["info"])
            hours = hours[0]
            parsed_hours = json.loads(hours)

            addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
            url = marker_response.css("header a").xpath("@href").extract_first()
            city, state = addr_parts[-1].split(",")

            yield GeojsonPointItem(lat=marker.get("lat"), lon=marker.get("lng"),
                                   name=marker_response.css("header a::text").extract_first(default=None),
                                   addr_full=", ".join(addr_parts),
                                   city=city.strip(),
                                   state=state.strip(),
                                   country="United States",
                                   phone=marker_response.css(".phone::text").extract_first(),
                                   website=url,
                                   opening_hours=get_hours(parsed_hours["days"]),
                                   ref=url.split("/")[-1].split(".")[0])
Project: alltheplaces    Author: alltheplaces    | Project source | File source
def parse(self, response):
        data = json.loads(response.body_as_unicode())
        stores = data['markers']                            
        for store in stores:                                 
            html = HtmlResponse(
                url="", 
                body=store['info'].encode('UTF-8')
            )

            unp = {}
            unp['lat'] = store['lat']
            unp['lon'] = store['lng']

            if unp['lat']: unp['lat'] = float(unp['lat'])
            if unp['lon']: unp['lon'] = float(unp['lon'])

            unp['ref'] = store['locationId']
            unp['addr_full'] = html.xpath('//div[contains(@class, "addr")]/text()').extract_first()
            unp['phone'] = html.xpath('//div[contains(@class, "phone")]/text()').extract_first()
            unp['name'] = html.xpath('//div[@class="loc-name"]/text()').extract_first()
            addr2 = html.xpath('//div[contains(@class, "csz")]/text()').extract_first()
            if addr2:
                addr2 = addr2.strip()
                three_pieces = self.addr2regex.search(addr2)
                if three_pieces:
                    city, state, zipcode = three_pieces.groups()
                    unp['city'] = city
                    unp['state'] = state
                    unp['postcode'] = zipcode

            properties = {}                                                
            for key in unp:
                if unp[key]:
                    properties[key] = unp[key]

            yield GeojsonPointItem(**properties)
Project: scrapy-bench    Author: scrapy    | Project source | File source
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        response = HtmlResponse(url="local", body=html, encoding='utf8')

        start = timer()

        # Trailing commas in the original turned each field into a 1-tuple; dropped here.
        rating = response.xpath(
            "//*[@id='content_inner']/article/div[1]/div[2]/p[3]/i[1]").extract()  # .split(' ')[-1]
        title = response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/h1").extract()
        price = response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/p[1]")
        stock = ''.join(response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/p[2]").re(r'(\d+)'))

        end = timer()
        page = [rating, title, price, stock]

        total = total + 1
        time = time + end - start

    print("\nTotal number of pages extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} pages/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Project: scrapy-bench    Author: scrapy    | Project source | File source
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0
    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        start = timer()

        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)

        end = timer()

        total = total + len(links)
        time = time + end - start

    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Project: scrapy-bench    Author: scrapy    | Project source | File source
def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r
Project: scrapy-bench    Author: scrapy    | Project source | File source
def main():
    total = 0
    time = 0
    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        response = HtmlResponse(url="local", body=html, encoding='utf8')

        start = timer()

        rating = response.css(
            'p.star-rating::attr(class)').extract_first().split(' ')[-1]
        title = response.css('.product_main h1::text').extract_first()
        price = response.css(
            '.product_main p.price_color::text').re_first('£(.*)')
        stock = ''.join(
            response.css('.product_main .instock.availability ::text').re(r'(\d+)'))
        category = ''.join(
            response.css('ul.breadcrumb li:nth-last-child(2) ::text').extract()).strip()

        end = timer()
        page = [rating, title, price, stock, category]

        total = total + 1
        time = time + end - start

    print("\nTotal number of pages extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} pages/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))