The following 12 code examples, extracted from open-source Python projects, illustrate how to use scrapy.http.TextResponse().
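All of the examples share one pattern: raw HTML or JSON is wrapped in a TextResponse so that Scrapy's selector and encoding machinery can be used outside a live crawl (in unit tests, collectors, and middlewares). As a minimal, self-contained sketch of that pattern (the URL and markup here are placeholders, not taken from any of the projects below):

from scrapy.http import TextResponse

# Wrap hand-made HTML in a TextResponse; .css()/.xpath() then work
# exactly as they would on a response fetched by a running spider.
# (http://example.com/ and the body below are placeholder values.)
html = b"<html><body><h1>Hello</h1></body></html>"
response = TextResponse(url="http://example.com/", body=html, encoding="utf-8")

assert response.xpath("//h1/text()").extract_first() == "Hello"
assert response.css("h1::text").extract_first() == "Hello"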
def load_response(self, case_id):
    """Create a Scrapy Response from the HTML file."""
    url = self.raw_item['url']
    request = Request(url=url)
    page_id = case_id + '.html'
    page_path = os.path.join(self._destdir, page_id)
    with open(page_path, 'rb') as page:
        body = page.read().decode('utf-8', 'ignore')
    return TextResponse(url=url, request=request, body=body, encoding='utf-8')
def test_magic_response2():
    # check 'body' handling and another 'headers' format
    mw = _get_mw()
    req = SplashRequest('http://example.com/', magic_response=True,
                        headers={'foo': 'bar'}, dont_send_headers=True)
    req = mw.process_request(req, None)
    assert 'headers' not in req.meta['splash']['args']

    resp_data = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    assert resp2.data == resp_data
    assert resp2.body == b'binary data'
    assert resp2.headers == {b'Content-Type': [b'text/plain']}
    assert resp2.status == 200
    assert resp2.url == "http://example.com/"
def test_magic_response_http_error():
    mw = _get_mw()
    req = SplashRequest('http://example.com/foo')
    req = mw.process_request(req, None)

    resp_data = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp = mw.process_response(req, resp, None)
    assert resp.data == resp_data
    assert resp.status == 404
    assert resp.url == "http://example.com/foo"
def run_crawler(base_url, ua, start_date, end_date, google_username, google_password):
    temp_df = pd.DataFrame()
    dates = date_range(start_date, end_date)
    for d in dates:
        url = '{0}//transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/{1}/plus/1'.format(
            base_url, d)
        rqst = requests.get(url, headers={"User-Agent": ua})
        resp = TextResponse(url, body=rqst.content)
        players, nat, ages, positions, prev_clubs, next_clubs, mkt_values, trans_prices = get_data_lists(resp)
        df = get_df(players, nat, ages, positions, prev_clubs, next_clubs,
                    mkt_values, trans_prices, d)
        trends_df = get_trends_data(google_username, google_password, players, d)
        df = pd.merge(df, trends_df, how='left', on='player')
        temp_df = pd.concat([temp_df, df])
    return temp_df
def process_exception(self, request, exception, spider):
    if isinstance(exception, self.DONT_RETRY_ERRORS):
        return TextResponse(url=request.meta['proxy'])
def setUp(self):
    self.spider = MinutesSpider()
    self.index = TextResponse(
        url=self.spider.start_urls[0],
        body=open('./tests/samples/minute_index.html').read(),
        encoding='utf-8'
    )
    self.response = TextResponse(
        url="http://mail.camara.rj.gov.br/APL/Legislativos/atas.nsf/" +
            "3f8037c08c436684032577040057cb8c/54ab5cc388ffcda5832580830059b178?OpenDocument",
        body=open('./tests/samples/minute_item.html').read(),
        encoding='utf-8'
    )  # 105ª Sessão Ordinária
def setUp(self):
    self.spider = AldermanSpider()
    self.index = TextResponse(
        url=self.spider.start_urls[0],
        body=open('./tests/samples/alderman_index.html', encoding='windows-1252').read(),
        encoding='utf-8'
    )
    self.item = TextResponse(
        url="http://www.camara.rj.gov.br/vereador_informacoes.php?m1=inform&cvd=24",
        body=open('./tests/samples/alderman_item.html', encoding='windows-1252').read(),
        encoding='utf-8'
    )  # Carlos Bolsonaro
def dummy_response():
    """Dummy response fixture."""
    from scrapy.http import TextResponse, Request
    url = 'http://www.example.com'
    request = Request(url=url)
    response = TextResponse(url=url, request=request,
                            body=TEST_FILE_2, encoding='utf-8')
    return response
def test_unicode_url():
    mw = _get_mw()
    req = SplashRequest(
        # note unicode URL
        u"http://example.com/",
        endpoint='execute')
    req2 = mw.process_request(req, None)
    res = {'html': '<html><body>Hello</body></html>'}
    res_body = json.dumps(res)
    response = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'application/json'},
                            body=res_body.encode('utf8'))
    response2 = mw.process_response(req2, response, None)
    assert response2.url == "http://example.com/"
def collect(conf, conn):
    """Collect ICD-XX-CM conditions."""
    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml (zip_bytes renamed from `zip` to avoid shadowing the builtin)
    zip_bytes = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip_bytes)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):

        # We need only leaf nodes: skip entries that have child <diag> elements
        childs = diag.xpath('./diag')
        if childs:
            continue

        # Get data
        data = {
            'name': diag.xpath('./name/text()').extract_first(),
            'desc': diag.xpath('./desc/text()').extract_first(),
            'terms': diag.xpath('.//note/text()').extract(),
            'version': VERSION,
            'last_updated': LAST_UPDATED,
        }

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
def test_splash_request():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # check request preprocessing
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None) or req2
    assert req2 is not None
    assert req2 is not req
    assert req2.url == "http://127.0.0.1:8050/render.html"
    assert req2.headers == {b'Content-Type': [b'application/json']}
    assert req2.method == 'POST'
    assert isinstance(req2, SplashRequest)
    assert repr(req2) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    expected_body = {'url': req.url}
    assert json.loads(to_native_str(req2.body)) == expected_body

    # check response post-processing
    response = TextResponse("http://127.0.0.1:8050/render.html",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'text/html'},
                            body=b"<html><body>Hello</body></html>")
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashTextResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.url
    assert response2.body == b"<html><body>Hello</body></html>"
    assert response2.css("body").extract_first() == "<body>Hello</body>"
    assert response2.headers == {b'Content-Type': [b'text/html']}

    # check .replace method
    response3 = response2.replace(status=404)
    assert response3.status == 404
    assert isinstance(response3, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(response3, attr) == getattr(response2, attr)
def test_cookies():
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def request_with_cookies(cookies):
        req = SplashRequest(
            'http://example.com/foo',
            endpoint='execute',
            args={'lua_source': 'function main() end'},
            magic_response=True,
            cookies=cookies)
        req = cookie_mw.process_request(req, None) or req
        req = mw.process_request(req, None) or req
        return req

    def response_with_cookies(req, cookies):
        resp_data = {
            'html': '<html><body>Hello</body></html>',
            'headers': [],
            'cookies': cookies,
        }
        resp = TextResponse(
            'http://mysplash.example.com/execute',
            headers={b'Content-Type': b'application/json'},
            body=json.dumps(resp_data).encode('utf8'))
        resp = mw.process_response(req, resp, None)
        resp = cookie_mw.process_response(req, resp, None)
        return resp

    # Concurrent requests
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'bom': 'bam'})
    resp1 = response_with_cookies(req1, [
        {'name': 'spam', 'value': 'ham'},
        {'name': 'spam_x', 'value': 'ham_x'},
    ])
    resp2 = response_with_cookies(req2, [
        {'name': 'spam', 'value': 'ham'},  # because req2 was made after req1
        {'name': 'bom_x', 'value': 'bam_x'},
    ])
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'}

    # Removing already-removed cookies
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'spam': 'ham', 'pom': 'pam'})
    resp2 = response_with_cookies(req2, [
        {'name': 'pom', 'value': 'pam'},
    ])
    resp1 = response_with_cookies(req1, [])
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'pom': 'pam'}