Python scrapy.http 模块,TextResponse() 实例源码

我们从Python开源项目中,提取了以下20个代码示例,用于说明如何使用scrapy.http.TextResponse()

项目:mailingListScraper    作者:gaalcaras    | 项目源码 | 文件源码
def load_response(self, case_id):
        """Create a Scrapy TextResponse for a test case from its saved HTML file.

        The page is looked up as ``<case_id>.html`` under ``self._destdir`` and
        decoded as UTF-8 (undecodable bytes are dropped via the 'ignore'
        error handler).

        :param case_id: basename (without extension) of the saved page.
        :returns: ``TextResponse`` bound to a ``Request`` for
            ``self.raw_item['url']``.
        """
        url = self.raw_item['url']
        request = Request(url=url)

        page_path = os.path.join(self._destdir, case_id + '.html')
        # The context manager guarantees the file handle is closed even if
        # reading/decoding raises (the previous version leaked it on error).
        with open(page_path, 'rb') as page:
            body = page.read().decode('utf-8', 'ignore')

        return TextResponse(url=url,
                            request=request,
                            body=body,
                            encoding='utf-8')
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response2():
    # 'body' given as base64, and 'headers' given as a plain dict.
    middleware = _get_mw()
    request = SplashRequest('http://example.com/', magic_response=True,
                            headers={'foo': 'bar'}, dont_send_headers=True)
    request = middleware.process_request(request, None)
    assert 'headers' not in request.meta['splash']['args']

    payload = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    raw = TextResponse("http://mysplash.example.com/execute",
                       headers={b'Content-Type': b'application/json'},
                       body=json.dumps(payload).encode('utf8'))
    processed = middleware.process_response(request, raw, None)
    assert processed.data == payload
    assert processed.body == b'binary data'
    assert processed.headers == {b'Content-Type': [b'text/plain']}
    assert processed.status == 200
    assert processed.url == "http://example.com/"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response_http_error():
    middleware = _get_mw()
    request = middleware.process_request(
        SplashRequest('http://example.com/foo'), None)

    # Splash reports a Lua-level http404 as a ScriptError JSON payload.
    error_payload = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    raw = TextResponse("http://mysplash.example.com/execute",
                       headers={b'Content-Type': b'application/json'},
                       body=json.dumps(error_payload).encode('utf8'))
    processed = middleware.process_response(request, raw, None)
    assert processed.data == error_payload
    assert processed.status == 404
    assert processed.url == "http://example.com/foo"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response2():
    # Exercises base64 'body' handling plus the dict form of 'headers'.
    mware = _get_mw()
    splash_req = SplashRequest('http://example.com/', magic_response=True,
                               headers={'foo': 'bar'}, dont_send_headers=True)
    splash_req = mware.process_request(splash_req, None)
    assert 'headers' not in splash_req.meta['splash']['args']

    splash_reply = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    incoming = TextResponse("http://mysplash.example.com/execute",
                            headers={b'Content-Type': b'application/json'},
                            body=json.dumps(splash_reply).encode('utf8'))
    result = mware.process_response(splash_req, incoming, None)
    assert result.data == splash_reply
    assert result.body == b'binary data'
    assert result.headers == {b'Content-Type': [b'text/plain']}
    assert result.status == 200
    assert result.url == "http://example.com/"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response_http_error():
    mware = _get_mw()
    splash_req = SplashRequest('http://example.com/foo')
    splash_req = mware.process_request(splash_req, None)

    # ScriptError JSON body Splash produces when the Lua script hits http404.
    script_error = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    incoming = TextResponse("http://mysplash.example.com/execute",
                            headers={b'Content-Type': b'application/json'},
                            body=json.dumps(script_error).encode('utf8'))
    result = mware.process_response(splash_req, incoming, None)
    assert result.data == script_error
    assert result.status == 404
    assert result.url == "http://example.com/foo"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response2():
    # Verify decoding of a base64 'body' and a dict-shaped 'headers' entry.
    mw_under_test = _get_mw()
    outgoing = SplashRequest('http://example.com/', magic_response=True,
                             headers={'foo': 'bar'}, dont_send_headers=True)
    outgoing = mw_under_test.process_request(outgoing, None)
    assert 'headers' not in outgoing.meta['splash']['args']

    reply_json = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    wire_resp = TextResponse("http://mysplash.example.com/execute",
                             headers={b'Content-Type': b'application/json'},
                             body=json.dumps(reply_json).encode('utf8'))
    magic_resp = mw_under_test.process_response(outgoing, wire_resp, None)
    assert magic_resp.data == reply_json
    assert magic_resp.body == b'binary data'
    assert magic_resp.headers == {b'Content-Type': [b'text/plain']}
    assert magic_resp.status == 200
    assert magic_resp.url == "http://example.com/"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_magic_response_http_error():
    mw_under_test = _get_mw()
    outgoing = mw_under_test.process_request(
        SplashRequest('http://example.com/foo'), None)

    # A Lua http404 surfaces as a 400 ScriptError wrapper from Splash;
    # the middleware should translate it into a 404 response.
    wrapper = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    wire_resp = TextResponse("http://mysplash.example.com/execute",
                             headers={b'Content-Type': b'application/json'},
                             body=json.dumps(wrapper).encode('utf8'))
    magic_resp = mw_under_test.process_response(outgoing, wire_resp, None)
    assert magic_resp.data == wrapper
    assert magic_resp.status == 404
    assert magic_resp.url == "http://example.com/foo"
项目:expected_goals    作者:andrebrener    | 项目源码 | 文件源码
def run_crawler(base_url, ua, start_date, end_date,
                google_username, google_password):
    """Scrape per-day transfer tables and merge in Google Trends data.

    For every date in ``[start_date, end_date]`` the transfer-day detail
    page is fetched, parsed into a DataFrame and joined (left) with the
    players' trends data on the 'player' column.

    :param base_url: site root the per-day URL is built from.
    :param ua: User-Agent header value for the HTTP requests.
    :param start_date: first date (inclusive) handled by ``date_range``.
    :param end_date: last date (inclusive) handled by ``date_range``.
    :param google_username: credentials forwarded to ``get_trends_data``.
    :param google_password: credentials forwarded to ``get_trends_data``.
    :returns: concatenation of all per-day DataFrames (empty DataFrame if
        the date range is empty).
    """
    frames = []

    for d in date_range(start_date, end_date):
        url = '{0}//transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/{1}/plus/1'.format(
            base_url, d)

        rqst = requests.get(url, headers={"User-Agent": ua})
        resp = TextResponse(url, body=rqst.content)

        players, nat, ages, positions, prev_clubs, next_clubs, mkt_values, trans_prices = get_data_lists(
            resp)

        df = get_df(players,
                    nat,
                    ages,
                    positions,
                    prev_clubs,
                    next_clubs,
                    mkt_values,
                    trans_prices,
                    d)

        trends_df = get_trends_data(google_username, google_password,
                                    players, d)

        frames.append(pd.merge(df, trends_df, how='left', on='player'))

    # Concatenate once at the end: repeatedly pd.concat-ing onto an
    # accumulator inside the loop copies all previous rows every iteration.
    return pd.concat(frames) if frames else pd.DataFrame()
项目:hq-proxies    作者:arthurmmm    | 项目源码 | 文件源码
def process_exception(self, request, exception, spider):
        # For errors we never want retried, short-circuit with an empty
        # response built from the failing proxy's URL; any other exception
        # falls through (returns None) so later middleware can handle it.
        if not isinstance(exception, self.DONT_RETRY_ERRORS):
            return None
        return TextResponse(url=request.meta['proxy'])
项目:vigilante    作者:VigilantePolitico    | 项目源码 | 文件源码
def setUp(self):
        """Build the spider under test plus offline responses from fixtures."""
        self.spider = MinutesSpider()
        # Read each fixture inside a context manager so the handles are
        # closed (the previous version left both files open).
        with open('./tests/samples/minute_index.html') as fixture:
            index_body = fixture.read()
        self.index = TextResponse(
            url=self.spider.start_urls[0],
            body=index_body,
            encoding='utf-8'
        )
        with open('./tests/samples/minute_item.html') as fixture:
            item_body = fixture.read()
        self.response = TextResponse(
            url="http://mail.camara.rj.gov.br/APL/Legislativos/atas.nsf/" +
            "3f8037c08c436684032577040057cb8c/54ab5cc388ffcda5832580830059b178?OpenDocument",
            body=item_body,
            encoding='utf-8'
        )  # 105ª Sessão Ordinária
项目:vigilante    作者:VigilantePolitico    | 项目源码 | 文件源码
def setUp(self):
        """Build the spider under test plus offline responses from fixtures."""
        self.spider = AldermanSpider()
        # Context managers close the fixture files, which were previously
        # leaked. NOTE(review): the fixtures are read as windows-1252 text
        # while the TextResponse declares utf-8 — kept as-is to preserve
        # behavior, but worth confirming this mismatch is intentional.
        with open('./tests/samples/alderman_index.html', encoding='windows-1252') as fixture:
            index_body = fixture.read()
        self.index = TextResponse(
            url=self.spider.start_urls[0],
            body=index_body,
            encoding='utf-8'
        )
        with open('./tests/samples/alderman_item.html', encoding='windows-1252') as fixture:
            item_body = fixture.read()
        self.item = TextResponse(
            url="http://www.camara.rj.gov.br/vereador_informacoes.php?m1=inform&cvd=24",
            body=item_body,
            encoding='utf-8'
        )  # Carlos Bolsonaro
项目:Pysearch2.0    作者:Pysearch    | 项目源码 | 文件源码
def dummy_response():
    """Dummy response fixture."""
    from scrapy.http import TextResponse, Request
    target = 'http://www.example.com'
    return TextResponse(
        url=target,
        request=Request(url=target),
        body=TEST_FILE_2,
        encoding='utf-8',
    )
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_unicode_url():
    middleware = _get_mw()
    # A unicode URL must survive the round trip through the middleware.
    original = SplashRequest(
        u"http://example.com/", endpoint='execute')
    wrapped = middleware.process_request(original, None)
    body_bytes = json.dumps(
        {'html': '<html><body>Hello</body></html>'}).encode('utf8')
    raw = TextResponse("http://mysplash.example.com/execute",
                       # Scrapy doesn't pass request to constructor
                       # request=wrapped,
                       headers={b'Content-Type': b'application/json'},
                       body=body_bytes)
    processed = middleware.process_response(wrapped, raw, None)
    assert processed.url == "http://example.com/"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_unicode_url():
    mware = _get_mw()
    # Requests built from a unicode URL should map back to that same URL.
    splash_req = SplashRequest(
        u"http://example.com/", endpoint='execute')
    splash_req2 = mware.process_request(splash_req, None)
    reply = {'html': '<html><body>Hello</body></html>'}
    incoming = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=splash_req2,
                            headers={b'Content-Type': b'application/json'},
                            body=json.dumps(reply).encode('utf8'))
    result = mware.process_response(splash_req2, incoming, None)
    assert result.url == "http://example.com/"
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_unicode_url():
    mw_under_test = _get_mw()
    # note unicode URL
    outgoing = SplashRequest(u"http://example.com/", endpoint='execute')
    outgoing2 = mw_under_test.process_request(outgoing, None)
    payload = json.dumps({'html': '<html><body>Hello</body></html>'})
    wire_resp = TextResponse("http://mysplash.example.com/execute",
                             # Scrapy doesn't pass request to constructor
                             # request=outgoing2,
                             headers={b'Content-Type': b'application/json'},
                             body=payload.encode('utf8'))
    final_resp = mw_under_test.process_response(outgoing2, wire_resp, None)
    assert final_resp.url == "http://example.com/"
项目:collectors    作者:opentrials    | 项目源码 | 文件源码
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.

    Downloads the 2016 ICD-10-CM code-tables archive, parses the tabular
    XML with Scrapy selectors and writes one record per selected <diag>
    element via ``Record.create``/``record.write``.
    """

    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml. 'payload'/'archive' avoid shadowing the builtin `zip`,
    # and the context managers close both the archive and its member
    # stream (previously leaked).
    payload = requests.get(URL).content
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        with archive.open(FILE) as member:
            xml = member.read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):
        # We need only leafs
        # NOTE(review): this skips nodes WITHOUT child <diag> elements,
        # i.e. it keeps non-leaf nodes — the opposite of the comment.
        # Behavior kept unchanged; confirm which is intended.
        childs = diag.xpath('./diag')
        if not childs:
            continue

        # Get data
        data = {
            'name': diag.xpath('./name/text()').extract_first(),
            'desc': diag.xpath('./desc/text()').extract_first(),
            'terms': diag.xpath('.//note/text()').extract(),
            'version': VERSION,
            'last_updated': LAST_UPDATED,
        }

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_splash_request():
    splash_mw = _get_mw()
    cookies_mw = _get_cookie_mw()

    original = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(original) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # Request preprocessing: both middlewares may return a new request.
    wrapped = cookies_mw.process_request(original, None) or original
    wrapped = splash_mw.process_request(wrapped, None) or wrapped
    assert wrapped is not None
    assert wrapped is not original
    assert wrapped.url == "http://127.0.0.1:8050/render.html"
    assert wrapped.headers == {b'Content-Type': [b'application/json']}
    assert wrapped.method == 'POST'
    assert isinstance(wrapped, SplashRequest)
    assert repr(wrapped) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    assert json.loads(to_native_str(wrapped.body)) == {'url': original.url}

    # Response post-processing.
    raw = TextResponse("http://127.0.0.1:8050/render.html",
                       # Scrapy doesn't pass request to constructor
                       # request=wrapped,
                       headers={b'Content-Type': b'text/html'},
                       body=b"<html><body>Hello</body></html>")
    processed = splash_mw.process_response(wrapped, raw, None)
    processed = cookies_mw.process_response(wrapped, processed, None)
    assert isinstance(processed, scrapy_splash.SplashTextResponse)
    assert processed is not raw
    assert processed.real_url == wrapped.url
    assert processed.url == original.url
    assert processed.body == b"<html><body>Hello</body></html>"
    assert processed.css("body").extract_first() == "<body>Hello</body>"
    assert processed.headers == {b'Content-Type': [b'text/html']}

    # .replace must keep the SplashTextResponse type and other attributes.
    replaced = processed.replace(status=404)
    assert replaced.status == 404
    assert isinstance(replaced, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(replaced, attr) == getattr(processed, attr)
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_cookies():
    splash_mw = _get_mw()
    cookies_mw = _get_cookie_mw()

    def make_request(cookies):
        # Run a cookie-carrying SplashRequest through both middlewares.
        request = SplashRequest(
            'http://example.com/foo',
            endpoint='execute',
            args={'lua_source': 'function main() end'},
            magic_response=True,
            cookies=cookies)
        request = cookies_mw.process_request(request, None) or request
        request = splash_mw.process_request(request, None) or request
        return request

    def make_response(request, cookies):
        # Feed a fake Splash reply carrying `cookies` back through both
        # middlewares (splash first, then the cookie middleware).
        payload = {
            'html': '<html><body>Hello</body></html>',
            'headers': [],
            'cookies': cookies,
        }
        response = TextResponse(
            'http://mysplash.example.com/execute',
            headers={b'Content-Type': b'application/json'},
            body=json.dumps(payload).encode('utf8'))
        response = splash_mw.process_response(request, response, None)
        response = cookies_mw.process_response(request, response, None)
        return response

    # Concurent requests
    req1 = make_request({'spam': 'ham'})
    req2 = make_request({'bom': 'bam'})
    resp1 = make_response(req1, [
        {'name': 'spam', 'value': 'ham'},
        {'name': 'spam_x', 'value': 'ham_x'},
    ])
    resp2 = make_response(req2, [
        {'name': 'spam', 'value': 'ham'},  # because req2 was made after req1
        {'name': 'bom_x', 'value': 'bam_x'},
    ])
    assert resp1.cookiejar is resp2.cookiejar
    jar_contents = {c.name: c.value for c in resp1.cookiejar}
    assert jar_contents == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'}

    # Removing already removed
    req1 = make_request({'spam': 'ham'})
    req2 = make_request({'spam': 'ham', 'pom': 'pam'})
    resp2 = make_response(req2, [
        {'name': 'pom', 'value': 'pam'},
    ])
    resp1 = make_response(req1, [])
    assert resp1.cookiejar is resp2.cookiejar
    jar_contents = {c.name: c.value for c in resp1.cookiejar}
    assert jar_contents == {'pom': 'pam'}
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_cookies():
    mware = _get_mw()
    cookie_mware = _get_cookie_mw()

    def send(cookies):
        # Build a SplashRequest carrying `cookies` and preprocess it.
        outgoing = SplashRequest(
            'http://example.com/foo',
            endpoint='execute',
            args={'lua_source': 'function main() end'},
            magic_response=True,
            cookies=cookies)
        outgoing = cookie_mware.process_request(outgoing, None) or outgoing
        outgoing = mware.process_request(outgoing, None) or outgoing
        return outgoing

    def receive(outgoing, cookies):
        # Simulate Splash answering with `cookies` and postprocess it.
        reply = {
            'html': '<html><body>Hello</body></html>',
            'headers': [],
            'cookies': cookies,
        }
        incoming = TextResponse(
            'http://mysplash.example.com/execute',
            headers={b'Content-Type': b'application/json'},
            body=json.dumps(reply).encode('utf8'))
        incoming = mware.process_response(outgoing, incoming, None)
        incoming = cookie_mware.process_response(outgoing, incoming, None)
        return incoming

    # Concurent requests
    req1 = send({'spam': 'ham'})
    req2 = send({'bom': 'bam'})
    resp1 = receive(req1, [
        {'name': 'spam', 'value': 'ham'},
        {'name': 'spam_x', 'value': 'ham_x'},
    ])
    resp2 = receive(req2, [
        {'name': 'spam', 'value': 'ham'},  # because req2 was made after req1
        {'name': 'bom_x', 'value': 'bam_x'},
    ])
    assert resp1.cookiejar is resp2.cookiejar
    seen = {c.name: c.value for c in resp1.cookiejar}
    assert seen == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'}

    # Removing already removed
    req1 = send({'spam': 'ham'})
    req2 = send({'spam': 'ham', 'pom': 'pam'})
    resp2 = receive(req2, [
        {'name': 'pom', 'value': 'pam'},
    ])
    resp1 = receive(req1, [])
    assert resp1.cookiejar is resp2.cookiejar
    seen = {c.name: c.value for c in resp1.cookiejar}
    assert seen == {'pom': 'pam'}
项目:badoo_scrapy_splash_redis    作者:Supe2015    | 项目源码 | 文件源码
def test_splash_request():
    mware = _get_mw()
    cookie_mware = _get_cookie_mw()

    src_req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(src_req) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # check request preprocessing
    proxied = cookie_mware.process_request(src_req, None) or src_req
    proxied = mware.process_request(proxied, None) or proxied
    assert proxied is not None
    assert proxied is not src_req
    assert proxied.url == "http://127.0.0.1:8050/render.html"
    assert proxied.headers == {b'Content-Type': [b'application/json']}
    assert proxied.method == 'POST'
    assert isinstance(proxied, SplashRequest)
    assert repr(proxied) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    assert json.loads(to_native_str(proxied.body)) == {'url': src_req.url}

    # check response post-processing
    wire_resp = TextResponse("http://127.0.0.1:8050/render.html",
                             # Scrapy doesn't pass request to constructor
                             # request=proxied,
                             headers={b'Content-Type': b'text/html'},
                             body=b"<html><body>Hello</body></html>")
    final_resp = mware.process_response(proxied, wire_resp, None)
    final_resp = cookie_mware.process_response(proxied, final_resp, None)
    assert isinstance(final_resp, scrapy_splash.SplashTextResponse)
    assert final_resp is not wire_resp
    assert final_resp.real_url == proxied.url
    assert final_resp.url == src_req.url
    assert final_resp.body == b"<html><body>Hello</body></html>"
    assert final_resp.css("body").extract_first() == "<body>Hello</body>"
    assert final_resp.headers == {b'Content-Type': [b'text/html']}

    # check .replace method
    status_404 = final_resp.replace(status=404)
    assert status_404.status == 404
    assert isinstance(status_404, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(status_404, attr) == getattr(final_resp, attr)