小编典典

如何使用casperjs从XHR响应中捕获和处理数据?

ajax

网页上的数据是动态显示的,似乎检查html中的每个更改并提取数据是一项非常艰巨的任务,并且还需要我使用非常不可靠的XPath。因此,我希望能够从XHR数据包中提取数据。

我希望既能从XHR数据包中提取信息,也能生成要发送到服务器的“XHR”数据包。提取信息的部分对我来说更重要,因为信息的发送可以通过使用casperjs自动触发html元素来轻松完成。

我附上我的意思的屏幕截图。在此处输入图片说明

响应选项卡中的文本是我以后需要处理的数据。(已从服务器收到此XHR响应。)


阅读 337

收藏
2020-07-26

共1个答案

小编典典

这并不容易,因为 resource.received 事件处理程序只提供 url、headers 或 status 之类的元数据,而不提供实际的数据。底层的 phantomjs 事件处理程序也是同样的行为。


无状态AJAX请求

如果ajax调用 是无状态的 ,则可以重复该请求

// URLs for which the handler below has issued a replay request whose own
// "resource.received" notification has not been consumed yet.
// The replay is a network request for the very same URL, so it is reported
// through this same event; without this marker every replay would trigger
// another replay and the handler would never stop re-requesting the URL.
var replayPending = {};

casper.on("resource.received", function(resource){
    // only act on the final stage, otherwise this would be executed two times
    if (resource.stage !== "end") {
        return;
    }
    // somehow identify this request, here: if it contains ".json"
    if (resource.url.indexOf(".json") === -1) {
        return;
    }
    // this notification belongs to our own replay: consume the marker and stop
    if (replayPending[resource.url] === true) {
        delete replayPending[resource.url];
        return;
    }
    replayPending[resource.url] = true;
    var data = casper.evaluate(function(url){
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, resource.url);
    // do something with data, you might need to JSON.parse(data)
});
casper.start(url); // your script

您也可以改为把事件侦听器添加到 resource.requested 上。这样就不必等待原始请求完成。

您也可以像下面这样在控制流中执行此操作(来源:问答“CasperJS waitForResource:如何获取我等待的资源”):

casper.start(url);

// res: the resource most recently handed to the check function
// resData: the body obtained by replaying the request for res.url
var res, resData;

// Predicate for waitForResource: records the inspected resource and reports
// whether its URL contains ".json".
function isJsonResource(resource){
    res = resource;
    var position = resource.url.indexOf(".json");
    return position !== -1;
}

// Follow-up step: repeats the request from inside the page and keeps the result.
function replayRequest(){
    var targetUrl = res.url;
    resData = casper.evaluate(function(url){
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, targetUrl);
    // do something with the data here or in a later step
}

casper.waitForResource(isJsonResource, replayRequest);

casper.run();

有状态的AJAX请求

如果请求 不是无状态的 ,则需要替换XMLHttpRequest的实现。您需要注入自己的onreadystatechange处理程序实现,在页面的window对象中收集信息,然后在另一个evaluate调用中把它取回。

您可以参考sinon.js中的伪造XHR(fake XHR)实现,或使用下面这个完整的XMLHttpRequest代理(仿照“如何创建XMLHttpRequest包装器/代理”一问中的方法3编写):

/**
 * Replaces window.XMLHttpRequest with a proxy that forwards everything to a
 * real XMLHttpRequest and additionally stores the response text of matching
 * requests in window.myAwesomeResponse.
 *
 * The function only uses names it defines itself plus the global `window`,
 * so it can be handed to an evaluate() call and executed in the page context.
 * It is deliberately written in ES5.
 *
 * Only requests whose completion is observed through the `onreadystatechange`
 * property are recorded.
 *
 * @param {string} [urlFilter="whatever"] substring a request URL must contain
 *        for its response to be stored; any non-string value selects the default
 */
function replaceXHR(urlFilter){
    (function(window, debug, filter){
        // formats an arguments object for the debug log
        function args(a){
            var s = "";
            for(var i = 0; i < a.length; i++) {
                s += "\t\n[" + i + "] => " + a[i];
            }
            return s;
        }
        // responseText may throw when a non-text responseType is configured;
        // such a response cannot be recorded as text, so report "nothing" instead
        function textOf(xhr){
            try {
                return xhr.responseText;
            } catch (e) {
                return undefined;
            }
        }
        var _XMLHttpRequest = window.XMLHttpRequest;

        window.XMLHttpRequest = function() {
            this.xhr = new _XMLHttpRequest();
        };

        // proxy ALL methods/properties
        var methods = [
            "open",
            "abort",
            "setRequestHeader",
            "send",
            "addEventListener",
            "removeEventListener",
            "getResponseHeader",
            "getAllResponseHeaders",
            "dispatchEvent",
            "overrideMimeType"
        ];
        methods.forEach(function(method){
            window.XMLHttpRequest.prototype[method] = function() {
                if (debug) console.log("ARGUMENTS", method, args(arguments));
                if (method == "open") {
                    // remember the URL (as text) for the filter applied on completion
                    this._url = String(arguments[1]);
                }
                return this.xhr[method].apply(this.xhr, arguments);
            };
        });

        // proxy change event handler
        Object.defineProperty(window.XMLHttpRequest.prototype, "onreadystatechange", {
            get: function(){
                // return what the page assigned, not the internal recording wrapper
                return this._hasHandler ? this._handler : this.xhr.onreadystatechange;
            },
            set: function(onreadystatechange){
                var that = this.xhr;
                var realThis = this;
                realThis._hasHandler = true;
                realThis._handler = onreadystatechange;
                that.onreadystatechange = function(){
                    // request is fully loaded
                    if (that.readyState == 4) {
                        var text = textOf(that);
                        if (debug) console.log("RESPONSE RECEIVED:", typeof text == "string" ? text.length : "none");
                        // there is a response and filter execution based on url
                        if (text && typeof realThis._url == "string" && realThis._url.indexOf(filter) != -1) {
                            window.myAwesomeResponse = text;
                        }
                    }
                    // the page may assign null/undefined to remove its handler;
                    // keep recording but do not try to call a non-function
                    if (typeof onreadystatechange == "function") {
                        // run the page's handler with the proxy as `this` and the
                        // original event arguments, as if no proxy were installed
                        return onreadystatechange.apply(realThis, arguments);
                    }
                };
            }
        });

        var otherscalars = [
            "onabort",
            "onerror",
            "onload",
            "onloadstart",
            "onloadend",
            "onprogress",
            "ontimeout",
            "readyState",
            "response",
            "responseText",
            "responseType",
            "responseURL",
            "responseXML",
            "status",
            "statusText",
            "timeout",
            "upload",
            "withCredentials",
            "DONE",
            "UNSENT",
            "HEADERS_RECEIVED",
            "LOADING",
            "OPENED"
        ];
        otherscalars.forEach(function(scalar){
            Object.defineProperty(window.XMLHttpRequest.prototype, scalar, {
                get: function(){
                    return this.xhr[scalar];
                },
                set: function(obj){
                    this.xhr[scalar] = obj;
                }
            });
        });
    })(window, false, typeof urlFilter == "string" ? urlFilter : "whatever");
}

如果您想从一开始就捕获AJAX调用,则需要在最早触发的事件处理程序之一中注入它:

// On every "page.initialized" event, run replaceXHR inside the page so the
// XMLHttpRequest proxy is in place for the requests that follow.
casper.on("page.initialized", function installXhrProxy(page){
    var self = this;
    self.evaluate(replaceXHR);
});

或者只在需要的时候调用evaluate(replaceXHR)。

控制流如下所示:

function replaceXHR(){ /* body as defined earlier in this answer */ }

// Open the target page; once it has loaded, run replaceXHR inside it.
casper.start(yourUrl, function injectProxy(){
    var self = this;
    self.evaluate(replaceXHR);
});

/**
 * Reads window.myAwesomeResponse from the page context.
 *
 * Can be invoked with an object offering evaluate() as `this` (the way the
 * step callbacks in this script use `this.evaluate`), or as a plain function
 * call, in which case the global `casper` instance is used.
 *
 * @returns {*} whatever the page currently holds in window.myAwesomeResponse
 */
function getAwesomeResponse(){
    // a bare call has no usable `this`, so `this.evaluate` would throw
    var self = (this && typeof this.evaluate === "function") ? this : casper;
    return self.evaluate(function(){
        return window.myAwesomeResponse;
    });
}

// stops waiting if window.myAwesomeResponse is something that evaluates to true
casper.waitFor(getAwesomeResponse, function then(){
    // getAwesomeResponse relies on `this.evaluate`, so it must not be called
    // bare; pass along this step's `this`, which offers evaluate() just like
    // in the start callback of this script
    var data = JSON.parse(getAwesomeResponse.call(this));
    // Do something with data
});

casper.run();

如上所述,我为XMLHttpRequest创建了一个代理,以便每次在页面上使用它时,我都可以对其进行处理。您抓取的页面使用xhr.onreadystatechange回调来接收数据。通过定义一个特定的setter函数来完成代理,该setter函数将接收到的数据写入window.myAwesomeResponse页面上下文中。您唯一需要做的就是检索此文本。


JSONP请求

如果您知道前缀(即加载JSON时被调用的函数名,例如insert({"data":["Some", "JSON", "here"],"id":"asdasda"})),那么为JSONP编写代理就更容易了。您可以在页面上下文中覆盖insert:

  1. 页面加载后

    // After the page has loaded, wrap the JSONP callback `insert` so that its
    // payload is also stored in window.myAwesomeResponse, then wait for it.
    casper.start(url).then(function(){
        this.evaluate(function(){
            var oldInsert = insert;
            insert = function(json){
                // a JSONP callback receives an already-parsed value; store it as
                // JSON text so the JSON.parse in the waiting step gets a string
                window.myAwesomeResponse = typeof json === "string" ? json : JSON.stringify(json);
                return oldInsert.apply(window, arguments);
            };
        });
    }).waitFor(getAwesomeResponse, function then(){
        // getAwesomeResponse relies on `this.evaluate`; do not call it bare
        var data = JSON.parse(getAwesomeResponse.call(this));
        // Do something with data
    }).run();

  2. 或在接收到请求之前(如果在调用请求之前就注册了该函数)

    // Wrap the JSONP callback `insert` when the matching request is issued.
    // Registering a listener must not start the run: the script is started and
    // run exactly once, by the chain at the bottom.
    casper.on("resource.requested", function(resource){
        // filter on the correct call
        if (resource.url.indexOf(".jsonp") != -1) {
            this.evaluate(function(){
                // an earlier matching request already installed the wrapper;
                // wrapping again would only nest wrappers inside each other
                if (insert.__capturesResponse) {
                    return;
                }
                var oldInsert = insert;
                insert = function(json){
                    // a JSONP callback receives an already-parsed value; store it as
                    // JSON text so the JSON.parse in the waiting step gets a string
                    window.myAwesomeResponse = typeof json === "string" ? json : JSON.stringify(json);
                    return oldInsert.apply(window, arguments);
                };
                insert.__capturesResponse = true;
            });
        }
    });

    casper.start(url).waitFor(getAwesomeResponse, function then(){
        // getAwesomeResponse relies on `this.evaluate`; do not call it bare
        var data = JSON.parse(getAwesomeResponse.call(this));
        // Do something with data
    }).run();

2020-07-26