网页上的数据是动态显示的,似乎检查html中的每个更改并提取数据是一项非常艰巨的任务,并且还需要我使用非常不可靠的XPath。因此,我希望能够从XHR数据包中提取数据。
XHR
我希望能够从XHR数据包中提取信息,并且能够生成要发送到服务器的“XHR”数据包。提取信息的部分对我来说更重要,因为信息的发送可以通过用casperjs自动触发html元素来轻松处理。
我附上我的意思的屏幕截图。
响应选项卡中的文本是我以后需要处理的数据。(已从服务器收到此XHR响应。)
这并不容易,因为resource.received事件处理程序只提供url、headers或status之类的元数据,而不提供实际的数据。底层的phantomjs事件处理程序的行为方式相同。
resource.received
url
headers
status
如果AJAX调用是无状态的,则可以重复该请求:
// Replays matching requests from inside the page to obtain their body, since
// the "resource.received" event itself only carries metadata.
casper.on("resource.received", function (resource) {
    // somehow identify this request, here: if it contains ".json"
    var isJsonResource = resource.url.indexOf(".json") !== -1;
    // only act when the stage is "end", otherwise this would be executed two times
    if (!(isJsonResource && resource.stage === "end")) {
        return;
    }

    var data = casper.evaluate(function (url) {
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, resource.url);
    // do something with data, you might need to JSON.parse(data)
});

casper.start(url);
// your script
您也可以把事件侦听器添加到resource.requested上。这样,您就无需等待该调用完成。
resource.requested
您也可以像下面这样在控制流中执行此操作(来源:问答“CasperJS waitForResource:如何获取我等待的资源”):
casper.start(url);

// res: the resource most recently inspected by the check function.
// resData: the body fetched for it once the wait has succeeded.
var res, resData;

casper.waitForResource(function check(resource) {
    // Remember the resource so the "then" step knows which URL to replay.
    res = resource;
    var isJsonResource = resource.url.indexOf(".json") !== -1;
    return isJsonResource;
}, function then() {
    var targetUrl = res.url;
    resData = casper.evaluate(function (url) {
        // synchronous GET request
        return __utils__.sendAJAX(url, "GET");
    }, targetUrl);
    // do something with the data here or in a later step
});

casper.run();
如果它不是无状态的,则需要替换XMLHttpRequest的实现。您需要注入自己的onreadystatechange处理程序实现,在页面的window对象中收集信息,然后在另一个evaluate调用中取回这些信息。
onreadystatechange
window
evaluate
您可以看看sinon.js中的XHR伪造器,或者使用下面这个完整的XMLHttpRequest代理(参照“如何创建XMLHttpRequest包装器/代理”中的方法3建模):
XMLHttpRequest
/**
 * Replaces window.XMLHttpRequest with a proxy that forwards everything to the
 * real implementation and stores the response text of matching requests in
 * window.myAwesomeResponse.
 *
 * The function takes no arguments and returns nothing; it only reads and
 * writes properties of the global `window`. It is written in plain ES5 and is
 * self-contained so that it can be handed to `evaluate` and run inside the page.
 *
 * Only responses delivered through the `onreadystatechange` property are
 * captured. A request matches when the URL passed to `open` contains
 * "whatever" — adjust that filter for the page being scraped.
 */
function replaceXHR() {
    (function (window, debug) {
        // Formats an arguments object for the debug log.
        function args(a) {
            var s = "";
            for (var i = 0; i < a.length; i++) {
                s += "\t\n[" + i + "] => " + a[i];
            }
            return s;
        }

        // Returns the text body of a request, or null when there is none.
        // responseText is not read when responseType is something other than
        // "" or "text": browsers throw on that access, and an exception here
        // would prevent the page's own handler from running.
        function readResponseText(xhr) {
            var type = xhr.responseType;
            if (type && type !== "text") {
                return null;
            }
            return typeof xhr.responseText === "string" ? xhr.responseText : null;
        }

        var _XMLHttpRequest = window.XMLHttpRequest;
        window.XMLHttpRequest = function () {
            this.xhr = new _XMLHttpRequest();
        };

        // proxy ALL methods/properties
        var methods = [
            "open", "abort", "setRequestHeader", "send", "addEventListener",
            "removeEventListener", "getResponseHeader", "getAllResponseHeaders",
            "dispatchEvent", "overrideMimeType"
        ];
        methods.forEach(function (method) {
            window.XMLHttpRequest.prototype[method] = function () {
                if (debug) console.log("ARGUMENTS", method, args(arguments));
                if (method === "open") {
                    // remember the URL so the response can be filtered later
                    this._url = arguments[1];
                }
                return this.xhr[method].apply(this.xhr, arguments);
            };
        });

        // proxy change event handler
        Object.defineProperty(window.XMLHttpRequest.prototype, "onreadystatechange", {
            get: function () {
                // this will probably never called
                return this.xhr.onreadystatechange;
            },
            set: function (onreadystatechange) {
                var that = this.xhr;
                var realThis = this;
                that.onreadystatechange = function () {
                    // request is fully loaded
                    if (that.readyState === 4) {
                        var text = readResponseText(that);
                        if (debug) console.log("RESPONSE RECEIVED:", text !== null ? text.length : "none");
                        // there is a response and filter execution based on url
                        var url = realThis._url;
                        if (text && typeof url === "string" && url.indexOf("whatever") !== -1) {
                            window.myAwesomeResponse = text;
                        }
                    }
                    // Pages may assign null to clear their handler; only call
                    // real functions, and pass the event object through.
                    if (typeof onreadystatechange === "function") {
                        return onreadystatechange.apply(that, arguments);
                    }
                };
            }
        });

        // Plain properties are forwarded to the wrapped request in both directions.
        var otherscalars = [
            "onabort", "onerror", "onload", "onloadstart", "onloadend",
            "onprogress", "ontimeout", "readyState", "response", "responseText",
            "responseType", "responseXML", "status", "statusText", "timeout",
            "upload", "withCredentials", "DONE", "UNSENT", "HEADERS_RECEIVED",
            "LOADING", "OPENED"
        ];
        otherscalars.forEach(function (scalar) {
            Object.defineProperty(window.XMLHttpRequest.prototype, scalar, {
                get: function () {
                    return this.xhr[scalar];
                },
                set: function (obj) {
                    this.xhr[scalar] = obj;
                }
            });
        });
    })(window, false);
}
如果您想一开始就捕获AJAX调用,则需要将其添加到第一个事件处理程序中
// Installs the XHR proxy on the "page.initialized" event so that AJAX calls
// issued right at the start are captured as well.
casper.on("page.initialized", function onPageInitialized() {
    this.evaluate(replaceXHR);
});
或者在需要时调用evaluate(replaceXHR)。
evaluate(replaceXHR)
控制流如下所示:
// Placeholder: use the full replaceXHR definition here.
function replaceXHR(){ /* from above*/ }

casper.start(yourUrl, function(){
    this.evaluate(replaceXHR);
});

/**
 * Reads window.myAwesomeResponse from the page context.
 *
 * It calls `casper.evaluate` rather than `this.evaluate` so that it works both
 * as a waitFor check and when called directly as a plain function (where
 * `this` is not the casper instance).
 *
 * @returns {*} whatever the page stored in window.myAwesomeResponse
 */
function getAwesomeResponse(){
    return casper.evaluate(function(){
        return window.myAwesomeResponse;
    });
}

// stops waiting if window.myAwesomeResponse is something that evaluates to true
casper.waitFor(getAwesomeResponse, function then(){
    var data = JSON.parse(getAwesomeResponse());
    // Do something with data
});

casper.run();
如上所述,我为XMLHttpRequest创建了一个代理,以便每次在页面上使用它时,我都可以对其进行处理。您抓取的页面使用xhr.onreadystatechange回调来接收数据。通过定义一个特定的setter函数来完成代理,该setter函数将接收到的数据写入window.myAwesomeResponse页面上下文中。您唯一需要做的就是检索此文本。
xhr.onreadystatechange
window.myAwesomeResponse
如果您知道前缀(即以加载到的JSON为参数被调用的函数,例如insert({"data":["Some", "JSON", "here"],"id":"asdasda"})),则为JSONP编写代理甚至更加容易。您可以在页面上下文中覆盖insert:
insert({"data":["Some", "JSON", "here"],"id":"asdasda"})
insert
页面加载后
// Wraps the page's JSONP callback `insert` after the page has loaded, then
// waits until a payload has been captured in window.myAwesomeResponse.
casper.start(url).then(function(){
    this.evaluate(function(){
        var oldInsert = insert;
        insert = function(json){
            window.myAwesomeResponse = json;
            // keep the page working: hand the payload on to the real callback
            return oldInsert.apply(window, arguments);
        };
    });
}).waitFor(getAwesomeResponse, function then(){
    // getAwesomeResponse relies on `this` being the casper instance, so it
    // must not be called as a bare function here.
    var raw = getAwesomeResponse.call(this);
    // A JSONP callback normally receives an already-parsed object; only
    // parse when the page actually handed over a string.
    var data = typeof raw === "string" ? JSON.parse(raw) : raw;
    // Do something with data
}).run();
或在接收到请求之前(如果在调用请求之前就注册了该函数)
// Wraps the page's JSONP callback `insert` as soon as the JSONP request is
// issued. Registering the listener must not call run(): casper has not been
// started at this point, and the script is run once at the end.
casper.on("resource.requested", function(resource){
    // filter on the correct call
    if (resource.url.indexOf(".jsonp") === -1) {
        return;
    }
    this.evaluate(function(){
        // Skip when the callback does not exist, or was already wrapped by
        // an earlier matching request.
        if (typeof insert !== "function" || insert.__capturesResponse) {
            return;
        }
        var oldInsert = insert;
        insert = function(json){
            window.myAwesomeResponse = json;
            return oldInsert.apply(window, arguments);
        };
        insert.__capturesResponse = true;
    });
});

casper.start(url).waitFor(getAwesomeResponse, function then(){
    // getAwesomeResponse relies on `this` being the casper instance, so it
    // must not be called as a bare function here.
    var raw = getAwesomeResponse.call(this);
    // A JSONP callback normally receives an already-parsed object; only
    // parse when the page actually handed over a string.
    var data = typeof raw === "string" ? JSON.parse(raw) : raw;
    // Do something with data
}).run();