我目前正在创建一个Node.js webscraper / proxy,但是在解析在源脚本部分中找到的相对Urls时遇到了麻烦,我发现REGEX可以解决问题。虽然不知道我将如何实现。
反正有什么我可以解决的吗?
另外,我也愿意采用更简单的方法来实现这一点,因为我不太清楚其他代理是如何解析网站的。我认为大多数代理只不过是美化过的网站抓取工具:读取网站的源代码,然后把所有链接/表单改为经由代理中转。
给OP的说明(因为他请求了这样的功能):将 base_url 更改为代理的基础URL(base URL),即可得到所需的结果。
base_url
下面将显示两个功能(代码中包含使用指南)。确保您不跳过此答案的任何部分以完全理解该功能的行为。
rel_to_abs(url)
../
./
.
//
replace_all_rel_by_abs
url()
rel_to_abs - 解析相对网址
rel_to_abs
/**
 * Resolves a URL reference against the current document location.
 *
 * Reads `location.href`, `location.protocol` and `location.host`; replace
 * those (or the derived base below) with the proxy's base URL to resolve
 * against a different document.
 *
 * @param {string} url  URL reference as found in the HTML source.
 * @returns {string} The absolute URL. Already-absolute URLs with a trusted
 *                   scheme are returned unchanged; an empty or
 *                   whitespace-only reference yields "".
 */
function rel_to_abs(url){
    /* Only accept commonly trusted protocols. Of the data: URLs, only
     * data-images are accepted. Exotic flavours (escaped slash,
     * html-entitied characters) are not supported to keep the function fast.
     * The data:image alternative must NOT be followed by another ":" --
     * a real data URL looks like "data:image/png;base64,...". */
    if (/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url)) {
        return url; // URL is already absolute
    }

    if (url.substring(0, 2) === "//") {
        return location.protocol + url; // protocol-relative
    }
    if (url.charAt(0) === "/") {
        return location.protocol + "//" + location.host + url; // host-relative
    }
    if (/^\s*$/.test(url)) {
        return ""; // Empty = return nothing
    }

    /* Split the current location into origin, path and query. The fragment
     * never takes part in resolution; the query only survives for
     * fragment-only references. */
    var withoutFragment = location.href.replace(/#.*$/, "");
    var queryAt = withoutFragment.indexOf("?");
    var baseQuery = queryAt === -1 ? "" : withoutFragment.substring(queryAt);
    var basePage = queryAt === -1 ? withoutFragment : withoutFragment.substring(0, queryAt);

    var originMatch = /^[a-z][a-z0-9+.-]*:\/\/[^\/]*/i.exec(basePage);
    var origin = originMatch ? originMatch[0] : location.protocol + "//" + location.host;
    var basePath = (originMatch ? basePage.substring(origin.length) : "") || "/";

    /* Keep the reference's own query/fragment out of the dot-segment
     * handling: "../" inside a query string is data, not a path step. */
    var suffixAt = url.search(/[?#]/);
    var relPath = suffixAt === -1 ? url : url.substring(0, suffixAt);
    var suffix = suffixAt === -1 ? "" : url.substring(suffixAt);

    var resolved;
    if (relPath === "") {
        /* "?q" or "#frag": same document, different query / fragment. */
        resolved = origin + basePath + (suffix.charAt(0) === "#" ? baseQuery : "") + suffix;
    } else {
        /* Directory of the current document: everything up to and including
         * the last "/". A location ending in "/" is its own directory. */
        var baseDir = basePath.substring(0, basePath.lastIndexOf("/") + 1);
        var segments = (baseDir + relPath).split("/");
        var kept = [];
        for (var i = 0; i < segments.length; i++) {
            var segment = segments[i];
            var isLast = i === segments.length - 1;
            if (segment === "..") {
                /* kept[0] is the empty root segment and must never be popped,
                 * otherwise "../" could climb above the host. */
                if (kept.length > 1) kept.pop();
                if (isLast) kept.push(""); // ".." names a directory
            } else if (segment === ".") {
                if (isLast) kept.push(""); // "." names a directory
            } else {
                kept.push(segment);
            }
        }
        resolved = origin + kept.join("/") + suffix;
    }

    /* Escape certain characters to prevent XSS */
    return resolved.replace(/"/g, "%22").replace(/'/g, "%27")
                   .replace(/</g, "%3C").replace(/>/g, "%3E");
}
案例/示例:
http://foo.bar
/doo
./meh
../booh
该函数先给相对路径加上 ../ 前缀,然后执行搜索和替换(把 http://domain/sub/anything-but-a-slash/../me 转换为 http://domain/sub/me)。
http://domain/sub/anything-but-a-slash/../me
http://domain/sub/me
replace_all_rel_by_abs - 转换所有相关的URL。脚本内部的URL(<script>、事件处理程序)不会被替换,因为几乎不可能创建一个既快速又安全的过滤器来解析JavaScript。
<script>
该脚本内部带有一些注释。动态创建正则表达式,因为单个RE的大小可以为 3000个 字符。<meta http-equiv=refresh content=.. >可以以各种方式混淆,因此RE的大小。
<meta http-equiv=refresh content=.. >
/**
 * Replaces relative URLs inside an HTML string by absolute ones.
 *
 * Handled locations: <meta http-equiv=refresh content=...>, href=, src=,
 * <object data=>, <applet codebase=>, <param name=movie value=>,
 * url(...) inside <style> elements and url(...) inside style attributes.
 * Every URL found is passed through rel_to_abs(), which must be in scope.
 *
 * @param {string} html  HTML source to process.
 * @returns {string} The HTML with the matched URLs rewritten.
 */
function replace_all_rel_by_abs(html){
    /* HTML/XML attribute may not be prefixed by these characters (common
       attribute chars). This list is not complete, but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
    var att = "[^-a-z0-9:._]";

    /* Pieces of numeric/named character references. They are assembled by
       concatenation on purpose: a literal ampersand-hash sequence in this
       source can be decoded by HTML-processing tools, which silently turns
       the pattern into garbage that never matches. */
    var amp = "&";
    var decRef = amp + "#0*";   /* decimal reference, optional leading zeros */
    var hexRef = amp + "#x0*";  /* hex reference, optional leading zeros */
    var entityEnd = "(?:;|(?!\\d))";

    /* Placeholders to filter obfuscations. Also serves as ae()'s cache. */
    var ents = {
        " ": "(?:\\s|" + amp + "nbsp;?|" + decRef + "32" + entityEnd + "|" + hexRef + "20" + entityEnd + ")",
        "(": "(?:\\(|" + decRef + "40" + entityEnd + "|" + hexRef + "28" + entityEnd + ")",
        ")": "(?:\\)|" + decRef + "41" + entityEnd + "|" + hexRef + "29" + entityEnd + ")",
        ".": "(?:\\.|" + decRef + "46" + entityEnd + "|" + hexRef + "2e" + entityEnd + ")"
    };
    var charMap = {};           /* per-character cache for ae() */
    var s = ents[" "] + "*";    /* Short-hand for common use */
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /* ^ Important: Must be pre- and postfixed by < and >.
     *   This RE should match anything within a tag! */

    /**
     * Converts a string into a RegExp source that matches, per character,
     * either the character itself or its decimal/hex character reference
     * (both letter cases).
     * @param {string} string  String to convert.
     * @returns {string} RegExp source fragment.
     */
    function ae(string){
        if (ents[string]) return ents[string];
        var lower = string.toLowerCase();
        var upper = string.toUpperCase();
        var RE_res = "";
        for (var i = 0; i < string.length; i++) {
            var lc = lower.charAt(i);
            if (charMap[lc]) {
                RE_res += charMap[lc];
                continue;
            }
            var uc = upper.charAt(i);
            var RE_sub = [lc];
            RE_sub.push(decRef + lc.charCodeAt(0) + entityEnd);
            RE_sub.push(hexRef + lc.charCodeAt(0).toString(16) + entityEnd);
            if (lc !== uc) {
                /* The literal character relies on the RE ignorecase flag;
                   the code points of the other case must be listed. */
                RE_sub.push(decRef + uc.charCodeAt(0) + entityEnd);
                RE_sub.push(hexRef + uc.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_res += (charMap[lc] = "(?:" + RE_sub.join("|") + ")");
        }
        return (ents[string] = RE_res);
    }

    /**
     * 2nd argument for replace(): rewrites the URL captured in group 2.
     * Note that this function can also be used to remove links:
     *   return group1 + "javascript://" + group3;
     */
    function by(match, group1, group2, group3){
        return group1 + rel_to_abs(group2) + group3;
    }

    /* "i" flag: hex character references may be written in either case. */
    var slashRE = new RegExp(ae("/"), "gi");
    var dotRE = new RegExp(ae("."), "gi");

    /**
     * 2nd argument for replace(). Like by(), but first decodes the
     * character references for "/" and "." inside the captured URL.
     */
    function by2(match, group1, group2, group3){
        group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
        return group1 + rel_to_abs(group2) + group3;
    }

    /**
     * Selects HTML substrings and performs a search-and-replace on an
     * attribute (double-quoted, single-quoted and unquoted values).
     * @param {string|RegExp} selector  HTML substring to match.
     * @param {string} attribute  RegExp-escaped; attribute name to match.
     * @param {string} [marker]  RegExp-escaped; what separates name and
     *                           value. Defaults to "=" with optional spaces.
     * @param {string} [delimiter]  RegExp-escaped characters that are
     *                           placed INSIDE a character class: extra
     *                           characters that end a value.
     * @param {string} [end]  RegExp-escaped; forces the match to end before
     *                        an occurrence of <end>.
     */
    function cr(selector, attribute, marker, delimiter, end){
        if (typeof selector === "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        marker = typeof marker === "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter === "string" ? delimiter : "";
        /* With <end> the value is matched lazily and <end> becomes group 3;
           without it group 3 is empty. */
        end = typeof end === "string" ? "?)(" + end : ")(";
        var re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        var re2 = new RegExp("(" + attribute + marker + "')([^'" + delimiter + "]+" + end + ")", 'gi');
        var re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');
        html = html.replace(selector, function(match){
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    /**
     * Selects an attribute of an HTML element and performs a
     * search-and-replace on certain parts of its value.
     * @param {string|RegExp} selector  HTML element to match.
     * @param {string} attribute  RegExp-escaped; attribute name to match.
     * @param {string} front  RegExp-escaped; prefix inside the attribute
     *                        value that precedes the URL.
     * @param {string} [flags]  RegExp flags for the value patterns,
     *                        default "gi".
     * @param {string} [delimiter]  RegExp-escaped characters placed INSIDE
     *                        a character class; enables matching of
     *                        unquoted URLs up to one of these characters.
     * @param {string} [end]  RegExp-escaped; forces the unquoted match to
     *                        end before an occurrence of <end>.
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if (typeof selector === "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        flags = typeof flags === "string" ? flags : "gi";
        var re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        var re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        var at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        var at2 = new RegExp("(" + front + ")([^']+)(')", flags);
        var at3 = null;
        if (typeof delimiter === "string") {
            end = typeof end === "string" ? end : "";
            at3 = new RegExp("(" + front + ")([^\"'][^" + delimiter + "]*" + (end ? "?)(" + end + ")" : ")()"), flags);
        }
        function handleAttr(match, g1, g2){
            var value = g2.replace(at1, by2).replace(at2, by2);
            if (at3) value = value.replace(at3, by2);
            return g1 + value;
        }
        html = html.replace(selector, function(match){
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i");

    cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */

    /* <param name=movie value= > */
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value");

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */

    /* < style=" url(...) " >
       The delimiter ends up inside a character class, so it must be a list
       of characters (whitespace and ")"), not a pattern: an alternation
       there would exclude every letter the pattern happens to contain. */
    cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, "\\s)", ae(")"));
    return html;
}
私有功能的简短摘要:
rel_to_abs(url)
replace_all_rel_by_abs(html) -用绝对URL替换HTML字符串中所有与URL相关的出现。
replace_all_rel_by_abs(html)
ae
by
cr - Create Replace(创建替换):创建并执行一次搜索和替换。示例:任意HTML标记内的 href="..."。
cr
href="..."
cri - Create Replace Inline(创建内联替换):创建并执行一次搜索和替换。示例:HTML标记的 style 属性内的所有 url(..)。
cri
url(..)
style
打开任何页面,然后将以下书签粘贴到位置栏中:
javascript:void(function(){var loader=document.createElement("script");loader.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(loader)}());
注入的代码包含上面定义的两个功能,以及如下所示的测试用例。 注意 :测试用例 不会 修改页面的HTML,但是会在文本区域(可选)中显示已解析的结果。
/* Test case: times replace_all_rel_by_abs() on the current page's HTML.
   The page itself is not modified; on confirmation the parsed result is
   shown in a full-screen textarea, which is removed by a double click. */
var t = (new Date).getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
if (confirm((new Date).getTime() - t + " milliseconds to execute\n\nPut results in new textarea?")) {
    var txt = document.createElement("textarea");
    /* Every statement is explicitly terminated: the script must stay valid
       when it is collapsed onto a single line. */
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    txt.ondblclick = function(){ this.parentNode.removeChild(this); };
    txt.value = result;
    document.body.appendChild(txt);
}