The following 14 code examples, extracted from open-source Python projects, illustrate how to use bs4.element.NavigableString().
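Before the project examples, here is a minimal self-contained sketch of the pattern most of them share: iterating a tag's contents and keeping only the NavigableString children (the text nodes between tags).

from bs4 import BeautifulSoup
from bs4.element import NavigableString

soup = BeautifulSoup("<p>hello <b>bold</b> world</p>", "html.parser")

# A tag's .contents mixes Tag and NavigableString children;
# isinstance() separates the raw text nodes from the markup.
texts = [t for t in soup.p.contents if isinstance(t, NavigableString)]
print(texts)  # ['hello ', ' world']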
def parseStatus(rtext):
    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        upperText = text.strip().upper()
        originalText = upperText
        for char in CHARS_TO_IGNORE:
            upperText = upperText.replace(char, "")
        upperWords = upperText.split()
        if (("CLEAR" in upperWords or "CLR" in upperWords)
                and not originalText.endswith("?")):
            return states.CLEAR
        elif ("STAT" in upperWords or "STATUS" in upperWords):
            return states.REQUEST
        elif ("?" in originalText):
            return states.REQUEST
        elif (text.strip().upper() in ("BLUE", "BLUES ONLY", "ONLY BLUE",
                                       "STILL BLUE", "ALL BLUES")):
            return states.CLEAR
def parseShips(rtext):
    def formatShipName(text, word):
        newText = u"""<span style="color:#d95911;font-weight:bold"> {0}</span>"""
        text = text.replace(word, newText.format(word))
        return text

    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        upperText = text.upper()
        for shipName in evegate.SHIPNAMES:
            if shipName in upperText:
                hit = True
                start = upperText.find(shipName)
                end = start + len(shipName)
                # reject matches that are embedded in a longer word
                if ((start > 0 and upperText[start - 1] not in (" ", "X")) or
                        (end < len(upperText) - 1 and upperText[end] not in ("S", " "))):
                    hit = False
                if hit:
                    shipInText = text[start:end]
                    formatted = formatShipName(text, shipInText)
                    textReplace(text, formatted)
                    return True
def _censorNaviStrCandidateWithTemplate(self, candi_str, template_str, template_var_cache):
    if not type(candi_str) == element.NavigableString or not type(template_str) == element.NavigableString:
        return False
    matchObj = self.RegPattern.search(template_str)
    if matchObj is not None:
        varName = matchObj.group(1)
        varValue = None
        subed_tmpl_str = self.RegPattern.sub('(.+)', template_str)
        reg2 = re.compile(subed_tmpl_str)
        self.logger.debug('subed tmpl reg2 = %s', reg2)
        mo2 = reg2.match(candi_str)
        if mo2 is not None:
            varValue = mo2.group(1)
            self._procTemplateVariable(varName, varValue, template_var_cache)
        else:
            return False
    elif not candi_str == template_str:
        return False
    return True
def is_text(self):
    '''
    Check if this element is a text node.

    Comments and processing instructions are also instances of
    NavigableString, so we have to make additional checks.
    '''
    if not isinstance(self.context, NavigableString):
        return False
    if (
        self.is_comment() or
        self.is_doctype() or
        self.is_processing_instruction()
    ):
        return False
    return True
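The subclass relationship this method guards against is easy to see in isolation; a minimal sketch:

from bs4 import BeautifulSoup, Comment
from bs4.element import NavigableString

soup = BeautifulSoup("<p><!-- a comment -->plain text</p>", "html.parser")
for node in soup.p.contents:
    # a Comment passes the NavigableString check, hence the extra tests above
    print(type(node).__name__,
          isinstance(node, NavigableString),
          isinstance(node, Comment))
# Comment True True
# NavigableString True False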
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in the HTML, skipping scripts and comments.

    :param target: the BeautifulSoup object, default self.b
    :param ignore_pureascii_words: if set True, only return words that
        contain Chinese characters (may be useful for English-version websites)
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any([ord(i) > 127 for i in data]):
                if PY2:
                    result.append(data.encode())
                else:
                    result.append(data)
    return result
def parseUrls(rtext):
    def findUrls(s):
        # yes, this is faster than regex and less complex to read
        urls = []
        prefixes = ("http://", "https://")
        for prefix in prefixes:
            start = 0
            while start >= 0:
                start = s.find(prefix, start)
                if start >= 0:
                    stop = s.find(" ", start)
                    if stop < 0:
                        stop = len(s)
                    urls.append(s[start:stop])
                    start += 1
        return urls

    def formatUrl(text, url):
        newText = u"""<a style="color:#28a5ed;font-weight:bold" href="link/{0}">{0}</a>"""
        text = text.replace(url, newText.format(url))
        return text

    texts = [t for t in rtext.contents if isinstance(t, NavigableString)]
    for text in texts:
        urls = findUrls(text)
        for url in urls:
            textReplace(text, formatUrl(text, url))
            return True
def getSoupStringConcat(soupTag):
    '''
    Beautiful Soup tags return their content text in the .string attribute
    only if there is a single string child. Some unfortunate cases on SCOTUSblog
    have more than one child string, and this helper just concatenates them.

    :param soupTag: a bs4 tag that contains one or more strings
    :return: a string containing all string children of soupTag, concatenated
    '''
    if isinstance(soupTag, NavigableString):
        return soupTag.string
    result = ""
    for t in soupTag.descendants:
        # only include NavigableStrings (work around .string's default searching behavior)
        if t.string is not None and isinstance(t, NavigableString):
            if t.parent.name != "script":  # prevent reading js
                result = result + t.string
    return result
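The .string behavior that motivates this helper can be shown in a couple of lines; a minimal sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>one<b>two</b>three</p>", "html.parser")
print(soup.p.string)    # None: <p> has more than one child
print(soup.p.b.string)  # 'two': exactly one string child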
def is_whitespace_string(elem):
    return isinstance(elem, NavigableString) and elem.strip() == ""
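For context, the whitespace this predicate matches comes from the formatting between tags, which the parser keeps as NavigableString nodes; a minimal sketch using the helper above:

from bs4 import BeautifulSoup
from bs4.element import NavigableString

soup = BeautifulSoup("<div>\n  <p>hi</p>\n</div>", "html.parser")

# the newline/indent runs around <p> are parsed as NavigableString nodes
ws = [c for c in soup.div.contents if is_whitespace_string(c)]
print(len(ws))  # 2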
def lex(source):
    """Convert source into a stream of (css_classes, token_string)."""
    soup = BeautifulSoup(coqdoc(source))
    root = soup.find(class_='code')
    strip_soup(root, is_whitespace_string)
    for elem in root.children:
        if isinstance(elem, NavigableString):
            yield [], elem
        elif elem.name == "span":
            cls = "coqdoc-{}".format(elem['type'])
            yield [cls], elem.string
        elif elem.name == 'br':
            pass
        else:
            raise ValueError(elem)
def _parseTagRecursive(self, candi_tag, template_tag, template_var_cache):
    for idx, tmpChild in enumerate(template_tag.contents):
        if tmpChild.name == 'lisp_pass':
            # this means <...>, indicating that anything in this tag
            # is expected to be ignored
            continue
        if len(candi_tag.contents) <= idx:
            return False
        candiChild = candi_tag.contents[idx]
        typeCandi = type(candiChild)
        typeTmp = type(tmpChild)
        valid = False
        if typeCandi == typeTmp == element.Tag:
            if self._censorTagCandidateWithTemplate(candiChild, tmpChild, template_var_cache):
                valid = self._parseTagRecursive(candiChild, tmpChild, template_var_cache)
        elif typeCandi == typeTmp == element.NavigableString:
            valid = self._censorNaviStrCandidateWithTemplate(
                candiChild, tmpChild, template_var_cache)
        if valid is False and len(template_var_cache) > 0:
            self.logger.warning(template_tag)
            self.logger.warning(candi_tag)
            self.logger.warning('censor not passed. cache will be cleared')
            template_var_cache.clear()
            return False
    return True
def text(self):
    '''
    Return the text contained in this element (if any).

    Convert the text characters to HTML entities.
    '''
    if not isinstance(self.context, NavigableString):
        return u''
    if self.is_comment():
        return unicode(self.context)
    return self.escaper.substitute_html(self.context.string)
def parse_article(self, url):
    raw = self.session.get(url, verify=False)
    soup = BeautifulSoup(raw.text, "lxml")
    try:
        article = {}
        article["Author"] = soup.select(".article-meta-value")[0].contents[0].split(" ")[0]
        article["Board"] = soup.select(".article-meta-value")[1].contents[0]
        article["Title"] = soup.select(".article-meta-value")[2].contents[0]
        article["Date"] = soup.select(".article-meta-value")[3].contents[0]

        content = ""
        for tag in soup.select("#main-content")[0]:
            if type(tag) is NavigableString and tag != '\n':
                content += tag
                break
        article["Content"] = content

        findIPtag = u'※ 發信站:'  # deal with the different IP formats
        try:
            ip_temp = soup.find(string=re.compile(findIPtag))
            ip_temp = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip_temp).group()
        except:
            try:
                ip_temp = 'NA'
                f2_content = soup.select('.f2')
                for content in f2_content:
                    if findIPtag in content.contents[0]:
                        ip_temp = content.next_sibling.split()[-1]
                        break
            except:
                ip_temp = 'NA'
        article["IP"] = ip_temp

        upvote = 0
        downvote = 0
        novote = 0
        response_list = []
        for response_struct in soup.select(".push"):
            if "warning-box" not in response_struct['class']:
                response_dic = {}
                response_dic["Content"] = response_struct.select(".push-content")[0].contents[0][1:]
                response_dic["Vote"] = response_struct.select(".push-tag")[0].contents[0][0]
                response_dic["User"] = response_struct.select(".push-userid")[0].contents[0]
                response_list.append(response_dic)
                if response_dic["Vote"] == u"推":
                    upvote += 1
                elif response_dic["Vote"] == u"噓":
                    downvote += 1
                else:
                    novote += 1
        article["Responses"] = response_list
        article["UpVote"] = upvote
        article["DownVote"] = downvote
        article["NoVote"] = novote
    except Exception as e:
        print(e)
        print(u"error in: %s " % url)
    return article
def check_html(runner, html, key=None, app=None, check_html=True, check_classes=True):
    caller = stack()[1]
    filepos = '{}:{:d}'.format(caller.filename.rpartition('/')[2], caller.lineno)
    app = app or filepos.partition('_')[2].partition('.')[0]
    if key:
        filepos += '-{}'.format(key)
    store = []
    soup = BeautifulSoup(html, 'html.parser')
    for desc in soup.descendants:
        if isinstance(desc, Tag):
            name = desc.name
            attrs = desc.attrs
            store.append(name)
            for attr in sorted(attrs):
                tag = str(attrs.get('name'))
                if name == 'input' and tag == 'csrfmiddlewaretoken' and attr == 'value':
                    continue
                store.append(attr)
                val = attrs[attr]
                if check_classes and attr == 'class':
                    for cls in val:
                        if cls:
                            runner.assertIn(cls, CLASS_ARRAY[app], msg=filepos)
                if isinstance(val, list):
                    store.extend(sorted(val))
                elif (isinstance(val, str) and not (val.startswith(STATIC_URL)
                        or ('date' in tag and attr == 'value'))):
                    if '?' in val:
                        part = val.rpartition('?')
                        store.append(part[0])
                        for arg in sorted(part[2].split('&')):
                            store.append(arg)
                    else:
                        store.append(val)
        elif isinstance(desc, NavigableString):
            store.append(str(desc))
    string = ' '.join(' '.join(store).split())
    hsh = md5(string.encode()).hexdigest()[:HASH_LEN]
    if check_html:
        if WRITE_CHECKFILE:
            print(filepos, hsh, file=CHECKFILE)
        elif CHECK_HTML:
            runner.assertIn(filepos, CHECK_ARRAY, msg=filepos)
            runner.assertEqual(CHECK_ARRAY[filepos][:HASH_LEN], hsh, msg=filepos)