The following code examples, extracted from open-source Python projects, illustrate how to use bs4.Comment().
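Most of the examples below follow the same basic pattern: parse the document, pass a predicate such as lambda text: isinstance(text, Comment) to find_all(), and then read or extract() the matched comment nodes. A minimal, self-contained sketch of that pattern follows; the sample HTML and variable names are illustrative only and do not come from the projects above.

from bs4 import BeautifulSoup, Comment

html = "<html><body><!-- build: 42 --><p>hello</p><!-- footer --></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Comment is a subclass of NavigableString, so an isinstance check on each
# string node is enough to pick out every HTML comment in the tree.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
print([str(c).strip() for c in comments])   # ['build: 42', 'footer']

# Remove the comments from the tree, as several of the examples below do.
for comment in comments:
    comment.extract()
print(soup)   # <html><body><p>hello</p></body></html>

The same predicate also works with find() when only the first comment is needed, which is how several examples below pull structured data (YAML front matter, hidden tables) out of a single comment node.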
def get_blog(cls, file_name):
    if cls.is_exist(file_name):
        with open(cls._real_file_name(file_name), 'r', encoding='utf-8') as f:
            txt = f.read()
            mtime = os.path.getmtime(cls._real_file_name(file_name))
        from bs4 import BeautifulSoup, Comment
        import yaml
        comment = BeautifulSoup(txt, "html.parser").find(
            text=lambda text: isinstance(text, Comment))
        if comment is not None:
            blog_info = yaml.load(comment)
            if 'use_toc' not in blog_info:
                blog_info['use_toc'] = False
            html = markdown(txt)
            return blog_info, txt, html, mtime
        else:
            return
    else:
        return
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)
    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None
    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    for l in soup:
        for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if "start of lyrics" in lyrics or "Usage" in lyrics:
                lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                return str(lyrics)
def get_overrides(soup):
    overrides = []
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        m = re.match(OVERRIDE_COMMENT_REGEX, comment)
        if m:
            new_overrides = m.group(1).split(",")
            new_overrides = [o.strip() for o in new_overrides]
            logger.info("Overrides found: %s" % new_overrides)
            overrides += new_overrides
    return overrides
def codeAnalyse(html, clas, name=""):
    soup = BeautifulSoup(html, "html.parser")
    source = soup.find('code', id="__cnt_0_4")
    soup = BeautifulSoup(str(source), "html.parser")
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    soup = BeautifulSoup(comments[0], "html.parser")
    source = soup.find('a', {"class": clas}, string=name)
    pos = 0
    for son in source.parent.find_next_sibling().find_next_siblings():
        pos = pos + 1
        print(source.string, ":", son.a.string, pos, son.a.attrs['href'])
def __get_navigable_strings(self, soup):
    if isinstance(soup, NavigableString):
        if type(soup) not in (Comment, Declaration) and soup.strip():
            yield soup
    elif soup.name not in ('script', 'style'):
        for c in soup.contents:
            for g in self.__get_navigable_strings(c):
                yield g
def text(self, target=None, ignore_pureascii_words=False):
    """
    Get all text in HTML, skipping scripts and comments.

    :param target: the BeautifulSoup object, default self.b
    :param ignore_pureascii_words: if set True, only return words that contain
        Chinese characters (may be useful for English-version websites)
    :return: list of str
    """
    if target is None:
        target = self.b
    from bs4 import Comment
    from bs4.element import NavigableString, Doctype
    result = []
    for descendant in target.descendants:
        if not isinstance(descendant, NavigableString) \
                or isinstance(descendant, Doctype) \
                or descendant.parent.name in ["script", "style"] \
                or isinstance(descendant, Comment) \
                or "none" in descendant.parent.get("style", "") \
                or "font-size:0px" in descendant.parent.get("style", ""):
            continue
        data = descendant.strip()
        if len(data) > 0:
            if not ignore_pureascii_words or any([ord(i) > 127 for i in data]):
                if PY2:
                    result.append(data.encode())
                else:
                    result.append(data)
    return result
def get_lyric(self, singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(singer, song)
    req = requests.get(url)
    s = BeautifulSoup(req.text, "lxml")
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        return None
    # Remove scripts
    [s.extract() for s in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # TODO: check if you need the encode/decode thing; if you do, wrap it in try/except
    # get output as string, remove non-unicode characters and replace <br> with newlines
    # output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode('utf-8').replace('\n', '').replace('<br/>', '\n')
    output = str(lyrics).replace('\n', '').replace('<br/>', '\n')[22:-6:]
    try:
        return output
    except:
        return output.encode('utf-8')
def lyricswikia(artist, song):
    # original code found @
    # https://github.com/geekpradd/PyLyrics/blob/master/PyLyrics/functions.py
    song = song.split(' - ', 1)[0]
    artist = artist.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(artist, song)
    print('Trying:', url)
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is not None:
        # Remove scripts
        [e.extract() for e in lyrics('script')]
        # Remove comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        # Get output as a string, remove non-unicode characters and replace
        # <br> with newlines
        lyrics = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode(
            "utf-8").replace('\n', '').replace('<br/>', '\n')
        try:
            return lyrics
        except:
            return lyrics.encode('utf-8')
def _get_commented_CDN_tags(self):
    def get_comment(s):
        return s if isinstance(s, Comment) and '//' in s and s.strip()[:4] in ['<lin', '<scr'] else ''

    comments = bs(self._get_template()).find_all(string=get_comment)
    tags = self._unitags(bs(str(comments)).select('link[href*="//"], script[src*="//"]'))
    if tags:
        for tag in tags:
            for comment in comments:
                if tag['open'] in comment and tag['ref'] in comment:
                    tag['comment'] = comment
    return tags
def strip_html_comments(html):
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(text=lambda text: isinstance(text, html_comment)):
        element.extract()
    return str(soup)
def findSalaries(self, soupped):
    total_salaries = []
    all_all_salaries = soupped.find("div", {"id": "all_all_salaries"})
    comments = all_all_salaries.find_all(string=lambda text: isinstance(text, Comment))
    raw_salary_rows = BeautifulSoup(comments[0], "lxml").find("tbody").find_all("tr")
    for each_raw_salary in raw_salary_rows:
        year = each_raw_salary.find("th").text.replace("-", "_").encode("utf8")
        salary = self.salaryTextToFloat(each_raw_salary.find_all("td")[2].text)
        total_salaries.append((year, salary))
    return total_salaries
def clean_tag(doc):
    for tag in doc.find_all(["style", "script", "form", "textarea", "input",
                             "iframe", "select", "frame", "link"]):
        tag.extract()
    comments = doc.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
async def search_esv(message, verse):
    """
    Search for a Bible passage from the English Standard Version.

    Example::

        bible Romans 12:16

    """
    r = await http.get("http://www.esvapi.org/v2/rest/passageQuery", params={
        "key": "IP",
        "passage": verse,
        "output-format": "crossway-xml-1.0",
        "include-simple-entities": "true",
    })
    doc = BeautifulSoup(r.text(), features="lxml")
    if not doc.passage:
        raise CommandError("Verse not found.")
    lines = []
    for verse_unit in doc.passage.content.find_all('verse-unit'):
        num = int(verse_unit.find('verse-num').text)
        woc = verse_unit.find('woc')
        if woc:
            text = woc.text
        else:
            text = "".join([str(node) for node in verse_unit.children
                            if isinstance(node, NavigableString)
                            and not isinstance(node, Comment)])
        lines.append("**{}** {}".format(num, text.strip()))
    return "\n".join(lines)
def unwrapUseless(soup):
    # unwrap tags that only carry formatting
    for a in soup.select('a'):
        a.unwrap()
    for a in soup.select('b'):
        a.unwrap()
    for a in soup.select('font'):
        a.unwrap()
    for a in soup.select('span'):
        a.unwrap()
    # drop HTML comments
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
def scrape_mythic_card_page(url):
    r = requests.get(url)
    soup = BS(r.text, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    card = {}
    for comment in comments:
        if comment == 'CARD NAME':
            card['name'] = comment.next_element.strip().replace('"', '')
        elif comment == 'MANA COST':
            try:
                card['manaCost'] = comment.next_element.strip().replace('"', '')
            except:
                pass
        elif comment == 'TYPE':
            card['type'] = comment.next_element.strip().replace('"', '')
        elif comment == 'CARD TEXT':
            buildText = ''
            for element in comment.next_elements:
                try:
                    if not element.strip() in ['CARD TEXT', 'FLAVOR TEXT', '']:
                        if buildText != '':
                            buildText += '\n'
                        buildText += element.strip()
                    if element.strip() == 'FLAVOR TEXT':
                        card['text'] = buildText
                        break
                except:
                    pass
        elif comment == 'Set Number':
            try:
                card['number'] = comment.next_element.strip()
            except:
                pass
        elif comment == 'P/T':
            try:
                if comment.next_element.strip().split('/')[0] != '':
                    card['power'] = comment.next_element.strip().split('/')[0]
                    card['toughness'] = comment.next_element.strip().split('/')[1]
            except:
                pass
    return card
def loadSearch(self, url, firstName='results'):
    """
    Loads the search page using the url provided and returns raw search results
    """
    print " inside loadSearch .."
    '''
    97.77.104.22:80
    174.129.204.124:80
    '''
    proxy = {
        "http": "209.222.25.83:3128",
    }
    headers = {'Accept-Encoding': 'identity'}
    html2 = requests.get(url, proxies=proxy, headers=headers)
    print "HTML 2"
    # print html2.content
    # html = html2.content
    html = self.loadPage(url)
    print "SPAGE"
    # print sPage[:200]
    spContent = BeautifulSoup(html)
    #title = spContent.find('title')
    #if title is not None:
    #    if title.string is not lSrchTitle:
    #        sys.exit('There is some problem with url provided, it does not correspond to Linkedin Search')
    comment = None
    comments = spContent.findAll(text=lambda text: isinstance(text, Comment))
    print "COMMENTS"
    # print comments
    # print " >> BEAUTIFULSOUP FINDALL"
    # print comments
    cLen = len(comments)
    print "Length of COmments" + cLen.__str__()
    if cLen > 0 and cLen > 11:
        comment = comments[11]
    if comment is None:
        for cmnt in comments:
            if firstName in cmnt:
                comment = cmnt
    print "output COMMENTS :"
    # print comment
    return comment
def dealLocalFile():
    rootDir = os.getcwd()
    list_dirs = os.walk(rootDir)
    for root, dirs, files in list_dirs:
        # for d in dirs:
        #     print os.path.join(root, d)
        for f in files:
            if f.endswith('html'):
                path = os.path.join(root, f)
                soup = BeautifulSoup(open(path), 'html.parser')
                soup = soup.body
                # remove HTML comments
                comments = soup.findAll(text=lambda text: isinstance(text, Comment))
                [comment.extract() for comment in comments]
                # unwrap span tags
                spans = soup.select("span")
                [span.unwrap() for span in spans]
                # unwrap font tags
                fonts = soup.select("font")
                [font.unwrap() for font in fonts]
                pps = soup.select("p")
                for pp in pps:
                    del pp['style']
                    # text = pp.get_text()
                    # text = text.strip()
                    # if text is '' or len(text) < 1:  # empty p tag, drop it
                    #     pp.extract()
                # imgs = soup.select("img")
                # for img in imgs:
                #     src = img['src']
                #     index = src.find('/')
                #     if index != -1:
                #         newSrc = 'imgs' + src[index:]
                #         img['src'] = newSrc
                #         # print newSrc
                ps = soup.select('p')
                title = ''
                for p in ps:
                    if p.get_text() != '' and len(p.get_text()) > 0:
                        title = p.get_text()
                        p.extract()
                        break
                # write the cleaned document out and close the file
                fo = open(title + ".html", "w")
                soup.prettify()
                fo.write(str(soup))
                fo.close()
                # print soup.prettify()
def _parse_tags(cls, html):
    excluded_tags = ['script', 'style', 'noscript', 'html', 'head', 'meta',
                     'link', 'body', 'input', 'form', 'a']
    minimum_text_node_length = 8
    y_data = []
    text_data = []
    tag_signatures = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.findAll():
        path = '.'.join(reversed([p.name for p in tag.parentGenerator() if p]))
        tag_signature = '.'.join([path, tag.name])
        if (tag.name not in excluded_tags) and ('table' not in path):
            tag_text = []
            for text in tag.contents:
                if isinstance(text, Comment):
                    continue
                try:
                    text = text.strip()
                    aux = BeautifulSoup(text, 'html.parser')
                    if aux.find() is None:
                        tag_text.append(text)
                except Exception as e:
                    pass
            tag_text = "\n".join(tag_text)
            if tag_text and len(tag_text) > minimum_text_node_length:
                if tag_text not in text_data:
                    # Remove line returns and tabs
                    tag_text = cls._remove_chars(tag_text)
                    if tag_text:
                        y_data.append(len(tag_text))
                        text_data.append(tag_text)
                        tag_signatures.append(path)
    x = np.array(y_data)
    return x, text_data, tag_signatures