The following 49 code examples, extracted from open-source Python projects, illustrate how to use html.parser.HTMLParser().
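Before the project examples, here is a minimal self-contained sketch of the usage pattern most of them share: subclass HTMLParser, override the handler callbacks, and pass markup to feed(). The class name TitleExtractor and the sample markup are illustrative only and do not come from any of the projects below.

from html.parser import HTMLParser

class TitleExtractor(HTMLParser):
    """Collect the text inside the first <title> element."""

    def __init__(self):
        super().__init__()
        self._in_title = False
        self.title = ""

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        # With the default convert_charrefs=True, entities such as &amp;
        # are already decoded before reaching this callback.
        if self._in_title:
            self.title += data

parser = TitleExtractor()
parser.feed("<html><head><title>Hello &amp; welcome</title></head></html>")
print(parser.title)  # Hello & welcome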
def handle_charref(self, name):
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed in all supported versions.
    # http://bugs.python.org/issue13633
    if name.startswith('x'):
        real_name = int(name.lstrip('x'), 16)
    elif name.startswith('X'):
        real_name = int(name.lstrip('X'), 16)
    else:
        real_name = int(name)

    try:
        data = chr(real_name)
    except (ValueError, OverflowError) as e:
        data = "\N{REPLACEMENT CHARACTER}"

    self.handle_data(data)
def feed(self, markup):
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError as e:
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def getXKCDImageTitle(html):
    comicBlock = find_last_between(html, 'div id="comic"', "</div>")
    if not comicBlock:
        return None

    imageTitle = find_last_between(comicBlock, "alt=", ">")

    # Drop srcset= if there
    imageTitle = imageTitle.split('srcset=')[0]

    h = HTMLParser()
    imageTitle = h.unescape(imageTitle)
    imageTitle = imageTitle.replace('"', '').strip()
    imageTitle = imageTitle.replace('/', '').strip()
    return imageTitle

# Garfield Minus Garfield Methods
def get_steps(protocol_id):
    """
    Get steps of a protocol
    :param protocol_id: int, protocol id
    :return: list, list of unresolved steps
    """
    step_list = []
    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    html_parser = HTMLParser()
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # priority for self-compiled tool
        software_path = os.path.join(os.path.join(os.path.join(workspace_path, str(step.user_id)), 'bin'), str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
        })
    return step_list
def unescape_html(html_):
    """
    Replace HTML entities (e.g. `&pound;`) in a string.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639

    if sys.version_info.major == 2:  # 2.7
        # noinspection PyUnresolvedReferences,PyCompatibility
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_)

    if sys.version_info.minor == 3:  # 3.3
        # noinspection PyCompatibility
        from html.parser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)

    # 3.4+
    # noinspection PyCompatibility
    import html
    return html.unescape(html_)
def _get_links(url):
    class LinkParser(HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                attrs = dict(attrs)
                links.add(attrs.get('href').rstrip('/'))

    links = set()
    try:
        log.debug('Getting links from %s' % url)
        u = urlopen(url)
        parser = LinkParser()
        parser.feed(u.read().decode('utf-8'))
    except Exception as e:
        log.debug('Could not get links. %s', e)
    log.debug('links: %r', links)
    return links
def _provider_auth(self, url, qs, username, password, html):
    url += '?sid=0'

    # prepare auth
    r = self.session.post(url + '&id=tve&option=credential',
                          proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

    # authenticate
    post_data = {
        'option': 'credential',
        'urlRedirect': url,
        'Ecom_User_ID': username,
        'Ecom_Password': password,
    }
    r1 = self.session.post(url, data=post_data, proxies=self.proxy,
                           headers={'Accept-Encoding': 'gzip'})
    r2 = self.session.get(url, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

    try:
        html_parser = HTMLParser.HTMLParser()
        redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
        argsre = dict([(match.group(1), html_parser.unescape(match.group(2)))
                       for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
        return self.session.post(redirurl, data=argsre, proxies=self.proxy,
                                 headers={'Accept-Encoding': 'gzip'})
    except:
        raise Exception('Invalid user name or password.')
def zeroclick(irc, source, msgtarget, args):
    params = {"q": args[0]}
    url = "http://duckduckgo.com/lite/?"
    #try:
    data = requests.get(url, params=params).content.decode()
    search = re.findall("""\t<td>.\t\s+(.*?).<\/td>""", data, re.M | re.DOTALL)
    if search:
        answer = HTMLParser().unescape(search[-1].replace("<br>", " ").replace("<code>", " ").replace("</code>", " "))
        answer = re.sub("<[^<]+?>", " ", answer)
        out = re.sub("\s+", " ", answer.strip())
        if out:
            #if len(out.split(" More at")[0].split("}")[-1].strip()) < 400:
            irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip())
            #else:
            #    irc.msg(source.split("!")[0], out.split(" More at")[0].split("}")[-1].strip())
        else:
            irc.msg(msgtarget, "No results")
    else:
        irc.msg(msgtarget, "No results found.")
def request_first_token(self):
    class Parser(HTMLParser):
        def __init__(p_self):
            p_self.token = None
            super().__init__()

        def handle_starttag(p_self, tag, attrs):
            attrs = dict(attrs)
            if attrs.get("id") == "recaptcha-token":
                p_self.token = attrs.get("value")

    text = self.get("anchor", params={"co": self.co}).text
    parser = Parser()
    parser.feed(text)
    if not parser.token:
        raise RuntimeError(
            "Could not get first token. Response:\n{}".format(text),
        )

    self.first_token = parser.token
    self.current_token = self.first_token
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))
def getImageTitle(html):
    imageTitle = find_between(html, "data-title=", "data-tags=")

    h = HTMLParser()
    imageTitle = h.unescape(imageTitle)
    #print(h.unescape(imageTitle))

    return imageTitle.replace('"', '').strip()

# C&H Methods
def strip_tags(self, html):
    parser = HTMLParser()
    html = parser.unescape(html)
    s = MLStripper()
    s.feed(html)
    return s.get_data()
def feed(self, data):
    data = data.replace("</' + 'script>", "</ignore>")
    HTMLParser.HTMLParser.feed(self, data)
def close(self):
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    self.outtext = self.outtext.join(self.outtextlist)

    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))
    else:
        nbsp = u' '
    self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

    return self.outtext
def unescape_html(content):
    if unescape is not None:
        return unescape(content)
    else:
        return HTMLParser().unescape(content)
def check(self, silent=None):
    if silent is None:
        silent = positive(self.parameters['SILENT'].value)
    result = CHECK_PROBABLY

    # check parameters
    silent = positive(self.parameters['SILENT'].value)
    if not positive(self.parameters['BACKGROUND'].value) and not negative(self.parameters['BACKGROUND'].value):
        if not silent:
            log.err('Bad %s value: %s.', 'BACKGROUND', self.parameters['BACKGROUND'].value)
        result = CHECK_FAILURE

    # can import urllib and html.parser?
    try:
        from urllib.request import urlretrieve
    except:
        if not silent:
            log.err('Cannot import urllib.request library (urllib5).')
            # TODO other ways?
        result = CHECK_FAILURE
    try:
        from html.parser import HTMLParser
    except:
        if not silent:
            log.err('Cannot import html.parser.')
        result = CHECK_FAILURE
    return result
def __init__(self, out=None, baseurl=''):
    HTMLParser.HTMLParser.__init__(self)

    if out is None:
        self.out = self.outtextf
    else:
        self.out = out
    self.outtextlist = []  # empty list to store output characters before they are "joined"
    try:
        self.outtext = unicode()
    except NameError:  # Python3
        self.outtext = str()
    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.a = []
    self.astack = []
    self.acount = 0
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    self.abbr_title = None  # current abbreviation definition
    self.abbr_data = None  # last inner HTML (for abbr being defined)
    self.abbr_list = {}  # stack of abbreviations to write later
    self.baseurl = baseurl

    if options.google_doc:
        del unifiable_n[name2cp('nbsp')]
        unifiable['nbsp'] = '&nbsp_place_holder;'
def close(self):
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    self.outtext = self.outtext.join(self.outtextlist)

    if options.google_doc:
        self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ')

    return self.outtext
def __init__(self):
    html_parser.HTMLParser.__init__(self)
    self._in_anchor = False
    self.editions = []