The following 49 code examples, extracted from open-source Python projects, illustrate how to use HTMLParser.HTMLParser().
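Before the individual project examples, here is a minimal stand-alone sketch (my own illustration, not taken from any of the projects below; it assumes Python 2, where the module is spelled HTMLParser) of the two patterns that recur throughout: calling unescape() on a parser instance to decode HTML entities, and subclassing HTMLParser.HTMLParser to react to markup fed through feed(). The LinkCollector class and sample strings are hypothetical.

# Illustrative sketch only (Python 2 standard library); not from the projects below.
import HTMLParser

# Pattern 1: decode HTML entities with a parser instance's unescape() helper.
h = HTMLParser.HTMLParser()
print h.unescape('Tom &amp; Jerry &lt;3')   # -> Tom & Jerry <3

# Pattern 2: subclass HTMLParser and override the handle_* callbacks.
class LinkCollector(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)  # old-style base class, so no super()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag being opened
        if tag == 'a':
            self.links.extend(value for name, value in attrs if name == 'href')

collector = LinkCollector()
collector.feed('<p><a href="http://example.com">example</a></p>')
print collector.links                       # -> ['http://example.com']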
def __init__(self):
    super(EmuApi, self).__init__()

    self.service = 'EmuParadise'
    self.base_url = 'https://www.emuparadise.me'
    self.referrer = None
    self._parser = HTMLParser.HTMLParser()

    self.endpoints = ENDPOINTS
    self.response = self.get_response()
    self.search_regex = '<div class="roms">' \
                        '<a .*?href="(.*?)">(.*?)</a>.*?' \
                        '<a href="\/roms\/roms\.php\?sysid=(\d+)".*?class="sysname">' \
                        '(.*?)</a>.*?<b>Size:</b> (.*?) .*?</div>'
    self.download_url = 'http://direct.emuparadise.me/roms/get-download.php?gid={download_id}' \
                        '&token={token}' \
                        '&mirror_available=true'
    self.requires_arguments = True

    self.token = '211217baa2d87c57b360b9a673a12cfd'
def getXKCDImageTitle(html):
    comicBlock = find_last_between(html, 'div id="comic"', "</div>")
    if not comicBlock:
        return None

    imageTitle = find_last_between(comicBlock, "alt=", ">")

    # Drop srcset= if there
    imageTitle = imageTitle.split('srcset=')[0]

    h = HTMLParser()
    imageTitle = h.unescape(imageTitle)
    imageTitle = imageTitle.replace('"', '').strip()
    imageTitle = imageTitle.replace('/', '').strip()

    return imageTitle

# Garfield Minus Garfield Methods
def handle_charref(self, name):
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed in all supported versions.
    # http://bugs.python.org/issue13633
    if name.startswith('x'):
        real_name = int(name.lstrip('x'), 16)
    elif name.startswith('X'):
        real_name = int(name.lstrip('X'), 16)
    else:
        real_name = int(name)

    try:
        data = unichr(real_name)
    except (ValueError, OverflowError), e:
        data = u"\N{REPLACEMENT CHARACTER}"

    self.handle_data(data)
def feed(self, markup):
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
def get_steps(protocol_id):
    """
    Get steps of a protocol
    :param protocol_id: int, protocol id
    :return: list, list of unresolved steps
    """
    step_list = []
    steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
    html_parser = HTMLParser()
    workspace_path = settings['env']['workspace']
    for index, step in enumerate(steps):
        # priority for self-compiled tool
        software_path = os.path.join(os.path.join(os.path.join(workspace_path, str(step.user_id)), 'bin'),
                                     str(step.software))
        if os.path.exists(software_path) and os.path.isfile(software_path):
            step.software = software_path
        step_list.append({
            'id': index,
            'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
            'specify_output': step.specify_output,
            'hash': step.hash,
        })
    return step_list
def twitter_url(match, bot=None):
    # Find the tweet ID from the URL
    tweet_id = match.group(1)

    # Get the tweet using the tweepy API
    api = get_api(bot)
    if not api:
        return
    try:
        tweet = api.get_status(tweet_id)
        user = tweet.user
    except tweepy.error.TweepError:
        return

    # Format and return the text of the tweet
    text = " ".join(tweet.text.split())

    if user.verified:
        prefix = u"\u2713"
    else:
        prefix = ""

    time = timesince.timesince(tweet.created_at, datetime.utcnow())

    h = HTMLParser()

    return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, h.unescape(text), time)
def insert_to(project_url, destination, find_what, indent=0):
    url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
    response = urllib2.urlopen(url)
    if response.getcode() == 200:
        with open(destination, 'r') as dest:
            dest_contents = dest.readlines()
            lines = ''.join(dest_contents)
            content = HTMLParser().unescape(response.read())
            if content.replace(' ', '') in lines.replace(' ', ''):
                print_out('IGNORED', destination)
                return

        generated = []
        for line in dest_contents:
            generated.append(line)
            if line.lower().find(find_what.lower()) >= 0:
                spaces = len(line) - len(line.lstrip())
                for l in content.split('\n'):
                    if l:
                        generated.append('%s%s\n' % (' ' * (spaces + indent), l))

        with open(destination, 'w') as dest:
            for line in generated:
                dest.write(line)
            print_out('INSERT', destination)
def process(keyword, page):
    url = 'https://www.google.com/search?q=%s&start=%s&num=100' % (keyword, page * 100)
    urlinfos = []
    #urlinfo1={"url":"http://www.baidu.com/link?url=966OdUyxuwFJoAYx_XGYq7_FiVLcej4qEA3Q84e-lLAtLPRGGHA6tsNFNsTN9zka&wd=&eqid=a64931cc000026c3000000035994fd9e","title":"...","info":'...'}
    page = ct.crawlerTool.getPage(url)
    #print page
    #print url
    segments = ct.crawlerTool.getXpath('//div[@class="g"]', page)
    #print segments
    for segment in segments:
        #print segment
        try:
            urlinfo = {}
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            urlinfo['info'] = HTMLParser().unescape(
                ct.crawlerTool.extractorText(ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            #print urlinfo['url'], urlinfo['title'], urlinfo['info']
            urlinfos.append(urlinfo)
        except:
            print('error')
            traceback.print_exc()
    return {"urlinfos": urlinfos}
def ParseGTestXML(xml_content):
    """Parse gtest XML result."""
    results = []

    html = HTMLParser.HTMLParser()

    # TODO(jbudorick): Unclear how this handles crashes.
    testsuites = xml.etree.ElementTree.fromstring(xml_content)
    for testsuite in testsuites:
        suite_name = testsuite.attrib['name']
        for testcase in testsuite:
            case_name = testcase.attrib['name']
            result_type = base_test_result.ResultType.PASS
            log = []
            for failure in testcase:
                result_type = base_test_result.ResultType.FAIL
                log.append(html.unescape(failure.attrib['message']))

            results.append(base_test_result.BaseTestResult(
                '%s.%s' % (suite_name, TestNameWithoutDisabledPrefix(case_name)),
                result_type,
                int(float(testcase.attrib['time']) * 1000),
                log=('\n'.join(log) if log else '')))

    return results
def unescape_html(html_):
    """
    Replace HTML entities (e.g. `&pound;`) in a string.

    :param html_: The escaped HTML.
    :return: The input string with entities replaced.
    """
    # http://stackoverflow.com/a/2360639

    if sys.version_info.major == 2:  # 2.7
        # noinspection PyUnresolvedReferences,PyCompatibility
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_)

    if sys.version_info.minor == 3:  # 3.3
        # noinspection PyCompatibility
        from html.parser import HTMLParser
        # noinspection PyDeprecation
        return HTMLParser().unescape(html_)

    # 3.4+
    # noinspection PyCompatibility
    import html
    return html.unescape(html_)
def feeds(page_url):
    """Search the given URL for possible feeds, returning a list of them."""

    # If the URL is a feed, there's no need to scan it for links.
    if is_feed(page_url):
        return [page_url]

    data = fetch_url(page_url)
    parser = FeedFinder(page_url)
    try:
        parser.feed(data)
    except HTMLParser.HTMLParseError:
        pass
    found = parser.urls()

    # Return only feeds that feedparser can understand.
    return [feed for feed in found if is_feed(feed)]
def _provider_auth(self, url, qs, username, password, html):
    url += '?sid=0'

    # prepare auth
    r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy,
                          headers={'Accept-Encoding': 'gzip'})

    # authenticate
    post_data = {
        'option': 'credential',
        'urlRedirect': url,
        'Ecom_User_ID': username,
        'Ecom_Password': password,
    }
    r1 = self.session.post(url, data=post_data, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    r2 = self.session.get(url, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})

    try:
        html_parser = HTMLParser.HTMLParser()
        redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
        argsre = dict([(match.group(1), html_parser.unescape(match.group(2)))
                       for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
        return self.session.post(redirurl, data=argsre, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    except:
        raise Exception('Invalid user name or password.')
def get_url(domain, port, timeout):
    url_list = []
    if port == 443:
        surl = 'https://' + domain
    else:
        surl = 'http://' + domain
    res = urllib2.urlopen(surl, timeout=timeout)
    html = res.read()
    root_url = res.geturl()
    m = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
    if m:
        for url in m:
            ParseResult = urlparse.urlparse(url[1])
            if ParseResult.netloc and ParseResult.scheme:
                if domain == ParseResult.hostname:
                    url_list.append(HTMLParser.HTMLParser().unescape(url[1]))
            elif not ParseResult.netloc and not ParseResult.scheme:
                url_list.append(HTMLParser.HTMLParser().unescape(urlparse.urljoin(root_url, url[1])))
    return list(set(url_list))
def ParseGTestXML(xml_content):
    """Parse gtest XML result."""
    results = []

    html = HTMLParser.HTMLParser()

    # TODO(jbudorick): Unclear how this handles crashes.
    testsuites = xml.etree.ElementTree.fromstring(xml_content)
    for testsuite in testsuites:
        suite_name = testsuite.attrib['name']
        for testcase in testsuite:
            case_name = testcase.attrib['name']
            result_type = base_test_result.ResultType.PASS
            log = []
            for failure in testcase:
                result_type = base_test_result.ResultType.FAIL
                log.append(html.unescape(failure.attrib['message']))

            results.append(base_test_result.BaseTestResult(
                '%s.%s' % (suite_name, case_name),
                result_type,
                int(float(testcase.attrib['time']) * 1000),
                log=('\n'.join(log) if log else '')))

    return results
def test_cdata_with_closing_tags(self):
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for each CDATA.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    content = """<!-- not a comment --> &not-an-entity-ref;
    <a href="" /> </p><p> &amp; <span></span></style>
    '</script' + '>' </html> </head> </scripter>!"""
    for element in [' script', 'script ', ' script ',
                    '\nscript', 'script\n', '\nscript\n']:
        s = u'<script>{content}</{element}>'.format(element=element, content=content)
        self._run_check(s, [("starttag", "script", []),
                            ("data", content),
                            ("endtag", "script")],
                        collector=Collector)
def lrc2dict(lrc):
    time_stamps = re.findall(r'\[[^\]]+\]', lrc)
    html_parser = HTMLParser.HTMLParser()
    if time_stamps:
        # ????
        lyric = lrc
        for tplus in time_stamps:
            lyric = lyric.replace(tplus, '').replace('\r', '').replace('\n', '').replace('????', '').replace('???', '').replace('?????', '').replace('???', '').replace('??', '').replace('??', '').replace('??', '').replace('??', '')
        lyric = lyric.replace('???', '').replace('??', '').replace('????', '').replace('???', '').replace('??', '').replace('???', '')
        # ????
        # tplus: [02:31.79]
        # t      02:31.79
        # print lyric
        print html_parser.unescape(lyric)
        return html_parser.unescape(lyric)
    else:
        return ''
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, unicode):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
def downloader_html(self, url):
    '''
    :param url: the URL to download
    :return: the downloaded page content, or '0' if the download fails
    '''
    try:
        print url
        # Use the Baidu spider User-Agent so the target site serves the full page
        BAIDU_UA = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
        headers = {'User-Agent': BAIDU_UA}
        data = requests.get(url, headers=headers)
        html_parser = HTMLParser.HTMLParser()
        data = html_parser.unescape(data.text)
        return data
    except:
        print 'failed to download page:', url
        return '0'
def http_response(self, request, response):
    if not hasattr(response, "seek"):
        response = response_seek_wrapper(response)
    http_message = response.info()
    url = response.geturl()
    ct_hdrs = http_message.getheaders("content-type")
    if is_html(ct_hdrs, url, self._allow_xhtml):
        try:
            try:
                html_headers = parse_head(response, self.head_parser_class())
            finally:
                response.seek(0)
        except (HTMLParser.HTMLParseError, sgmllib.SGMLParseError):
            pass
        else:
            for hdr, val in html_headers:
                # add a header
                http_message.dict[hdr.lower()] = val
                text = hdr + ": " + val
                for line in text.split("\n"):
                    http_message.headers.append(line + "\n")
    return response
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        self.html = self.opened_file.read()
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        self.data = self.opened_file.read()
    self.user = user_id
    self.data = ujson.loads(self.data)
    self.urls = dict()
    self.tags_dict = dict()
    self.tags_set = set()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for x in self.check_duplicates_query:
        self.check_duplicates[x.main_url] = x
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def play_video(self, datas):
    url = datas.get('url')
    data = channel.get_url(url)

    regex = r"""src="(https://www.rtbf.be/auvio/embed/media[^"]+)"""
    iframe_url = re.findall(regex, data)[0]
    data = channel.get_url(iframe_url)

    regex = r"""data-media="([^"]+)"""
    media = re.findall(regex, data)[0]

    h = HTMLParser.HTMLParser()
    media_json = h.unescape(media)

    regex = r""""high":"([^"]+)"""
    all_url = re.findall(regex, media_json)
    if len(all_url) > 0:
        video_url = all_url[0]
    else:
        regex = r"""url":"([^&]+)"""
        iframe_url = re.findall(regex, data)[0]
        video_url = iframe_url.replace("\\", "")

    channel.playUrl(video_url)
def getBelvedereArtistsGenerator():
    """
    Generator to return Belvedere artists
    """
    htmlparser = HTMLParser.HTMLParser()

    basesearchurl = u'http://digital.belvedere.at/people/%s'
    urlregex = u'\<h3\>\<a href\=\"\/people\/(?P<id>\d+)\/[^\"]+\"\>(?P<name>[^\<]+)\<\/a\>\<\/h3\>\<div\>(?P<description>[^\<]+)\<\/div\>'

    # Just loop over the pages
    for i in string.ascii_lowercase:
        searchurl = basesearchurl % (i,)

        print searchurl
        searchPage = requests.get(searchurl)

        matches = re.finditer(urlregex, searchPage.text)
        for match in matches:
            artist = {}
            artist[u'id'] = match.group(u'id')
            artist[u'name'] = htmlparser.unescape(match.group(u'name'))
            artist[u'description'] = htmlparser.unescape(match.group(u'description'))
            artist[u'url'] = u'http://digital.belvedere.at/people/%s/' % (match.group(u'id'),)
            yield artist