The following 17 code examples, extracted from Python open source projects, illustrate how to use html.parser.feed().
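Before the project examples, here is a minimal, self-contained sketch of the feed() call: HTMLParser is subclassed, feed() pushes markup into the parser, and the handle_* callbacks fire as tags and text are encountered. The class name TitleParser and its behavior are illustrative assumptions, not taken from any of the projects below.

    from html.parser import HTMLParser

    class TitleParser(HTMLParser):
        """Collects the text found inside <title> tags (illustrative example)."""
        def __init__(self):
            super().__init__()
            self.in_title = False
            self.titles = []

        def handle_starttag(self, tag, attrs):
            if tag == "title":
                self.in_title = True

        def handle_endtag(self, tag):
            if tag == "title":
                self.in_title = False

        def handle_data(self, data):
            if self.in_title:
                self.titles.append(data)

    parser = TitleParser()
    parser.feed("<html><head><title>Hello</title></head></html>")
    parser.close()
    print(parser.titles)  # ['Hello']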
def _get_eol_list(self) -> typing.List[str]:
    """Scrapes the FreeBSD website and returns a list of EOL RELEASES"""
    request = urllib.request.Request(
        self.eol_url,
        headers={
            "Accept-Charset": "utf-8"
        }
    )
    with urllib.request.urlopen(request) as response:  # nosec: B310
        if response.getcode() != 200:  # noqa: T484
            iocage.lib.errors.DistributionEOLWarningDownloadFailed(
                logger=self.logger,
                level="warning"
            )
            return []
        parser = EOLParser()
        data = response.read().decode("utf-8", "ignore")
        parser.feed(data)
        parser.close()
        return parser.eol_releases
def _run_check(self, source, expected_events, collector=None):
    if collector is None:
        collector = self.get_collector()
    parser = collector
    for s in source:
        parser.feed(s)
    parser.close()
    events = parser.get_events()
    if events != expected_events:
        self.fail("received events did not match expected events\n"
                  "Expected:\n" + pprint.pformat(expected_events) +
                  "\nReceived:\n" + pprint.pformat(events))
def _parse_error(self, source):
    def parse(source=source):
        parser = self.get_collector()
        parser.feed(source)
        parser.close()
    self.assertRaises(html.parser.HTMLParseError, parse)
def parse(html):
    '''Parses the HTML text and returns the root of the resulting tree.'''
    parser = _MyHTMLParser()
    parser.feed(html)
    return parser.root
def main():
    htm = open("sheet001.htm").read()
    parser = ToolHireParser()
    parser.feed(htm)
    print(parser.dates)
def _run_check(self, source, expected_events, collector=None):
    if collector is None:
        collector = self.get_collector()
    parser = collector
    for s in source:
        parser.feed(s)
    parser.close()
    events = parser.get_events()
    if events != expected_events:
        self.fail("received events did not match expected events" +
                  "\nSource:\n" + repr(source) +
                  "\nExpected:\n" + pprint.pformat(expected_events) +
                  "\nReceived:\n" + pprint.pformat(events))
def _parse_error(self, source):
    def parse(source=source):
        parser = self.get_collector()
        parser.feed(source)
        parser.close()
    with self.assertRaises(html.parser.HTMLParseError):
        with self.assertWarns(DeprecationWarning):
            parse()
def test_convert_charrefs_dropped_text(self):
    # #23144: make sure that all the events are triggered when
    # convert_charrefs is True, even if we don't call .close()
    parser = EventCollector(convert_charrefs=True)
    # before the fix, bar & baz was missing
    parser.feed("foo <a>link</a> bar &amp; baz")
    self.assertEqual(
        parser.get_events(),
        [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
         ('endtag', 'a'), ('data', ' bar & baz')]
    )
def extract_links(from_string):
    """Return a list with all links contained in the HTML page passed as input parameter."""
    parser = LinkExtractor()
    parser.feed(from_string)
    return parser.links
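The LinkExtractor class is defined elsewhere in that project. A minimal sketch of such a subclass, assuming it simply collects href attributes from <a> tags into a links list (the class body below is an illustration, not the project's actual code):

    from html.parser import HTMLParser

    class LinkExtractor(HTMLParser):
        # Hypothetical reconstruction: gathers href values from <a> tags.
        def __init__(self):
            super().__init__()
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                for name, value in attrs:
                    if name == "href" and value is not None:
                        self.links.append(value)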
def find_scripts(site):
    parser = ScriptParser()
    parser.feed(site)
    return parser.scripts
def execute(self, context):
    import html.parser
    import urllib.request

    remote_platforms = []
    ps = context.scene.ge_publish_settings

    # create lib folder if not already available
    lib_path = bpy.path.abspath(ps.lib_path)
    if not os.path.exists(lib_path):
        os.makedirs(lib_path)

    print("Retrieving list of platforms from blender.org...", end=" ", flush=True)

    class AnchorParser(html.parser.HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for key, value in attrs:
                    if key == 'href' and value.startswith('blender'):
                        remote_platforms.append(value)

    url = 'http://download.blender.org/release/Blender' + bpy.app.version_string.split()[0]
    parser = AnchorParser()
    data = urllib.request.urlopen(url).read()
    parser.feed(str(data))

    print("done", flush=True)

    print("Downloading files (this will take a while depending on your internet connection speed).", flush=True)
    for i in remote_platforms:
        src = '/'.join((url, i))
        dst = os.path.join(lib_path, i)

        dst_dir = '.'.join([i for i in dst.split('.') if i not in {'zip', 'tar', 'bz2'}])
        if not os.path.exists(dst) and not os.path.exists(dst.split('.')[0]):
            print("Downloading " + src + "...", end=" ", flush=True)
            urllib.request.urlretrieve(src, dst)
            print("done", flush=True)
        else:
            print("Reusing existing file: " + dst, flush=True)

        print("Unpacking " + dst + "...", end=" ", flush=True)
        if os.path.exists(dst_dir):
            shutil.rmtree(dst_dir)
        shutil.unpack_archive(dst, dst_dir)
        print("done", flush=True)

    print("Creating platform from libs...", flush=True)
    bpy.ops.scene.publish_auto_platforms()

    return {'FINISHED'}
def serialize(result):
    """For a given Met result, map that to our database"""
    imageinfos = result['ImageInfo']
    thumbnail = None
    url = None
    for info in imageinfos:
        if info['PrimaryDisplay']:
            # Use this one
            thumbnail = ENDPOINT_BASE_IMAGE_URL + info['Thumbnail']
            url = ENDPOINT_BASE_IMAGE_URL + info['LargeWebsite']
            break
    if not url:
        log.warning("Did not get an image URL for %s", result)
        return

    image = models.Image(url=url)
    image.provider = PROVIDER_NAME
    image.source = SOURCE_NAME

    # Creator might be a few fields
    tombstone = result['Tombstone']
    creator_names = []
    for t in tombstone:
        if t['Name'] in CREATOR_LABELS:
            val = t['Value']
            parser = CreatorParser()
            parser.feed(val)
            creator_names.append(" ".join(parser.out))
    if len(creator_names) > 0:
        image.creator = ", ".join(creator_names)

    image.thumbnail = thumbnail
    image.license = "cc0"
    image.license_version = '1.0'
    image.foreign_identifier = result['CollectionObject']['CRDID']
    image.foreign_landing_url = FOREIGN_LANDING_BASE_URL + str(image.foreign_identifier)
    image.title = result['CollectionObject']['Title']
    image.identifier = signals.create_identifier(image.url)
    image.last_synced_with_source = timezone.now()

    try:
        image.save()
        log.info("Adding image %s-%s (%s) identifier %s",
                 image.title, image.creator, image.foreign_identifier, image.identifier)
    except IntegrityError as e:
        log.warn(e)
        pass

    return image
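CreatorParser is also defined elsewhere in that project. Judging from the " ".join(parser.out) call, it appears to strip markup from the creator field and keep the text fragments in an out list; a hedged sketch under that assumption (the attribute name out and its behavior are guesses):

    from html.parser import HTMLParser

    class CreatorParser(HTMLParser):
        # Hypothetical sketch: drops tags and keeps text fragments in `out`.
        def __init__(self):
            super().__init__()
            self.out = []

        def handle_data(self, data):
            text = data.strip()
            if text:
                self.out.append(text)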