我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用urlparse.html()。
def _start_link(self, attrsD): attrsD.setdefault('rel', u'alternate') if attrsD['rel'] == u'self': attrsD.setdefault('type', u'application/atom+xml') else: attrsD.setdefault('type', u'text/html') context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) if 'href' in attrsD: attrsD['href'] = self.resolveURI(attrsD['href']) expectingText = self.infeed or self.inentry or self.insource context.setdefault('links', []) if not (self.inentry and self.inimage): context['links'].append(FeedParserDict(attrsD)) if 'href' in attrsD: expectingText = 0 if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): context['link'] = attrsD['href'] else: self.push('link', expectingText)
def _l2bytes(l): # Convert a list of ints to bytes if the interpreter is Python 3 try: if bytes is not str: # In Python 2.6 and above, this call won't raise an exception # but it will return bytes([65]) as '[65]' instead of 'A' return bytes(l) raise NameError except NameError: return ''.join(map(chr, l)) # If you want feedparser to allow all URL schemes, set this to () # List culled from Python's urlparse documentation at: # http://docs.python.org/library/urlparse.html # as well as from "URI scheme" at Wikipedia: # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # Many more will likely need to be added!
def lookslikehtml(self, s): if self.version.startswith('atom'): return if self.contentparams.get('type','text/html') != 'text/plain': return # must have a close tag or a entity reference to qualify if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return # all tags must be in a restricted subset of valid HTML tags if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, re.findall(r'</?(\w+)',s)): return # all entities must have been defined as valid HTML entities from htmlentitydefs import entitydefs if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);',s)): return return 1
def _start_link(self, attrsD): attrsD.setdefault('rel', 'alternate') if attrsD['rel'] == 'self': attrsD.setdefault('type', 'application/atom+xml') else: attrsD.setdefault('type', 'text/html') context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) if attrsD.has_key('href'): attrsD['href'] = self.resolveURI(attrsD['href']) expectingText = self.infeed or self.inentry or self.insource context.setdefault('links', []) if not (self.inentry and self.inimage): context['links'].append(FeedParserDict(attrsD)) if attrsD.has_key('href'): expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): context['link'] = attrsD['href'] else: self.push('link', expectingText)
def _l2bytes(l): return bytes(l) # If you want feedparser to allow all URL schemes, set this to () # List culled from Python's urlparse documentation at: # http://docs.python.org/library/urlparse.html # as well as from "URI scheme" at Wikipedia: # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # Many more will likely need to be added!
def mapContentType(self, contentType): contentType = contentType.lower() if contentType == 'text' or contentType == 'plain': contentType = u'text/plain' elif contentType == 'html': contentType = u'text/html' elif contentType == 'xhtml': contentType = u'application/xhtml+xml' return contentType
def _start_description(self, attrsD): context = self._getContext() if 'summary' in context: self._summaryKey = 'content' self._start_content(attrsD) else: self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
def _start_content_encoded(self, attrsD): self.pushContent('content', attrsD, u'text/html', 1)
def _parse_date_rfc822(dt): """Parse RFC 822 dates and times, with one minor difference: years may be 4DIGIT or 2DIGIT. http://tools.ietf.org/html/rfc822#section-5""" try: m = _rfc822_match(dt.lower()).groupdict(0) except AttributeError: return None return _parse_date_group_rfc822(m)
def _parse_date_rfc822(dt): """Parse RFC 822 dates and times, with one minor difference: years may be 4DIGIT or 2DIGIT. http://tools.ietf.org/html/rfc822#section-5""" try: m = _rfc822_match(dt.lower()).groupdict(0) except AttributeError: return None # Calculate a date and timestamp for k in ('year', 'day', 'hour', 'minute', 'second'): m[k] = int(m[k]) m['month'] = _rfc822_months.index(m['month']) + 1 # If the year is 2 digits, assume everything in the 90's is the 1990's if m['year'] < 100: m['year'] += (1900, 2000)[m['year'] < 90] stamp = datetime.datetime(*[m[i] for i in ('year', 'month', 'day', 'hour', 'minute', 'second')]) # Use the timezone information to calculate the difference between # the given date and timestamp and Universal Coordinated Time tzhour = 0 tzmin = 0 if m['tz'] and m['tz'].startswith('gmt'): # Handle GMT and GMT+hh:mm timezone syntax (the trailing # timezone info will be handled by the next `if` block) m['tz'] = ''.join(m['tz'][3:].split(':')) or 'gmt' if not m['tz']: pass elif m['tz'].startswith('+'): tzhour = int(m['tz'][1:3]) tzmin = int(m['tz'][3:]) elif m['tz'].startswith('-'): tzhour = int(m['tz'][1:3]) * -1 tzmin = int(m['tz'][3:]) * -1 else: tzhour = _rfc822_tznames[m['tz']] delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour) # Return the date and timestamp in UTC return (stamp - delta).utctimetuple()
def mapContentType(self, contentType): contentType = contentType.lower() if contentType == 'text': contentType = 'text/plain' elif contentType == 'html': contentType = 'text/html' elif contentType == 'xhtml': contentType = 'application/xhtml+xml' return contentType
def _start_description(self, attrsD): context = self._getContext() if context.has_key('summary'): self._summaryKey = 'content' self._start_content(attrsD) else: self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
def _start_content_encoded(self, attrsD): self.pushContent('content', attrsD, 'text/html', 1)
def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" # "http://www.w3.org/TR/html4/loose.dtd"> # Reconstruct original DOCTYPE self.pieces.append('<!%(text)s>' % locals())