我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用HTMLParser.HTMLParser.__init__()。
def __init__(self, f, fieldnames=None, encoding=UTF8, **kwds):
    # Wrap file object *f* in a csv.reader, re-coding through UTF8Recoder
    # (project helper) when the source encoding is not already UTF-8.
    self.encoding = encoding
    try:
        self.reader = csv.reader(UTF8Recoder(f, encoding) if self.encoding != UTF8 else f, dialect=csv.excel, **kwds)
        if not fieldnames:
            # The first row supplies the field names (Python 2 iterator API).
            self.fieldnames = self.reader.next()
            if len(self.fieldnames) > 0 and self.fieldnames[0].startswith(codecs.BOM_UTF8):
                # Strip a UTF-8 BOM glued onto the first header cell.
                self.fieldnames[0] = self.fieldnames[0].replace(codecs.BOM_UTF8, u'', 1)
        else:
            self.fieldnames = fieldnames
    except (csv.Error, StopIteration):
        # Malformed CSV or empty file: behave as if there were no headers.
        self.fieldnames = []
    except LookupError as e:
        # Unknown encoding name: report through the project's usage-error exit.
        Cmd.Backup()
        usageErrorExit(e)
    self.numfields = len(self.fieldnames)
def __init__(self, data_def = None, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
    # Data-definition converter; all state mutation happens under tree_lock.
    self.tree_lock = RLock()
    with self.tree_lock:
        self.dtc = DataTreeConstants()
        # Node-type ids known to carry URLs / links (project constants).
        self.known_urlid = (0, 4, 11, 14)
        self.known_linkid = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
        self.errorcode = dte.dtDataDefOK
        self.caller_id = caller_id
        self.cdata_def = {}
        self.ddtype = ""
        # Lazily create the module-wide warnings handler, or (re)bind this
        # caller's warning action onto the existing one.
        if sys.modules['DataTreeGrab']._warnings == None:
            sys.modules['DataTreeGrab']._warnings = _Warnings(warnaction, warngoal, caller_id)
        elif caller_id not in sys.modules['DataTreeGrab']._warnings._ids or warnaction != None:
            sys.modules['DataTreeGrab']._warnings.set_warnaction(warnaction, caller_id)
        # Only a dict data_def is converted; anything else leaves it empty.
        if isinstance(data_def, dict):
            self.data_def = data_def
            self.convert_data_def()
        else:
            self.data_def = {}
def __init__(self, dtree, data = None, parent = None, key = None):
    """Create a JSON tree node and recursively wrap *data* in child nodes."""
    self.type = "value"
    self.key = key
    self.keys = []
    self.key_index = {}
    self.value = None
    DATAnode.__init__(self, dtree, parent)
    with self.node_lock:
        if isinstance(data, list):
            # A list becomes a "list" node; children are keyed 0..n-1.
            self.type = "list"
            for idx, child in enumerate(data):
                JSONnode(self.dtree, child, self, idx)
        elif isinstance(data, dict):
            # A dict becomes a "dict" node; children are keyed by dict key.
            self.type = "dict"
            for name, child in data.items():
                JSONnode(self.dtree, child, self, name)
        else:
            # Anything else is a leaf carrying the raw value.
            self.type = "value"
            self.value = data
def __init__(self, data, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
    """Build a JSON-backed DATAtree from *data*.

    On any parse failure the tree degrades to a NULLnode start node and a
    warning is emitted instead of raising.
    """
    DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
    with self.tree_lock:
        self.tree_type ='json'
        self.extract_from_parent = True
        self.data = data
        # Read the json data into the tree
        try:
            self.root = JSONnode(self, data, key = 'ROOT')
            self.start_node = self.root
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrow it to Exception.
            self.warn('Unable to parse the JSON data. Invalid dataset!', dtDataWarning, 1)
            self.start_node = NULLnode()
# end JSONtree
def __init__(self, data_def, data = None, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
    # Root tree object: store configuration flags, wire up the shared
    # warnings handler, then initialise the data_def and (optionally) data.
    self.tree_lock = RLock()
    with self.tree_lock:
        self.dtc = DataTreeConstants()
        self.ddconv = DataDef_Convert(warnaction = warnaction , warngoal = warngoal, caller_id = caller_id)
        self.caller_id = caller_id
        # Debug/trace switches, all off by default.
        self.print_tags = False
        self.print_searchtree = False
        self.show_result = False
        self.fle = sys.stdout
        if sys.modules['DataTreeGrab']._warnings == None:
            sys.modules['DataTreeGrab']._warnings = _Warnings(warnaction, warngoal, caller_id)
        else:
            sys.modules['DataTreeGrab']._warnings.set_warnaction(warnaction, caller_id)
        self.searchtree = None
        self.timezone = pytz.utc
        self.errorcode = dte.dtDataInvalid
        self.result = []
        self.data_def = None
        self.init_data_def(data_def)
        if data != None:
            self.init_data(data)
def __init__(self, *components, **attributes):
    """
    Args:
        components: any components that should be nested in this element
        attributes: any attributes you want to give to this element

    Raises:
        SyntaxError: when a stand alone tag receives components
    """
    # Stand-alone tags (tag name ending in '/') must not receive children.
    if self.tag[-1:] == '/' and components:
        raise SyntaxError('<%s> tags cannot have components' % self.tag)
    # A single list/tuple argument is taken as the component list itself.
    unwrap = len(components) == 1 and isinstance(components[0], (list, tuple))
    self.components = list(components[0]) if unwrap else list(components)
    self.attributes = attributes
    self._fixup()
    # converts special attributes in components attributes
    self.parent = None
    for child in self.components:
        self._setnode(child)
    self._postprocessing()
def __init__(self, data, **args):
    """Store the menu *data* and fill in the default web2py CSS classes
    for every attribute the caller did not supply.

    Idiom fix: `x not in y` replaces `not x in y`, matching the corrected
    sibling implementation of this constructor elsewhere in the file.
    """
    self.data = data
    self.attributes = args
    self.components = []
    if '_class' not in self.attributes:
        self['_class'] = 'web2py-menu web2py-menu-vertical'
    if 'ul_class' not in self.attributes:
        self['ul_class'] = 'web2py-menu-vertical'
    if 'li_class' not in self.attributes:
        self['li_class'] = 'web2py-menu-expand'
    if 'li_first' not in self.attributes:
        self['li_first'] = 'web2py-menu-first'
    if 'li_last' not in self.attributes:
        self['li_last'] = 'web2py-menu-last'
    if 'li_active' not in self.attributes:
        self['li_active'] = 'web2py-menu-active'
    if 'mobile' not in self.attributes:
        self['mobile'] = False
def __init__(self, text, extra=None, allowed=None, sep='p',
             url=None, environment=None, latex='google',
             autolinks='default', protolinks='default',
             class_prefix='', id_prefix='markmin_', **kwargs):
    """Store markmin rendering options on the instance.

    url=True selects the framework URL helper; any other value is kept
    as given. Extra keyword arguments are retained in self.kwargs.
    """
    self.text = text
    self.extra = extra or {}
    self.allowed = allowed or {}
    self.sep = sep
    # Identity check: `url == True` would also match 1/1.0; the corrected
    # sibling of this constructor in this file already uses `is True`.
    self.url = URL if url is True else url
    self.environment = environment
    self.latex = latex
    self.autolinks = autolinks
    self.protolinks = protolinks
    self.class_prefix = class_prefix
    self.id_prefix = id_prefix
    self.kwargs = kwargs
def __init__(self, *components, **attributes):
    """
    :param *components: any components that should be nested in this element
    :param **attributes: any attributes you want to give to this element
    :raises SyntaxError: when a stand alone tag receives components
    """
    if components and self.tag[-1:] == '/':
        # Self-closing tags cannot contain children.
        raise SyntaxError('<%s> tags cannot have components' % self.tag)
    if len(components) == 1 and isinstance(components[0], (list, tuple)):
        # Unwrap a single list/tuple argument into the component list.
        self.components = list(components[0])
    else:
        self.components = list(components)
    self.attributes = attributes
    self._fixup()  # converts special attributes in components attributes
    self.parent = None
    for node in self.components:
        self._setnode(node)
    self._postprocessing()
def __init__(self, text, extra=None, allowed=None, sep='p',
             url=None, environment=None, latex='google',
             autolinks='default', protolinks='default',
             class_prefix='', id_prefix='markmin_'):
    """Store markmin rendering options on the instance.

    url=True selects the framework URL helper; any other value is kept
    as given.
    """
    self.text = text
    self.extra = extra or {}
    self.allowed = allowed or {}
    self.sep = sep
    # Identity check: `url == True` would also match 1/1.0; the corrected
    # sibling of this constructor in this file already uses `is True`.
    self.url = URL if url is True else url
    self.environment = environment
    self.latex = latex
    self.autolinks = autolinks
    self.protolinks = protolinks
    self.class_prefix = class_prefix
    self.id_prefix = id_prefix
def __init__(self, data, **args):
    """Keep the raw menu data and apply the default web2py menu classes
    for any attribute the caller did not override."""
    self.data = data
    self.attributes = args
    self.components = []
    defaults = (
        ('_class', 'web2py-menu web2py-menu-vertical'),
        ('ul_class', 'web2py-menu-vertical'),
        ('li_class', 'web2py-menu-expand'),
        ('li_first', 'web2py-menu-first'),
        ('li_last', 'web2py-menu-last'),
        ('li_active', 'web2py-menu-active'),
        ('mobile', False),
    )
    for key, value in defaults:
        if key not in self.attributes:
            self[key] = value
def __init__(self, text, extra=None, allowed=None, sep='p', url=None,
             environment=None, latex='google', autolinks='default',
             protolinks='default', class_prefix='', id_prefix='markmin_',
             **kwargs):
    """Record the markmin-to-HTML rendering configuration on the instance."""
    self.text = text
    # Fall back to empty dicts when no extra/allowed mappings are given.
    self.extra = extra or {}
    self.allowed = allowed or {}
    self.sep = sep
    # url=True selects the framework URL helper; otherwise use as given.
    self.url = URL if url is True else url
    self.environment = environment
    self.latex = latex
    self.autolinks = autolinks
    self.protolinks = protolinks
    self.class_prefix = class_prefix
    self.id_prefix = id_prefix
    self.kwargs = kwargs
def parseLink(html=""):
    """Return the set of href targets of all <a> tags in *html*.

    Fix: the original `from HTMLParser import HTMLParser` only works on
    Python 2; fall back to `html.parser` so the function also runs on
    Python 3.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3

    class Html(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.result = set()

        def getResult(self):
            return self.result

        def handle_startendtag(self, tag, attrs):
            # Treat <a ... /> exactly like <a ...></a>.
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for key, value in attrs:
                    if key == "href":
                        self.result.add(value)

    h = Html()
    h.feed(html)
    return h.result
def __init__(self):
    """Initialize attributes of the HTML header-outline parser."""
    if sys.version.startswith('3.'):  # Python 3.x
        super().__init__(convert_charrefs=False)
    else:
        # use HTMLParser.__init__ because HTMLParser is an 'old' style class, which cannot be passed to super()
        # see http://codependentcodr.blogspot.com/2012/02/python-htmlparser-and-super.html
        HTMLParser.__init__(self)
    self._root = _HtmlHeaderNode(level=0)  # root node with no data of itself, only 'children' matters
    self._curr_node = self._root  # most recently handled header node
    self._in_header = False  # True while inside an <hN>...</hN> element
    self._header_id_count = {}  # record header ids to avoid collisions
    self._html = ''  # full HTML string parsed
    self._temp_start_tag = ''  # temporary HTML start tag of this current header node
def __init__(self, results, url):
    """Prepare a torrent-listing parser that appends into *results*."""
    HTMLParser.__init__(self)
    self.results = results
    self.url = url
    # Per-torrent parsing state, reset as each result row is consumed.
    self.current_item = {}  # one torrent result
    self.add_query = True
    self.torrent_info_index = 0  # count of the meta data fields seen so far
    self.torrent_info_array = []
    self.meta_data_grabbing = 0
    self.meta_data_array = []
    self.torrent_no_files = 0
    self.torrent_date_added = 0
    self.torrent_popularity = 0
    # NOTE(review): "mangnet_link" looks like a typo for "magnet_link",
    # but renaming would break external readers of this attribute.
    self.mangnet_link = ""
    self.desc_link = ""
    self.torrent_name = ""
def __init__(self, model, label, data=[]):
    """ Returns a new Model calibrated on the given data,
        which is a set of (vector, label)-tuples.
    """
    # NOTE(review): `data=[]` is a mutable default; harmless here only
    # because data is iterated, never mutated.
    self._model = model
    self._label = label
    # Isotonic regression: pair each predicted score with a boolean
    # "was this the target label", sorted so scores are increasing.
    y = ((model.predict(v)[label], label == x) for v, x in data)
    y = sorted(y)  # monotonic
    y = zip(*y)
    # Guard for empty data: fall back to an empty (scores, outcomes) pair.
    y = list(y or ((),()))
    x = list(y[0])  # sorted scores
    y = list(y[1])  # outcome indicators
    y = pav(y)  # pool-adjacent-violators smoothing (project helper)
    # Anchor the curve at (0, 0) and (1, 1).
    x = [0] + x + [1]
    y = [0] + y + [1]
    f = {}
    i = 0
    # Linear interpolation: precompute f[p] for p = 0.00 .. 1.00 in 0.01 steps.
    for p in range(100 + 1):
        p *= 0.01
        while x[i] < p:
            i += 1
        f[p] = (y[i-1] * (x[i] - p) + y[i] * (p - x[i-1])) / (x[i] - x[i-1])
    self._f = f
def __init__(self, path='WordNet-3.0'):
    """ Opens the WordNet database from the given path
        (that contains dict/index.noun, dict/data.noun, ...)
    """
    self._f = {}  # {'n': <open file 'dict/data.noun'>}
    for k, v in (('n', 'noun'), ('v', 'verb'), ('a', 'adj' ), ('r', 'adv' )):
        # Keep the data file open for later offset-based lookups.
        f = cd(path, 'dict', 'data.%s' % v)
        f = open(f, 'rb')
        self._f[k] = f
        # Parse the index file once, mapping (word, pos) -> offsets.
        f = cd(path, 'dict', 'index.%s' % v)
        f = open(f, 'r')
        for s in f:
            # Skip the license header (lines starting with a space).
            if not s.startswith(' '):
                s = s.strip()
                s = s.split(' ')
                # The last int(s[2]) fields are the synset offsets.
                p = s[-int(s[2]):]
                w = s[0]
                w = w.replace('_', ' ')
                self[w, k] = p  # {('grasp', 'n'): (offset1, ...)}
        f.close()
def __init__(self):
    """Parser state for scraping HTML report tables across pages."""
    HTMLParser.__init__(self)
    self.F_DATA = []  # keeps whole table data of the webpage
    self.TABLE = []
    self.ROW = []
    # Table/row/column counters and cursors used while walking the page.
    self.table_no = 0
    self.row_no = 0
    self.col_no = 0
    self.current_table = 0
    self.current_row = 0
    self.current_col = 0
    self.print_flag = False
    self.col_data = ''
    self.pagination = False
    # True only while processing the first page of the output, since the
    # links to the remaining report pages are fetched from that page.
    self.first_page = False
    self.page_link_list = []  # stores total pages found in the first page
def get_links(html):
    """Return the list of href targets of <a> tags found in *html*.

    The URLs are also printed between '@@' separator lines (debug output
    kept from the original). Fixes: the original comments were mojibake
    ('????'), and HTMLParser is now imported in a Python 2/3 compatible way.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3

    class URLSeeker(HTMLParser):
        def __init__(self):
            # Old-style base class on Python 2, so call __init__ directly.
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    print('@@'*20)
    print(url_seeker.urls)
    print('@@'*20)
    return url_seeker.urls
def __init__(self, data):
    """
    The data holds the characters.
    Example:

    html = Html()
    data = '<body><em>alpha</em></body>'
    dom  = html.feed(data)
    x    = dom.fst('em')
    x.append(Data('\nbeta'))

    It outputs.

    <body ><em >alpha
    beta</em></body>
    """
    # Register as a DATA-type node, then keep the raw character data.
    Root.__init__(self, DATA)
    self.data = data
def __init__(self, parser, name, attrs=None, parent=None, previous=None): "Basic constructor." # We don't actually store the parser object: that lets extracted # chunks be garbage-collected self.parserClass = parser.__class__ self.isSelfClosing = parser.isSelfClosingTag(name) self.name = name if attrs == None: attrs = [] self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False self.containsSubstitutions = False self.convertHTMLEntities = parser.convertHTMLEntities self.convertXMLEntities = parser.convertXMLEntities self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities def convert(kval): "Converts HTML, XML and numeric entities in the attribute value." k, val = kval if val is None: return kval return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, val)) self.attrs = map(convert, self.attrs)
def __init__(self, name=None, attrs=None, text=None, **kwargs):
    """Match criteria for tag searches.

    attrs may be a dict of attribute matchers or, as a shortcut, a bare
    string, which is treated as a CSS class. Fix: the former mutable
    default `attrs={}` could be shared (and thus mutated) across
    instances; it is now None, normalised to a fresh dict. NOTE(review):
    callers that explicitly passed attrs=None now get {} instead of None
    stored — both are falsy.
    """
    self.name = name
    if attrs is None:
        attrs = {}
    if isString(attrs):
        # A bare string means "match this CSS class".
        kwargs['class'] = attrs
        attrs = None
    if kwargs:
        if attrs:
            # Don't mutate the caller's dict; merge into a copy.
            attrs = attrs.copy()
            attrs.update(kwargs)
        else:
            attrs = kwargs
    self.attrs = attrs
    self.text = text
def __init__(self, source):
    """Initialise this list subclass as empty and remember *source*.

    Bug fix: the original called `list.__init__([])`, which initialised a
    throwaway list literal instead of *self*; it only worked by accident
    because a freshly constructed list instance is already empty.
    """
    list.__init__(self)
    self.source = source
# Now, some helper functions.
def __init__(self, soup):
    # Keep a reference to the owning soup object; parsing events are
    # forwarded to it by the handler methods.
    HTMLParser.__init__(self)
    self.soup = soup
# We inherit feed() and reset().
def reset(self):
    # Re-initialise the soup as a Tag acting as its own root element.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    self.hidden = 1  # the synthetic root tag is never rendered
    self.builder.reset()
    # Clear all parse-time state before pushing the root onto the stack.
    self.currentData = []
    self.currentTag = None
    self.tagStack = []
    self.quoteStack = []
    self.pushTag(self)
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False):
    # Detect *markup*'s encoding and convert it to unicode, trying in order:
    # caller-supplied overrides, the declared/sniffed document encodings,
    # chardet (if installed), then utf-8 and windows-1252 as a last resort.
    # NOTE(review): `overrideEncodings=[]` is a mutable default; harmless
    # here because the list is only iterated, never mutated.
    self.declaredHTMLEncoding = None
    self.markup, documentEncoding, sniffedEncoding = \
        self._detectEncoding(markup, isHTML)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if markup == '' or isinstance(markup, unicode):
        # Already unicode (or empty): nothing to convert.
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return
    u = None
    for proposedEncoding in overrideEncodings:
        u = self._convertFrom(proposedEncoding)
        if u:
            break
    if not u:
        for proposedEncoding in (documentEncoding, sniffedEncoding):
            u = self._convertFrom(proposedEncoding)
            if u:
                break
    # If no luck and we have auto-detection library, try that:
    if not u and chardet and not isinstance(self.markup, unicode):
        u = self._convertFrom(chardet.detect(self.markup)['encoding'])
    # As a last resort, try utf-8 and windows-1252:
    if not u:
        for proposed_encoding in ("utf-8", "windows-1252"):
            u = self._convertFrom(proposed_encoding)
            if u:
                break
    self.unicode = u
    if not u:
        self.originalEncoding = None
def __init__(self, argv):
    """Remember the CLI arguments and set up TWSE stock-query state."""
    self.user_stock_list = USER_STOCK_LIST
    self.stock_list = []
    self.query_list = []
    self.stock_query_str = ''
    self.data = []
    # Real-time quote endpoint on the TWSE server.
    self.twse_url = TWSE_SERVER + '/stock/api/getStockInfo.jsp?ex_ch='
    self.json_data = ''
    self.argv = argv
# append stock in monitor mode
def __init__(self):
    """Start with an empty item buffer, capped at 15 entries."""
    HTMLParser.__init__(self)
    self.data = []
    # Collect at most item_limit items; item_index tracks progress.
    self.item_limit = 15
    self.item_index = 0
def __init__(self, q, start, proxy=None, check=True, callback=None):
    """Prepare a search-result parser for query *q* at offset *start*."""
    HTMLParser.__init__(self)
    self.__q = quote(q)        # URL-encoded query string
    self.__start = int(start)  # numeric result offset
    self.__s = 0
    self.__callback = callback
    self.__check = check
    self.__buffer = ""
    self.__proxy = proxy
def __init__(self, ldomain, scandpth, lps):
    """Crawler state: start URL, remaining depth and per-page link span."""
    HTMLParser.__init__(self)
    self.url = ldomain
    # Visit-count map and node list, both seeded with the start URL.
    self.db = {self.url: 1}
    self.node = [self.url]
    self.depth = scandpth   # how many levels deep to scan
    self.max_span = lps     # max links to follow per page
    self.links_found = 0
def __init__(self):
    # Accumulate the text fragments extracted from the fed HTML.
    HTMLParser.__init__(self)
    self.__text = []
def __init__(self, f, encoding):
    # Wrap *f* in a streaming decoder so reads yield unicode text in the
    # given *encoding* (UTF8Recoder-style helper).
    self.reader = codecs.getreader(encoding)(f)
def __init__(self, f, encoding, **kwds):
    # Redirect output to a queue
    import cStringIO  # Python 2 only module
    self.queue = cStringIO.StringIO()
    # Rows are first written to the in-memory queue, then re-encoded to
    # the target charset and flushed to the real stream *f*.
    self.writer = csv.writer(self.queue, **kwds)
    self.stream = f
    self.encoding = encoding
    self.encoder = codecs.getincrementalencoder(self.encoding)()
def __init__(self, f, fieldnames, encoding, **kwds):
    # DictWriter variant whose row output goes through a UnicodeWriter
    # so each row is re-encoded to *encoding*.
    super(UnicodeDictWriter, self).__init__(f, fieldnames, **kwds)
    self.writer = UnicodeWriter(f, encoding, **kwds)
# Open a CSV file, get optional arguments [charset <String>] [columndelimiter <Character>] [quotechar <Character>] [fields <FieldNameList>]
def __init__(self):
    """Collect text fragments and count tables while parsing."""
    HTMLParser.__init__(self)
    self.fed = []          # raw text fragments, joined later
    self.fed_text = None   # joined text, filled in after feeding
    self.table_counter = 0
def __init__(self):
    """Article-parser state: lead, section, link and navbox tracking."""
    HTMLParser.__init__(self)
    # Collected text: whole page and current-section fragments.
    self.fed = []
    self.fed_in_section = []
    self.fed_text = None
    # Section tracking flags.
    self.section_found = False
    self.section_name = False
    self.in_section = False
    self.table_counter = 0
    # Lead-paragraph and link-context flags.
    self.lead_found = False
    self.tracking_link = False
    self.tracking_see_also = False
    self.navbox_counter = 0
def __init__(self, allows=None):
    """Whitelist-based tag filter.

    allows: iterable of tag names to keep; when empty or None the
    class-level allow_tags default is used. Fix: the former mutable
    default `allows=[]` is a classic shared-list trap; None is the safe
    equivalent here because empty and None take the same fallback branch.
    """
    HTMLParser.__init__(self)
    self.allow_tags = allows if allows else self.allow_tags
    self.result = []
    self.start = []
    self.data = []
def __init__(self):
    # Nothing beyond base HTMLParser initialisation.
    HTMLParser.__init__(self)
def __init__(self):
    # Collect the file links discovered while parsing.
    HTMLParser.__init__(self)
    self.files = []
def __init__(self, warnaction = None, warngoal = sys.stderr, caller_id = 0):
    """Shared warnings registry, guarded by warn_lock.

    Registers *caller_id* and applies its warning-filter action.
    Idiom fixes: `not in` membership test and `is None` identity check.
    NOTE(review): since self._ids was just emptied, the membership test
    is always true here; kept for symmetry with re-registration paths.
    """
    self.warn_lock = RLock()
    self.onceregistry = {}
    self.filters = []
    self._ids = []
    if caller_id not in self._ids:
        self._ids.append(caller_id)
    self.warngoal = warngoal
    if warnaction is None:
        warnaction = "default"
    self.set_warnaction(warnaction, caller_id)
def __init__(self, dtree, parent = None):
    """Generic tree-node base: link this node into *dtree* under *parent*."""
    self.node_lock = RLock()
    with self.node_lock:
        self.dtc = DataTreeConstants()
        self.children = []
        self.dtree = dtree
        self.parent = parent
        self.value = None
        self.child_index = 0
        self.level = 0
        # Forward and end-of-walk link stores, split by kind.
        self.links = {"values": {}, "nodes": {}}
        self.end_links = {"values": {}, "nodes": {}}
        # A node without a parent is the root; otherwise climb to find it.
        self.is_root = bool(self.parent == None)
        ancestor = self
        while not ancestor.is_root:
            ancestor = ancestor.parent
        self.root = ancestor
        if isinstance(parent, DATAnode):
            self.parent.append_child(self)
            self.level = parent.level + 1
def __init__(self, dtree, data = None, parent = None):
    # HTML node: tag name, text/tail and attributes extracted from *data*,
    # which is either a tag-name string or [tag, [(attr, value), ...]].
    self.tag = u''
    self.text = u''
    self.tail = u''
    self.attributes = {}
    self.attr_names = []
    DATAnode.__init__(self, dtree, parent)
    with self.node_lock:
        if isinstance(data, (str, unicode)):  # Python 2 string types
            self.tag = data.lower().strip()
        elif isinstance(data, list):
            if len(data) > 0:
                self.tag = data[0].lower().strip()
            if len(data) > 1 and isinstance(data[1], (list, tuple)):
                for a in data[1]:
                    # String values get stripped; other values kept as-is.
                    if isinstance(a[1], (str, unicode)):
                        self.attributes[a[0].lower().strip()] = a[1].strip()
                    else:
                        self.attributes[a[0].lower().strip()] = a[1]
                # attr_names lists 'class' and 'id' first, then the rest.
                if 'class' in self.attributes.keys():
                    self.attr_names.append('class')
                if 'id' in self.attributes.keys():
                    self.attr_names.append('id')
                for a in self.attributes.keys():
                    if a not in self.attr_names:
                        self.attr_names.append(a)
def __init__(self, data, autoclose_tags=None, print_tags = False, output = sys.stdout, warnaction = "default", warngoal = sys.stderr, caller_id = 0):
    """HTML-backed DATAtree: parse *data* into an HTMLnode tree.

    Fixes: mutable default `autoclose_tags=[]` replaced with None (shared
    list would persist across instances since it is stored on self), and
    the bare `except:` narrowed to Exception so SystemExit and
    KeyboardInterrupt are no longer swallowed.
    """
    HTMLParser.__init__(self)
    DATAtree.__init__(self, output, warnaction, warngoal, caller_id)
    with self.tree_lock:
        self.tree_type ='html'
        self.print_tags = print_tags
        self.autoclose_tags = [] if autoclose_tags is None else autoclose_tags
        self.is_tail = False
        self.root = HTMLnode(self, 'root')
        self.current_node = self.root
        self.last_node = None
        self.text = u''
        self.open_tags = {}
        self.count_tags(data)
        # read the html page into the tree
        try:
            # Cover for incomplete reads where only the essential body part
            # was retrieved: append the missing close tag before feeding.
            for ctag in ('body', 'BODY', 'html', 'HTML', 'xml', 'XML'):
                if u'<%s>' % (ctag, ) in data and not u'</%s>' % (ctag, ) in data:
                    data = u'%s</%s>' % (data, ctag)
            self.feed(data)
            self.reset()
            self.start_node = self.root
        except Exception:
            self.warn('Unable to parse the HTML data. Invalid dataset!', dtDataWarning, 1)
            self.start_node = NULLnode()