Python BeautifulSoup module: BeautifulSoup() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use BeautifulSoup.BeautifulSoup().

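Before the individual project snippets, here is a minimal, self-contained sketch of the call that all of them revolve around. It assumes the bs4 package (BeautifulSoup 4) is installed; the older standalone BeautifulSoup 3 module, which many snippets below still use, is imported as "from BeautifulSoup import BeautifulSoup" instead, and the explicit parser-name argument is a bs4 feature.

# Minimal sketch, assuming bs4 (BeautifulSoup 4) is installed.
# BeautifulSoup 3 is imported as: from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup

html = "<html><body><a class='ulink' href='/movie/1.html'>Example</a></body></html>"

# Passing an explicit parser ('html.parser' ships with the standard library)
# avoids the "no parser was explicitly specified" warning mentioned in some
# of the snippets below.
soup = BeautifulSoup(html, "html.parser")

for link in soup.find_all("a", attrs={"class": "ulink"}):
    print(link.get("href"), link.string)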
Project: Python    Author: Guzi219
def ParseHtml(self, html):
        soup = BeautifulSoup(html)
        links = soup.findAll('a', attrs={'class': 'ulink'})
        #print len(links)
        if len(links) == 0: #the js return
            # tmp_js = soup.find(name='script', attrs={'language': 'javascript'})
            js_str = soup.script.string #two ways to get the <script></script>
            new_url = js_str[16:-1] #get the new url
            new_url = eval(new_url) # eval the quoted JS string to get the real URL
            self.ParseHtml(self.LoadPage(new_url))
        else:
            # print type(links)
            for link in links:
                # print type(link)
                # print type(link.string)
                # print unicode(link.string)
                titles = re.findall(r'《(.+?)》', str(link.string))  # titles are wrapped in 《》 marks; unicode(link.string) also works
                if len(titles) != 0:
                    print titles[0]
                # print 'url is %s, title is %s.' %(link['href'], titles[0])
Project: Python    Author: Guzi219
def GetTotalPage(self, html):
        # create the BeautifulSoup
        some_soup = BeautifulSoup(html)
        #get the page div
        ele_a = some_soup.find('div', attrs={'class': 'page'})
        #get the last <a> in the page div (the "last page" link)
        last_a = ele_a.findAll('a')[-1]
        #strip the trailing '.html' from the href to get the page number
        pagenum = last_a.get('href')[:-5]
        print 'pagenum :', pagenum
        # print type(last_a)

        self.SaveTotalPageToFile(pagenum)

    # store the max page number to totalpage.ini
    #new_page_num: new max page num
Project: song-cli    Author: ankitmathur3193
def list_of_all_href(self,html):
        '''
        It will return all hyperlinks found on the mr-jatt page for download
        ''' 
        soup=BeautifulSoup(html)
        links=[]
        a_list=soup.findAll('a','touch')
        for x in xrange(len(a_list)-1):
            link = a_list[x].get('href')
            name = a_list[x]
            name = str(name)
            name=re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>','',name)
            name=re.sub(r'^[0-9]+\.','',name)
            links.append([link,name])

        #quit()
        return links
Project: FreeFoodCalendar    Author: Yuliang-Zou
def crawler(urls, max_urls):
    crawled = Set()
    queued = Set(urls)
    pairs = []
    while urls and len(crawled) < max_urls:
        page=urls.pop(0)
        if is_html(page):
            if page not in crawled:
                try:
                    print(page)
                    links=BeautifulSoup(urllib2.urlopen(page,timeout=5).read(), parseOnlyThese=SoupStrainer('a'))
                    for link in links:
                        url = domain + link['href']
                        if verify(url) and url not in queued:
                            # print(url)
                            urls.append('http://' +url)
                            # print(urls)
                            queued.add('http://' +url)
                    # print(page)
                    crawled.add(page)
                    # print(crawled)
                except:
                    continue
    return crawled,pairs
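The crawler above restricts parsing to anchor tags with SoupStrainer via the BeautifulSoup 3 keyword parseOnlyThese. A rough bs4 equivalent, where the keyword is parse_only, might look like this sketch (the HTML string is just a placeholder):

# bs4 sketch of the same idea: only <a> tags are parsed into the tree.
# bs4 spells the keyword parse_only; BeautifulSoup 3 calls it parseOnlyThese.
from bs4 import BeautifulSoup, SoupStrainer

html = "<html><body><p>ignored</p><a href='/a.html'>A</a><a href='/b.html'>B</a></body></html>"
links = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer("a"))

for link in links.find_all("a", href=True):
    print(link["href"])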
Project: doork    Author: AeonDave
def _extract_description(self, result):
        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None

        desc_strs = []
        def looper(tag):
            if not tag: return
            for t in tag:
                try:
                    if t.name == 'br': break
                except AttributeError:
                    pass

                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_div)
        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>

        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)
Project: doork    Author: AeonDave
def _extract_description(self, result):
        desc_td = result.findNext('td')
        if not desc_td:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None

        desc_strs = []
        def looper(tag):
            if not tag: return
            for t in tag:
                try:
                    if t.name == 'br': break
                except AttributeError:
                    pass

                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_td)
        looper(desc_td.find('wbr')) # BeautifulSoup does not self-close <wbr>

        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)
Project: doork    Author: AeonDave
def _get_results_page(self, set_type):
        if set_type == LARGE_SET:
            url = GoogleSets.URL_LARGE
        else:
            url = GoogleSets.URL_SMALL

        safe_items = [urllib.quote_plus(i) for i in self.items]
        blank_items = 5 - len(safe_items)
        if blank_items > 0:
            safe_items += ['']*blank_items

        safe_url = url % tuple(safe_items)

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError, e:
            raise GSError, "Failed getting %s: %s" % (e.url, e.error)

        return BeautifulSoup(page)
Project: doork    Author: AeonDave
def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = SponsoredLinks.SEARCH_URL_0
            else:
                url = SponsoredLinks.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = SponsoredLinks.NEXT_PAGE_0
            else:
                url = SponsoredLinks.NEXT_PAGE_1

        safe_url = url % { 'query': urllib.quote_plus(self.query),
                           'start': self._page * self._results_per_page,
                           'num': self._results_per_page }

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError, e:
            raise SLError, "Failed getting %s: %s" % (e.url, e.error)

        return BeautifulSoup(page)
Project: minerva    Author: linzhi
def get_content(cls, url=None, session=None):
        """
        @brief: fetch the page at the given url and collect all hyperlinks on it
        """

        hyperlinks = set()
        soup_context = None

        # fetch the page content, then parse it and extract hyperlinks
        html_context = cls.parse_page(url, session)
        if html_context:
            soup_context = BeautifulSoup.BeautifulSoup(html_context)
            if soup_context:
                for each_link in soup_context.findAll('a'):
                    hyperlink = urlparse.urljoin(url, (each_link or {}).get('href'))
                    hyperlinks.add(hyperlink)

        return hyperlinks, soup_context
Project: google_scholar_paper_finder    Author: maikelronnau
def make_soup(markup, parser=None):
        """Factory method returning a BeautifulSoup instance. The created
        instance will use a parser of the given name, if supported by
        the underlying BeautifulSoup instance.
        """
        if 'bs4' in sys.modules:
            # We support parser specification. If the caller didn't
            # specify one, leave it to BeautifulSoup to pick the most
            # suitable one, but suppress the user warning that asks to
            # select the most suitable parser ... which BS then
            # selects anyway.
            if parser is None:
                warnings.filterwarnings('ignore', 'No parser was explicitly specified')
            return BeautifulSoup(markup, parser)

        return BeautifulSoup(markup)
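A possible way to exercise this factory, assuming bs4 is installed and make_soup is reachable as shown (it appears to be a static helper on a parser class, so the call site below is hypothetical):

# Hypothetical usage of the make_soup factory above (call site assumed).
html = "<html><body><h1>Scholar</h1></body></html>"

soup = make_soup(html, "html.parser")   # explicit parser, bs4 only
soup = make_soup(html)                  # let BeautifulSoup pick one, warning suppressed
print(soup.h1.string)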
Project: citations    Author: frederick0329
def make_soup(markup, parser=None):
        """Factory method returning a BeautifulSoup instance. The created
        instance will use a parser of the given name, if supported by
        the underlying BeautifulSoup instance.
        """
        if 'bs4' in sys.modules:
            # We support parser specification. If the caller didn't
            # specify one, leave it to BeautifulSoup to pick the most
            # suitable one, but suppress the user warning that asks to
            # select the most suitable parser ... which BS then
            # selects anyway.
            if parser is None:
                warnings.filterwarnings('ignore', 'No parser was explicitly specified')
            return BeautifulSoup(markup, parser)

        return BeautifulSoup(markup)
Project: DevOps    Author: YoLoveLife
def get_member_attributes(self):
        """ Returns a dictionary of a balancer member's attributes."""

        balancer_member_page = fetch_url(self.module, self.management_url)

        try:
            assert balancer_member_page[1]['status'] == 200
        except AssertionError:
            self.module.fail_json(msg="Could not get balancer_member_page, check for connectivity! " + balancer_member_page[1])
        else:
            try:
                soup = BeautifulSoup(balancer_member_page[0])
            except TypeError:
                self.module.fail_json(msg="Cannot parse balancer_member_page HTML! " + str(soup))
            else:
                subsoup = soup.findAll('table')[1].findAll('tr')
                keys = subsoup[0].findAll('th')
                for valuesset in subsoup[1::1]:
                    if re.search(pattern=self.host, string=str(valuesset)):
                        values = valuesset.findAll('td')
                        return dict((keys[x].string, values[x].string) for x in range(0, len(keys)))
Project: plugin.video.brplay    Author: olavopeixoto
def get_categories():

    url = "http://sexyhotplay.com.br/categorias/"
    html = client.request(url, headers={'Cookie': 'disclaimer-sexyhotplay=1;'})

    soup = bs(html)
    div = soup.find('div', attrs={'class': 'colunas-3-15'})

    links = div.findAll('a', attrs={'class': 'link'}, recursive=True)

    results = []
    for link in links:
        label = link.find('strong').string
        url = 'http://sexyhotplay.com.br' + link['href']
        results.append({
            'name': label,
            # 'clearlogo': os.path.join(artPath, 'logo_sexyhot.png'),
            'url': url
        })

    return results
Project: MalwrAgent    Author: michaelschratt
def f_grab_cmd_from_twitter_profile(profile_name):
        """grab 0xXXXXXXXX tag from profile, tag must match [a-zA-Z0-9_]
        :rtype: string
        :param profile_name: twitter profile name without leading @
        :return: string embedded in the profile description
        """
        url = 'https://twitter.com/%(profile)s'
        payload = {
            'profile': profile_name
        }
        html = requests.get(url % payload)
        soup = soupy(html.text)
        profile_description = soup.find('meta', {'name': 'description'})['content']
        match = re.search('0x(\w+)', profile_description)
        output = match.group(1)  # group 1 consists of match between ( )

        return str(output)
Project: download-manager    Author: thispc
def decrypt(self, pyfile):
        self.pyfile = pyfile

        if self.article.match(pyfile.url):
            html = self.load(pyfile.url)
            soup = BeautifulSoup.BeautifulSoup(
                html, convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)

            links = []
            for a in soup.findAll("a", attrs={'href': self.hoster_links}):
                for decrypted_link in self.decrypt_folder(a.get('href')):
                    links.append(decrypted_link)

            self.packages.append((pyfile.name, links, pyfile.name))
        else:
            self.links = self.decrypt_folder(pyfile.url)
Project: USTC-Today    Author: HengRuiZ
def search(key_word):
    global x
    search_url='http://news.sogou.com/news?ie=utf8&p=40230447&interV=kKIOkrELjboMmLkEkLoTkKIMkLELjb8TkKIMkrELjboImLkEk74TkKILmrELjbgRmLkEkLY=_485898072&query=%E4%B8%AD%E7%A7%91%E5%A4%A7&'
    req=urllib2.urlopen(search_url.replace('key_word',key_word))
    real_visited=0
    html=req.read()
    soup=BeautifulSoup(html)
    #print soup
    content  = soup.findAll(name="a",attrs={"href":True,"data-click":True,"target":True}) #resultset object
    num = len(content)
    #print num
    for i in range(9):
        #extract the title and url of each search result
        p_str= content[2*i] #if no result then nontype object
        tit[i]=p_str.renderContents()
        tit[i]=tit[i].decode('utf-8', 'ignore')#need it
        tit[i]= re.sub("<[^>]+>","",tit[i])
        print(tit[i])
        url[i]=str(p_str.get("href"))
        print(url[i])
        #fetch the thumbnail image for this url and resize it for display
        img[i]=getimg(url[i])
        w, h = img[i].size
        img[i]=resize(w,h, w_box, h_box,img[i])
Project: ProxyIPCrawler    Author: uilliu
def ProxyIPSpider(self):
    # get the proxy
    f = open('proxy.txt', 'w')
    for page in range(1,50):
        url = 'http://www.xicidaili.com/nn/%s' %page
        user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        trs = soup.find('table', {"id":"ip_list"}).findAll('tr')
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol == 'HTTP' or protocol == 'HTTPS':
                f.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s://%s:%s' % (protocol, ip, port)
Project: ProxyIPCrawler    Author: uilliu
def caiji2(self):   # crawl the "domestic proxy IP" pages
    # scrape proxy IPs page by page from haodailiip.com/guonei and write them to proxy.txt
    of = open('proxy.txt', 'w')
    url = 'http://www.haodailiip.com/guonei/'
    for i in range(1,20):
        Url = 'http://www.haodailiip.com/guonei/' + str(i)
        print u"????"+Url
        html = requests.get(Url).text
        bs = BeautifulSoup(html)
        table = bs.find('table',{"class":"proxy_table"})
        tr = table.findAll('tr')
        for i in range(1,31):
            td = tr[i].findAll('td')
            proxy_ip = td[0].text.strip()
            proxy_port = td[1].text.strip()
            of.write('http=%s:%s\n' %(proxy_ip,proxy_port))
            print 'http=%s:%s\n' %(proxy_ip,proxy_port)
        time.sleep(2)
    of.close()
Project: homemadescripts    Author: helioloureiro
def DuckDuckGo(cmd):
    debug(cmd.text)
    q = cmd.text.split()
    if len(q) == 1:
        return
    question = "+".join(q[1:])
    debug("Question=%s" % question)
    req = requests.get("https://duckduckgo.com/html/?q=%s" % question)
    answer = None
    html = bp.BeautifulSoup(req.text)
    responses = html.findAll("div", id="zero_click_abstract")
    try:
        answer = responses[0].text
    except Exception as e:
        print e # get internal
        pass
    if not answer:
        bot.reply_to(cmd, "Não tenho a menor idéia.  Tem de perguntar no google.")
        return
    try:
        bot.reply_to(cmd, answer)
    except Exception as e:
        bot.reply_to(cmd, "Deu merda: %s" % e)
Project: pixiebot    Author: umago
def _update_cache(release):
    LOG.debug('Updating cache for the release "%s"', release)
    url = BASE_URL % release
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page)
    specs = {}
    for link in soup.findAll('a', attrs={'href': re.compile('.html$')}):
        href = link.get('href')
        title = ' '.join(href.replace('.html', '').split('-'))
        link = url + href
        specs[title] = link

    _CACHE[release] = {}
    _CACHE[release]['specs'] = specs
    _CACHE[release]['updated_at'] = datetime.datetime.utcnow()
    LOG.info('Cache updated for the release "%s"', release)
Project: yogame    Author: tivisse
def update_planet_fleet(self, planet):
        resp = self.br.open(self._get_url('fleet', planet))
        soup = BeautifulSoup(resp)
        ships = {}
        for k, v in self.SHIPS.iteritems():
            available = 0
            try:
                s = soup.find(id='button' + v)
                available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
            except:
                available = 0
            ships[k] = available

        #self.logger.info('Updating %s fleet' % planet)
        #self.logger.info('%s' % fleet)
        planet.ships = ships
Project: yogame    Author: tivisse
def update_planet_research(self, planet):
        resp = self.br.open(self._get_url('research', planet))
        soup = BeautifulSoup(resp)
        try:
            ButtonList = soup.find(id='buttonz')
            AllResearchList = ButtonList.findAll('li')
            for research in AllResearchList:
                if research.get('class') == 'on':
                    fb = research.find('a', 'fastBuild')
                    if fb:
                        build_url = fb.get('onclick') if fb else ''
                        build_url = self._parse_research_url(build_url)
                        self.logger.info('Research launched on %s:%s'% (planet, fb.get('title')))
                        self.br.open(build_url)
                        break
        except:
            self.logger.exception('Exception while retrieving researches')
Project: yogame    Author: tivisse
def update_planet_facilities(self, planet):
        resp = self.br.open(self._get_url('station', planet))
        soup = BeautifulSoup(resp)
        try:
            ButtonList = soup.find(id='stationbuilding')
            AllResearchList = ButtonList.findAll('li')
            for research in AllResearchList:
                if research.get('class') == 'on':
                    fb = research.find('a', 'fastBuild')
                    if fb:
                        build_url = fb.get('onclick') if fb else ''
                        build_url = self._parse_research_url(build_url)
                        self.logger.info('Facility upgraded on %s:%s'% (planet, fb.get('title')))
                        self.br.open(build_url)
                        break
        except:
            self.logger.exception('Exception while retrieving facilities statuses')


        return True
Project: BiLSTM-CCM    Author: codedecde
def getFirstPostData(forum_text):
    soup = BeautifulSoup(forum_text)
    title = ""
    date = ""
    body = ""
    try:
        date = soup.find("div", attrs={"class": "postDate"}).text
    except AttributeError:
        print "Date not found"
    try:
        title = soup.find("div", attrs={"class": "postTitle"}).text
    except AttributeError:
        print "Title not found"
    try:
        body = soup.find("div", attrs={"class": "postBody"}).text
    except AttributeError:
        print "Body not found, now this is weird"
    return [title,date,body]
Project: snowballing    Author: JoaoFelipe
def parse(self, html):
        """
        This method initiates parsing of HTML content, cleans resulting
        content as needed, and notifies the parser instance of
        resulting instances via the handle_article callback.
        """
        self.soup = BeautifulSoup(html, "html.parser")

        # This parses any global, non-itemized attributes from the page.
        self._parse_globals()

        # Now parse out listed articles:
        for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
            self._parse_article(div)
            self._clean_article()
            if self.article['title']:
                self.handle_article(self.article)
Project: script.module.metadatautils    Author: marcelveldt
def get_data(self, search_query):
        '''helper method to get data from google images by scraping and parsing'''
        params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
        headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
            IEMobile/7.0; LG; GW910)'}
        html = ''
        try:
            html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
        except Exception as exc:
            log_exception(__name__, exc)
        soup = BeautifulSoup.BeautifulSoup(html)
        results = []
        for div in soup.findAll('div'):
            if div.get("id") == "images":
                for a_link in div.findAll("a"):
                    page = a_link.get("href")
                    try:
                        img = page.split("imgurl=")[-1]
                        img = img.split("&imgrefurl=")[0]
                        results.append(img)
                    except Exception:
                        pass
        return results
Project: script.module.metadatautils    Author: marcelveldt
def get_top250_db(self):
        '''
            get the top250 listing for both movies and tvshows as dict with imdbid as key
            uses 7 day cache to prevent overloading the server
        '''
        results = {}
        for listing in [("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")]:
            html = requests.get(
                "http://www.imdb.com/chart/%s" %
                listing[0], headers={
                    'User-agent': 'Mozilla/5.0'}, timeout=20)
            soup = BeautifulSoup.BeautifulSoup(html.text)
            for table in soup.findAll('table'):
                if table.get("class") == "chart full-width":
                    for td_def in table.findAll('td'):
                        if td_def.get("class") == "titleColumn":
                            a_link = td_def.find("a")
                            if a_link:
                                url = a_link["href"]
                                imdb_id = url.split("/")[2]
                                imdb_rank = url.split(listing[1])[1]
                                results[imdb_id] = try_parse_int(imdb_rank)
        self.write_kodidb(results)
        return results
Project: Anki-Addons    Author: searene
def purgeAttributes(self, mime, _old):
    html = mime.html()
    soup = BeautifulSoup(html)
    newMime = QMimeData()
    for tag in soup.recursiveChildGenerator():
        # remove attributes in the list
        index = -1
        try:
            for key, value in tag.attrs:
                index += 1
                if key != 'style':
                    continue
                new = value.split(';')
                new = ';'.join([s for s in new
                    if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
                tag.attrs[index] = (u'style', new)
        except AttributeError: 
            # 'NavigableString' object has no attribute 'attrs'
            pass

    # assign the modified html to new Mime
    newMime.setHtml(str(soup).decode('utf8'))

    # default _processHtml method
    return _old(self, newMime)
Project: Ohio-Basketball-Scaper    Author: NulledExceptions
def get_address(self,id):

        url= self.base_url.format(school_id=id)
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        #print soup
        for breaks in soup.findAll('br'):
            breaks.extract()
        for ahref in soup.findAll('a'):
            ahref.extract()
        # print soup
        span_content = soup.find("span", {"class": "schoolAddress"})
        if not span_content:
            print span_content, id
            return None
        ##RAISE EXCEPTION INSTEAD OF RETURNING NONE

        address= span_content.renderContents().replace('\n', '').strip()

        return address


Project: Pokemon-Vortex-3-Ultimate-Bot    Author: Kyzaghan
def get_balls(self, response):
        """
        Getting pokéballs
        :param response: Inventory response text (html)
        :return: None
        """
        try:
            ph = BeautifulSoup(response, "html.parser")
            pokeball_div = ph.find_all('div', attrs={"class": "list autowidth"})
            ph = BeautifulSoup(str(pokeball_div[1]), "html.parser")
            ph = BeautifulSoup(str(ph.find_all("tr")), "html.parser")
            i = 0
            for tdList in ph.find_all("td"):
                if i == 3:  # Pokeball
                    self.Pokeball = int(tdList.text)
                elif i == 10:  # Great Ball
                    self.GreatBall = int(tdList.text)
                elif i == 17:  # Ultra Ball
                    self.UltraBall = int(tdList.text)
                elif i == 24:  # Master Ball
                    self.MasterBall = int(tdList.text)
                i += 1
        except Exception as e:
            self.l.writelog(str(e), "critical")
            return None
Project: Pokemon-Vortex-3-Ultimate-Bot    Author: Kyzaghan
def get_pots(self, response):
        """
        Getting pots
        :param response: Inventory response text (html)
        :return: None
        """
        try:
            ph = BeautifulSoup(response, "html.parser")
            pokeball_div = ph.find_all('div', attrs={"class": "list autowidth"})
            ph = BeautifulSoup(str(pokeball_div[0]), "html.parser")
            ph = BeautifulSoup(str(ph.find_all("tr")), "html.parser")
            i = 0
            for tdList in ph.find_all("td"):
                if i == 3:  # Potion
                    self.Potion = int(tdList.text)
                elif i == 10:  # Super Potion
                    self.SuperPotion = int(tdList.text)
                elif i == 17:  # Hyper Potion
                    self.HyperPotion = int(tdList.text)
                i += 1
        except Exception as e:
            self.l.writelog(str(e), "critical")
            return None
Project: Pokemon-Vortex-3-Ultimate-Bot    Author: Kyzaghan
def get_war_status(self, response):
        """
        Parse war status
        :param response: html response
        """
        ph = BeautifulSoup(response, "html.parser")
        tmp_image = ph.find_all("img")
        if len(tmp_image) >= 4:
            self.your_hp = tmp_image[2]['width']
            self.enemy_hp = tmp_image[3]['width']
        else:
            self.enemy_hp = 0
            self.your_hp = 0

        tmp_status = ph.find_all("td", attrs={"valign": "top"})
        if len(tmp_status) >= 2:
            self.your_status = tmp_status[0].text
            self.enemy_status = tmp_status[1].text
        else:
            self.your_status = ""
            self.enemy_status = ""
Project: plugin.video.amazon65    Author: phil65
def parseSubs(data):
    subs = []
    if addon.getSetting('subtitles') == 'false':
        return subs
    for sub in data:
        lang = sub['displayName'].split('(')[0].strip()
        common.Log('Convert %s Subtitle' % lang)
        file = xbmc.translatePath('special://temp/%s.srt' % lang).decode('utf-8')
        soup = BeautifulSoup(common.getURL(sub['url']))
        enc = soup.originalEncoding
        num = 0
        with codecs.open(file, 'w', encoding='utf-8') as srt:
            for caption in soup.findAll('tt:p'):
                num += 1
                subtext = caption.renderContents().decode(enc).replace('<tt:br>', '\n').replace('</tt:br>', '')
                srt.write(u'%s\n%s --> %s\n%s\n\n' % (num, caption['begin'], caption['end'], subtext))
        subs.append(file)
    return subs
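Note that soup.originalEncoding is the BeautifulSoup 3 attribute name; under bs4 the detected input encoding is exposed as original_encoding. A minimal sketch, assuming bs4:

# Minimal sketch, assuming bs4: the sniffed input encoding is available
# as original_encoding (BeautifulSoup 3 spelled it originalEncoding).
from bs4 import BeautifulSoup

raw = u'<html><body>ol\xe1</body></html>'.encode('utf-8')
soup = BeautifulSoup(raw, 'html.parser')
print(soup.original_encoding)  # e.g. 'utf-8'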
Project: slack_scholar    Author: xLeitix
def parse(self, html):
        """
        This method initiates parsing of HTML content, cleans resulting
        content as needed, and notifies the parser instance of
        resulting instances via the handle_article callback.
        """
        self.soup = BeautifulSoup(html)

        # This parses any global, non-itemized attributes from the page.
        self._parse_globals()

        # Now parse out listed articles:
        for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
            self._parse_article(div)
            self._clean_article()
            if self.article['title']:
                self.handle_article(self.article)
Project: toll_road    Author: idosekely
def get_price(self):
        r = requests.get('https://www.fastlane.co.il/mobile.aspx', verify=False)
        parsed_html = BeautifulSoup(r.content)
        price = parsed_html.find('span', attrs={'id': 'lblPrice'}).text
        return int(price)
Project: Python    Author: Guzi219
def GetTotalPage(self, html):
        # create the BeautifulSoup
        some_soup = BeautifulSoup(html)
        ele_a = some_soup.findAll('a', attrs={'class': 'page-numbers'})
        if len(ele_a) > 0:
            last_page_html = ele_a[len(ele_a) - 1]  # get the max page number.
            #print last_page_html

            # get the lastpage number
            pagenum = last_page_html.text

            self.SaveTotalPageToFile(pagenum)

    # store the max page number to totalpage.ini
    #new_page_num: new max page num
Project: jx-sqlite    Author: mozilla
def list(self, prefix=None, marker=None, delimiter=None):
        # https://s3.amazonaws.com/net-mozaws-stage-fx-test-activedata?marker=jenkins-go-bouncer.prod-3019/py27.log
        # <ListBucketResult>
        #     <Name>net-mozaws-stage-fx-test-activedata</Name>
        #     <Prefix/>
        #     <Marker>jenkins-go-bouncer.prod-3019/py27.log</Marker>
        #     <MaxKeys>1000</MaxKeys>
        #     <IsTruncated>true</IsTruncated>
        #     <Contents>
        #         <Key>jenkins-go-bouncer.prod-3020/py27.log</Key>
        #         <LastModified>2017-03-05T07:02:20.000Z</LastModified>
        #         <ETag>"69dcb19e91eb3eec51e1b659801523d6"</ETag>
        #         <Size>10037</Size>
        #         <StorageClass>STANDARD</StorageClass>
        state = Data()
        state.prefix = prefix
        state.delimiter = delimiter
        state.marker = marker
        state.get_more = True

        def more():
            xml = http.get(self.url + "?" + value2url_param(state)).content
            data = BeautifulSoup(xml)

            state.get_more = data.find("istruncated").contents[0] == "true"
            contents = data.findAll("contents")
            state.marker = contents[-1].find("key").contents[0]
            return [{k: t(d.find(k).contents[0]) for k, t in content_keys.items()} for d in contents]

        while state.get_more:
            content = more()
            for c in content:
                yield wrap(c)
Project: v2ex-tornado-2    Author: coderyy
def getNodeDetails(self, node):
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            string = unicode(node.string)
            #Slice needed to remove markup added during unicode conversion,
            #but only in some versions of BeautifulSoup/Python
            if string.startswith('<!') and string.endswith('>'):
                string = string[2:-1]
            m = self.doctype_regexp.match(string)
            #This regexp approach seems wrong and fragile
            #but beautiful soup stores the doctype as a single thing and we want the separate bits
            #It should work as long as the tree is created by html5lib itself but may be wrong if it's
            #been modified at all
            #We could just feed to it a html5lib tokenizer, I guess...
            assert m is not None, "DOCTYPE did not match expected format"

            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1')
            else:
                systemId = m.group('systemId2')
            return _base.DOCTYPE, name, publicId or "", systemId or ""

        elif isinstance(node, Comment):
            string = unicode(node.string)
            if string.startswith('<!--') and string.endswith('-->'):
                string = string[4:-3]
            return _base.COMMENT, string

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            return (_base.ELEMENT, namespaces["html"], node.name,
                    dict(node.attrs).items(), node.contents)
        else:
            return _base.UNKNOWN, node.__class__.__name__
Project: v2ex-tornado-2    Author: coderyy
def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)
Project: v2ex-tornado-2    Author: coderyy
def documentClass(self):
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)
Project: v2ex-tornado-2    Author: coderyy
def elementClass(self, name, namespace):
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        return Element(Tag(self.soup, name), self.soup, namespace)
Project: v2ex-tornado-2    Author: coderyy
def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)
Project: song-cli    Author: ankitmathur3193
def Parse(self,song_name):
        song_name = '+'.join(song_name)
        url="https://www.youtube.com/results?search_query="
        url=url+song_name
        file_download=FileDownload()
        html=file_download.get_html_response(url)
        soup=BeautifulSoup(html)
        download_url = soup.find('a',attrs={'class':'yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink      spf-link '})
        temp_url='https://www.youtube.com'
        final_url=temp_url+download_url.get('href')
        return final_url
Project: song-cli    Author: ankitmathur3193
def missing_schema(self,html,song_name):
        '''
        It will print the list of songs that can be downloaded
        '''
        #html=self.get_html_response(url)
        soup=BeautifulSoup(html)
        name=' '.join(song_name)
        print '%s not found'%name
        print "But you can download any of the following songs :"
        a_list=soup.findAll('a','touch')
        for x in xrange(len(a_list)-1):
            r=a_list[x]
            p=str(r)
            q=re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>','',p)
            print q
Project: song-cli    Author: ankitmathur3193
def check_if_song_name(self,html):
        '''
        Returns true if user entered artist or movie name
        '''
        soup=BeautifulSoup(html)
        a_list=soup.findAll('a','touch')
        #print a_list
        text=[str(x) for x in a_list]
        text=''.join(text)
        text=text.lower()
        string1='download in 48 kbps'
        string2='download in 128 kbps'
        string3='download in 320 kbps'

        href=''
        if string3 in text:
            #print 'Downloading in 320 kbps'
            href=a_list[2].get('href')

        elif string2 in text:
            #print 'Downloading in 128 kbps'
            href=a_list[1].get('href')

        elif string1 in text:
            #print 'Downloading in 48 kbps' 
            href=a_list[0].get('href')
        else:
            return (True,'nothing')     

        return (False,href)
Project: WeenieBot    Author: Beefywhale
def g_search_custom(message, client, search):
    loop = asyncio.get_event_loop()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    search = search.replace(' ', '+')
    async with aiohttp.get('https://www.google.com/search?q={}&start=1&num=1'.format(search), headers=headers) as gr:
        try: 
            from BeautifulSoup import BeautifulSoup
        except ImportError:
            from bs4 import BeautifulSoup
        html = await gr.text()
        results = []
        parsed_html = BeautifulSoup(html, "html.parser")
        for item in parsed_html.find_all('h3', attrs={'class': 'r'}):
            results.append(str(item.a['href']).replace('/url?q=', '').split('&sa=U&ved=')[0])
    await client.send_message(message.channel, 'Top result for `{}`: '.format(search) + ''.join(results[0]))
Project: Learning-Concurrency-in-Python    Author: PacktPublishing
def getLinks():
  req = urllib2.urlopen('http://www.example.com')
  soup = BeautifulSoup(req.read())
  for link in soup.findAll('a'):
    linkArray.append(link.get('href'))
    print(len(linkArray))
Project: touch-pay-client    Author: HackPucBemobi
def expand_html(html, cdict=None):
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")
    soup = BeautifulSoup(html)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    for txt in soup.findAll(text=True):
        if not txt.parent.name in ('a', 'script', 'pre', 'code', 'embed', 'object', 'audio', 'video'):
            ntxt = regex_link.sub(
                lambda match: expand_one(match.group(0), cdict), txt)
            txt.replaceWith(BeautifulSoup(ntxt))
    return str(soup)