We extracted the following 50 code examples from open-source Python projects to illustrate how to use BeautifulSoup.BeautifulSoup().
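As a quick orientation before the project examples, here is a minimal, self-contained sketch of the call itself. The HTML string, the "ulink" class and the "/page1" href are invented for illustration, and the bs4 fallback import mirrors the pattern several of the examples below use.

# A minimal sketch, assuming either BeautifulSoup 3 or bs4 is installed;
# the markup below is made up for illustration.
try:
    from BeautifulSoup import BeautifulSoup      # BeautifulSoup 3
except ImportError:
    from bs4 import BeautifulSoup                # bs4 keeps findAll as an alias

html = '<html><body><a class="ulink" href="/page1">First</a></body></html>'
soup = BeautifulSoup(html)                       # parse with the default parser
for link in soup.findAll('a', attrs={'class': 'ulink'}):
    print(link['href'])                          # -> /page1

Passing only the markup lets BeautifulSoup pick its default parser, which is the style most of the extracted examples rely on.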
def ParseHtml(self, html):
    soup = BeautifulSoup(html)
    links = soup.findAll('a', attrs={'class': 'ulink'})
    #print len(links)
    if len(links) == 0:
        #the page returned a js redirect instead of links
        # tmp_js = soup.find(name='script', attrs={'language': 'javascript'})
        js_str = soup.script.string  #two ways to get the <script></script>
        new_url = js_str[16:-1]      #get the new url
        new_url = eval(new_url)      #eval the string to get the real url
        self.ParseHtml(self.LoadPage(new_url))
    else:
        # print type(links)
        for link in links:
            # print type(link)
            # print type(link.string)
            # print unicode(link.string)
            titles = re.findall(r'《(.+?)》', str(link.string))  #unicode(link.string))
            if len(titles) != 0:
                print titles[0]
                # print 'url is %s, title is %s.' %(link['href'], titles[0])

def GetTotalPage(self, html):
    # create the BeautifulSoup
    some_soup = BeautifulSoup(html)
    #get the page div
    ele_a = some_soup.find('div', attrs={'class': 'page'})
    #get the last <a> in the div (the "last page" link)
    last_a = ele_a.findAll('a')[-1]
    #strip the trailing ".html" to get the page number
    pagenum = last_a.get('href')[:-5]
    print 'pagenum :', pagenum
    # print type(last_a)
    self.SaveTotalPageToFile(pagenum)

# store the max page number to totalpage.ini
# new_page_num: new max page num

def list_of_all_href(self, html):
    '''
    It will return all hyper links found in the mr-jatt page for download
    '''
    soup = BeautifulSoup(html)
    links = []
    a_list = soup.findAll('a', 'touch')
    for x in xrange(len(a_list) - 1):
        link = a_list[x].get('href')
        name = a_list[x]
        name = str(name)
        name = re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>', '', name)
        name = re.sub(r'^[0-9]+\.', '', name)
        links.append([link, name])
    #quit()
    return links

def crawler(urls, max_urls):
    crawled = Set()
    queued = Set(urls)
    pairs = []
    while urls and len(crawled) < max_urls:
        page = urls.pop(0)
        if is_html(page):
            if page not in crawled:
                try:
                    print(page)
                    links = BeautifulSoup(urllib2.urlopen(page, timeout=5).read(),
                                          parseOnlyThese=SoupStrainer('a'))
                    for link in links:
                        url = domain + link['href']
                        if verify(url) and url not in queued:
                            # print(url)
                            urls.append('http://' + url)
                            # print(urls)
                            queued.add('http://' + url)
                    # print(page)
                    crawled.add(page)
                    # print(crawled)
                except:
                    continue
    return crawled, pairs

def _extract_description(self, result):
    desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
    if not desc_div:
        self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
        return None

    desc_strs = []
    def looper(tag):
        if not tag:
            return
        for t in tag:
            try:
                if t.name == 'br':
                    break
            except AttributeError:
                pass
            try:
                desc_strs.append(t.string)
            except AttributeError:
                desc_strs.append(t)

    looper(desc_div)
    looper(desc_div.find('wbr'))  # BeautifulSoup does not self-close <wbr>

    desc = ''.join(s for s in desc_strs if s)
    return self._html_unescape(desc)

def _extract_description(self, result):
    desc_td = result.findNext('td')
    if not desc_td:
        self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
        return None

    desc_strs = []
    def looper(tag):
        if not tag:
            return
        for t in tag:
            try:
                if t.name == 'br':
                    break
            except AttributeError:
                pass
            try:
                desc_strs.append(t.string)
            except AttributeError:
                desc_strs.append(t)

    looper(desc_td)
    looper(desc_td.find('wbr'))  # BeautifulSoup does not self-close <wbr>

    desc = ''.join(s for s in desc_strs if s)
    return self._html_unescape(desc)

def _get_results_page(self, set_type):
    if set_type == LARGE_SET:
        url = GoogleSets.URL_LARGE
    else:
        url = GoogleSets.URL_SMALL

    safe_items = [urllib.quote_plus(i) for i in self.items]
    blank_items = 5 - len(safe_items)
    if blank_items > 0:
        safe_items += [''] * blank_items

    safe_url = url % tuple(safe_items)

    try:
        page = self.browser.get_page(safe_url)
    except BrowserError, e:
        raise GSError, "Failed getting %s: %s" % (e.url, e.error)

    return BeautifulSoup(page)

def _get_results_page(self):
    if self._page == 0:
        if self._results_per_page == 10:
            url = SponsoredLinks.SEARCH_URL_0
        else:
            url = SponsoredLinks.SEARCH_URL_1
    else:
        if self._results_per_page == 10:
            url = SponsoredLinks.NEXT_PAGE_0
        else:
            url = SponsoredLinks.NEXT_PAGE_1

    safe_url = url % {'query': urllib.quote_plus(self.query),
                      'start': self._page * self._results_per_page,
                      'num': self._results_per_page}

    try:
        page = self.browser.get_page(safe_url)
    except BrowserError, e:
        raise SLError, "Failed getting %s: %s" % (e.url, e.error)

    return BeautifulSoup(page)

def get_content(cls, url=None, session=None):
    """
    @brief: fetch the page at `url` and collect every hyperlink found on it
    """
    hyperlinks = set()
    soup_context = None
    # download the raw page content first
    html_context = cls.parse_page(url, session)
    if html_context:
        soup_context = BeautifulSoup.BeautifulSoup(html_context)
    if soup_context:
        for each_link in soup_context.findAll('a'):
            hyperlink = urlparse.urljoin(url, (each_link or {}).get('href'))
            hyperlinks.add(hyperlink)
    return hyperlinks, soup_context

def make_soup(markup, parser=None):
    """Factory method returning a BeautifulSoup instance.

    The created instance will use a parser of the given name, if
    supported by the underlying BeautifulSoup instance.
    """
    if 'bs4' in sys.modules:
        # We support parser specification. If the caller didn't
        # specify one, leave it to BeautifulSoup to pick the most
        # suitable one, but suppress the user warning that asks to
        # select the most suitable parser ... which BS then
        # selects anyway.
        if parser is None:
            warnings.filterwarnings('ignore', 'No parser was explicitly specified')
        return BeautifulSoup(markup, parser)
    return BeautifulSoup(markup)

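A brief usage sketch of the factory above, with markup invented for illustration: when bs4 is the imported module the parser name is honored, while under BeautifulSoup 3 the argument is silently dropped, which is the behaviour the function intends.

# Hypothetical call to make_soup(); "html.parser" ships with bs4 / the stdlib.
doc = make_soup('<ul><li>one</li><li>two</li></ul>', parser='html.parser')
print([li.string for li in doc.findAll('li')])   # -> the two list items, 'one' and 'two'
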
def get_member_attributes(self):
    """ Returns a dictionary of a balancer member's attributes."""

    balancer_member_page = fetch_url(self.module, self.management_url)

    try:
        assert balancer_member_page[1]['status'] == 200
    except AssertionError:
        self.module.fail_json(msg="Could not get balancer_member_page, check for connectivity! " + balancer_member_page[1])
    else:
        try:
            soup = BeautifulSoup(balancer_member_page[0])
        except TypeError:
            # soup is unbound here, so report the raw page instead
            self.module.fail_json(msg="Cannot parse balancer_member_page HTML! " + str(balancer_member_page[0]))
        else:
            subsoup = soup.findAll('table')[1].findAll('tr')
            keys = subsoup[0].findAll('th')
            for valuesset in subsoup[1::1]:
                if re.search(pattern=self.host, string=str(valuesset)):
                    values = valuesset.findAll('td')
                    return dict((keys[x].string, values[x].string) for x in range(0, len(keys)))

def get_categories():
    url = "http://sexyhotplay.com.br/categorias/"
    html = client.request(url, headers={'Cookie': 'disclaimer-sexyhotplay=1;'})
    soup = bs(html)
    div = soup.find('div', attrs={'class': 'colunas-3-15'})
    links = div.findAll('a', attrs={'class': 'link'}, recursive=True)
    results = []
    for link in links:
        label = link.find('strong').string
        url = 'http://sexyhotplay.com.br' + link['href']
        results.append({
            'name': label,
            # 'clearlogo': os.path.join(artPath, 'logo_sexyhot.png'),
            'url': url
        })
    return results

def f_grab_cmd_from_twitter_profile(profile_name):
    """grab 0xXXXXXXXX tag from profile, tag must match [a-zA-Z0-9_]
    :rtype: string
    :param profile_name: twitter profile name without leading @
    :return: string embedded in the profile description
    """
    url = 'https://twitter.com/%(profile)s'
    payload = {'profile': profile_name}
    html = requests.get(url % payload)
    soup = soupy(html.text)
    profile_description = soup.find('meta', {'name': 'description'})['content']
    match = re.search(r'0x(\w+)', profile_description)
    output = match.group(1)  # group 1 consists of match between ( )
    return str(output)

def decrypt(self, pyfile):
    self.pyfile = pyfile

    if self.article.match(pyfile.url):
        html = self.load(pyfile.url)
        soup = BeautifulSoup.BeautifulSoup(
            html, convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)

        links = []
        for a in soup.findAll("a", attrs={'href': self.hoster_links}):
            for decrypted_link in self.decrypt_folder(a.get('href')):
                links.append(decrypted_link)

        self.packages.append((pyfile.name, links, pyfile.name))
    else:
        self.links = self.decrypt_folder(pyfile.url)

def search(key_word):
    global x
    search_url = 'http://news.sogou.com/news?ie=utf8&p=40230447&interV=kKIOkrELjboMmLkEkLoTkKIMkLELjb8TkKIMkrELjboImLkEk74TkKILmrELjbgRmLkEkLY=_485898072&query=%E4%B8%AD%E7%A7%91%E5%A4%A7&'
    req = urllib2.urlopen(search_url.replace('key_word', key_word))
    real_visited = 0
    html = req.read()
    soup = BeautifulSoup(html)
    #print soup
    content = soup.findAll(name="a", attrs={"href": True, "data-click": True, "target": True})  #resultset object
    num = len(content)
    #print num
    for i in range(9):
        # result anchors come in pairs; take the first of each pair (title + url)
        p_str = content[2 * i]  #if no result then nontype object
        tit[i] = p_str.renderContents()
        tit[i] = tit[i].decode('utf-8', 'ignore')  #need it
        tit[i] = re.sub("<[^>]+>", "", tit[i])
        print(tit[i])
        url[i] = str(p_str.get("href"))
        print(url[i])
        # fetch and resize the thumbnail image for this url
        img[i] = getimg(url[i])
        w, h = img[i].size
        img[i] = resize(w, h, w_box, h_box, img[i])

def ProxyIPSpider(self):
    # get the proxy
    f = open('proxy.txt', 'w')
    for page in range(1, 50):
        url = 'http://www.xicidaili.com/nn/%s' % page
        user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        trs = soup.find('table', {"id": "ip_list"}).findAll('tr')
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol == 'HTTP' or protocol == 'HTTPS':
                f.write('%s=%s:%s\n' % (protocol, ip, port))
                print '%s://%s:%s' % (protocol, ip, port)

def caiji2(self):
    # "haodailiip" proxy site
    # www.haodailiip.com/guonei lists domestic proxy IPs, 30 ip/port rows per page
    of = open('proxy.txt', 'w')
    url = 'http://www.haodailiip.com/guonei/'
    for i in range(1, 20):
        Url = 'http://www.haodailiip.com/guonei/' + str(i)
        print u"Crawling " + Url
        html = requests.get(Url).text
        bs = BeautifulSoup(html)
        table = bs.find('table', {"class": "proxy_table"})
        tr = table.findAll('tr')
        for j in range(1, 31):
            td = tr[j].findAll('td')
            proxy_ip = td[0].text.strip()
            proxy_port = td[1].text.strip()
            of.write('http=%s:%s\n' % (proxy_ip, proxy_port))
            print 'http=%s:%s\n' % (proxy_ip, proxy_port)
        time.sleep(2)
    of.close()

def DuckDuckGo(cmd):
    debug(cmd.text)
    q = cmd.text.split()
    if len(q) == 1:
        return
    question = "+".join(q[1:])
    debug("Question=%s" % question)
    req = requests.get("https://duckduckgo.com/html/?q=%s" % question)
    answer = None
    html = bp.BeautifulSoup(req.text)
    responses = html.findAll("div", id="zero_click_abstract")
    try:
        answer = responses[0].text
    except Exception as e:
        print e  # get internal
        pass
    if not answer:
        bot.reply_to(cmd, "Não tenho a menor idéia. Tem de perguntar no google.")
        return
    try:
        bot.reply_to(cmd, answer)
    except Exception as e:
        bot.reply_to(cmd, "Deu merda: %s" % e)

def _update_cache(release):
    LOG.debug('Updating cache for the release "%s"', release)
    url = BASE_URL % release
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page)

    specs = {}
    for link in soup.findAll('a', attrs={'href': re.compile('.html$')}):
        href = link.get('href')
        title = ' '.join(href.replace('.html', '').split('-'))
        link = url + href
        specs[title] = link

    _CACHE[release] = {}
    _CACHE[release]['specs'] = specs
    _CACHE[release]['updated_at'] = datetime.datetime.utcnow()
    LOG.info('Cache updated for the release "%s"', release)

def update_planet_fleet(self, planet):
    resp = self.br.open(self._get_url('fleet', planet))
    soup = BeautifulSoup(resp)
    ships = {}
    for k, v in self.SHIPS.iteritems():
        available = 0
        try:
            s = soup.find(id='button' + v)
            available = int(s.find('span', 'textlabel').nextSibling.replace('.', ''))
        except:
            available = 0
        ships[k] = available
    #self.logger.info('Updating %s fleet' % planet)
    #self.logger.info('%s' % fleet)
    planet.ships = ships

def update_planet_research(self, planet):
    resp = self.br.open(self._get_url('research', planet))
    soup = BeautifulSoup(resp)
    try:
        ButtonList = soup.find(id='buttonz')
        AllResearchList = ButtonList.findAll('li')
        for research in AllResearchList:
            if research.get('class') == 'on':
                fb = research.find('a', 'fastBuild')
                if fb:
                    build_url = fb.get('onclick') if fb else ''
                    build_url = self._parse_research_url(build_url)
                    self.logger.info('Research launched on %s:%s' % (planet, fb.get('title')))
                    self.br.open(build_url)
                    break
    except:
        self.logger.exception('Exception while retrieving researches')

def update_planet_facilities(self, planet):
    resp = self.br.open(self._get_url('station', planet))
    soup = BeautifulSoup(resp)
    try:
        ButtonList = soup.find(id='stationbuilding')
        AllResearchList = ButtonList.findAll('li')
        for research in AllResearchList:
            if research.get('class') == 'on':
                fb = research.find('a', 'fastBuild')
                if fb:
                    build_url = fb.get('onclick') if fb else ''
                    build_url = self._parse_research_url(build_url)
                    self.logger.info('Facility upgraded on %s:%s' % (planet, fb.get('title')))
                    self.br.open(build_url)
                    break
    except:
        self.logger.exception('Exception while retrieving facilities statuses')
    return True

def getFirstPostData(forum_text):
    soup = BeautifulSoup(forum_text)
    title = ""
    date = ""
    body = ""
    try:
        date = soup.find("div", attrs={"class": "postDate"}).text
    except AttributeError:
        print "Date not found"
    try:
        title = soup.find("div", attrs={"class": "postTitle"}).text
    except AttributeError:
        print "Title not found"
    try:
        body = soup.find("div", attrs={"class": "postBody"}).text
    except AttributeError:
        print "Body not found, now this is weird"
    return [title, date, body]

def parse(self, html):
    """
    This method initiates parsing of HTML content, cleans resulting
    content as needed, and notifies the parser instance of
    resulting instances via the handle_article callback.
    """
    self.soup = BeautifulSoup(html, "html.parser")

    # This parses any global, non-itemized attributes from the page.
    self._parse_globals()

    # Now parse out listed articles:
    for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
        self._parse_article(div)
        self._clean_article()
        if self.article['title']:
            self.handle_article(self.article)

def get_data(self, search_query):
    '''helper method to get data from google images by scraping and parsing'''
    params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; \
        IEMobile/7.0; LG; GW910)'}
    html = ''
    try:
        html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
    except Exception as exc:
        log_exception(__name__, exc)
    soup = BeautifulSoup.BeautifulSoup(html)
    results = []
    for div in soup.findAll('div'):
        if div.get("id") == "images":
            for a_link in div.findAll("a"):
                page = a_link.get("href")
                try:
                    img = page.split("imgurl=")[-1]
                    img = img.split("&imgrefurl=")[0]
                    results.append(img)
                except Exception:
                    pass
    return results

def get_top250_db(self):
    '''
        get the top250 listing for both movies and tvshows as dict with imdbid as key
        uses 7 day cache to prevent overloading the server
    '''
    results = {}
    for listing in [("top", "chttp_tt_"), ("toptv", "chttvtp_tt_")]:
        html = requests.get(
            "http://www.imdb.com/chart/%s" % listing[0],
            headers={'User-agent': 'Mozilla/5.0'}, timeout=20)
        soup = BeautifulSoup.BeautifulSoup(html.text)
        for table in soup.findAll('table'):
            if table.get("class") == "chart full-width":
                for td_def in table.findAll('td'):
                    if td_def.get("class") == "titleColumn":
                        a_link = td_def.find("a")
                        if a_link:
                            url = a_link["href"]
                            imdb_id = url.split("/")[2]
                            imdb_rank = url.split(listing[1])[1]
                            results[imdb_id] = try_parse_int(imdb_rank)
    self.write_kodidb(results)
    return results

def purgeAttributes(self, mime, _old):
    html = mime.html()
    soup = BeautifulSoup(html)
    newMime = QMimeData()
    for tag in soup.recursiveChildGenerator():
        # remove attributes in the list
        index = -1
        try:
            for key, value in tag.attrs:
                index += 1
                if key != 'style':
                    continue
                new = value.split(';')
                new = ';'.join([s for s in new
                                if s.split(':')[0].strip() not in REMOVE_ATTRIBUTES])
                tag.attrs[index] = (u'style', new)
        except AttributeError:
            # 'NavigableString' object has no attribute 'attrs'
            pass
    # assign the modified html to new Mime
    newMime.setHtml(str(soup).decode('utf8'))
    # default _processHtml method
    return _old(self, newMime)

def get_address(self, id):
    url = self.base_url.format(school_id=id)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read())
    #print soup
    for breaks in soup.findAll('br'):
        breaks.extract()
    for ahref in soup.findAll('a'):
        ahref.extract()
    # print soup
    span_content = soup.find("span", {"class": "schoolAddress"})
    if not span_content:
        print span_content, id
        return None  ##RAISE EXCEPTION INSTEAD OF RETURNING NONE
    address = span_content.renderContents().replace('\n', '').strip()
    return address

def get_balls(self, response):
    """
    Getting pokéballs
    :param response: Inventory response text (html)
    :return: None
    """
    try:
        ph = BeautifulSoup(response, "html.parser")
        pokeball_div = ph.find_all('div', attrs={"class": "list autowidth"})
        ph = BeautifulSoup(str(pokeball_div[1]), "html.parser")
        ph = BeautifulSoup(str(ph.find_all("tr")), "html.parser")
        i = 0
        for tdList in ph.find_all("td"):
            if i == 3:
                # Pokeball
                self.Pokeball = int(tdList.text)
            elif i == 10:
                # Great Ball
                self.GreatBall = int(tdList.text)
            elif i == 17:
                # Ultra Ball
                self.UltraBall = int(tdList.text)
            elif i == 24:
                # Master Ball
                self.MasterBall = int(tdList.text)
            i += 1
    except Exception as e:
        self.l.writelog(str(e), "critical")
        return None

def get_pots(self, response):
    """
    Getting pots
    :param response: Inventory response text (html)
    :return: None
    """
    try:
        ph = BeautifulSoup(response, "html.parser")
        pokeball_div = ph.find_all('div', attrs={"class": "list autowidth"})
        ph = BeautifulSoup(str(pokeball_div[0]), "html.parser")
        ph = BeautifulSoup(str(ph.find_all("tr")), "html.parser")
        i = 0
        for tdList in ph.find_all("td"):
            if i == 3:
                # Potion
                self.Potion = int(tdList.text)
            elif i == 10:
                # Super Potion
                self.SuperPotion = int(tdList.text)
            elif i == 17:
                # Hyper Potion
                self.HyperPotion = int(tdList.text)
            i += 1
    except Exception as e:
        self.l.writelog(str(e), "critical")
        return None

def get_war_status(self, response):
    """
    Parse war status
    :param response: html response
    """
    ph = BeautifulSoup(response, "html.parser")
    tmp_image = ph.find_all("img")
    if len(tmp_image) >= 4:
        self.your_hp = tmp_image[2]['width']
        self.enemy_hp = tmp_image[3]['width']
    else:
        self.enemy_hp = 0
        self.your_hp = 0
    tmp_status = ph.find_all("td", attrs={"valign": "top"})
    if len(tmp_status) >= 2:
        self.your_status = tmp_status[0].text
        self.enemy_status = tmp_status[1].text
    else:
        self.your_status = ""
        self.enemy_status = ""

def parseSubs(data):
    subs = []
    if addon.getSetting('subtitles') == 'false':
        return subs
    for sub in data:
        lang = sub['displayName'].split('(')[0].strip()
        common.Log('Convert %s Subtitle' % lang)
        file = xbmc.translatePath('special://temp/%s.srt' % lang).decode('utf-8')
        soup = BeautifulSoup(common.getURL(sub['url']))
        enc = soup.originalEncoding
        num = 0
        with codecs.open(file, 'w', encoding='utf-8') as srt:
            for caption in soup.findAll('tt:p'):
                num += 1
                subtext = caption.renderContents().decode(enc).replace('<tt:br>', '\n').replace('</tt:br>', '')
                srt.write(u'%s\n%s --> %s\n%s\n\n' % (num, caption['begin'], caption['end'], subtext))
        subs.append(file)
    return subs

def parse(self, html):
    """
    This method initiates parsing of HTML content, cleans resulting
    content as needed, and notifies the parser instance of
    resulting instances via the handle_article callback.
    """
    self.soup = BeautifulSoup(html)

    # This parses any global, non-itemized attributes from the page.
    self._parse_globals()

    # Now parse out listed articles:
    for div in self.soup.findAll(ScholarArticleParser._tag_results_checker):
        self._parse_article(div)
        self._clean_article()
        if self.article['title']:
            self.handle_article(self.article)

def get_price(self):
    r = requests.get('https://www.fastlane.co.il/mobile.aspx', verify=False)
    parsed_html = BeautifulSoup(r.content)
    price = parsed_html.find('span', attrs={'id': 'lblPrice'}).text
    return int(price)

def GetTotalPage(self, html):
    # create the BeautifulSoup
    some_soup = BeautifulSoup(html)
    ele_a = some_soup.findAll('a', attrs={'class': 'page-numbers'})
    if len(ele_a) > 0:
        last_page_html = ele_a[len(ele_a) - 1]  # get the max page number.
        #print last_page_html
        # get the lastpage number
        pagenum = last_page_html.text
        self.SaveTotalPageToFile(pagenum)

# store the max page number to totalpage.ini
# new_page_num: new max page num

def list(self, prefix=None, marker=None, delimiter=None):
    # https://s3.amazonaws.com/net-mozaws-stage-fx-test-activedata?marker=jenkins-go-bouncer.prod-3019/py27.log
    # <ListBucketResult>
    #     <Name>net-mozaws-stage-fx-test-activedata</Name>
    #     <Prefix/>
    #     <Marker>jenkins-go-bouncer.prod-3019/py27.log</Marker>
    #     <MaxKeys>1000</MaxKeys>
    #     <IsTruncated>true</IsTruncated>
    #     <Contents>
    #         <Key>jenkins-go-bouncer.prod-3020/py27.log</Key>
    #         <LastModified>2017-03-05T07:02:20.000Z</LastModified>
    #         <ETag>"69dcb19e91eb3eec51e1b659801523d6"</ETag>
    #         <Size>10037</Size>
    #         <StorageClass>STANDARD</StorageClass>
    state = Data()
    state.prefix = prefix
    state.delimiter = delimiter
    state.marker = marker
    state.get_more = True

    def more():
        xml = http.get(self.url + "?" + value2url_param(state)).content
        data = BeautifulSoup(xml)

        state.get_more = data.find("istruncated").contents[0] == "true"
        contents = data.findAll("contents")
        state.marker = contents[-1].find("key").contents[0]
        return [{k: t(d.find(k).contents[0]) for k, t in content_keys.items()} for d in contents]

    while state.get_more:
        content = more()
        for c in content:
            yield wrap(c)

def getNodeDetails(self, node):
    if isinstance(node, BeautifulSoup):  # Document or DocumentFragment
        return (_base.DOCUMENT,)

    elif isinstance(node, Declaration):  # DocumentType
        string = unicode(node.string)
        #Slice needed to remove markup added during unicode conversion,
        #but only in some versions of BeautifulSoup/Python
        if string.startswith('<!') and string.endswith('>'):
            string = string[2:-1]
        m = self.doctype_regexp.match(string)
        #This regexp approach seems wrong and fragile
        #but BeautifulSoup stores the doctype as a single thing and we want the separate bits
        #It should work as long as the tree is created by html5lib itself but may be wrong if it's
        #been modified at all
        #We could just feed it an html5lib tokenizer, I guess...
        assert m is not None, "DOCTYPE did not match expected format"
        name = m.group('name')
        publicId = m.group('publicId')
        if publicId is not None:
            systemId = m.group('systemId1')
        else:
            systemId = m.group('systemId2')
        return _base.DOCTYPE, name, publicId or "", systemId or ""

    elif isinstance(node, Comment):
        string = unicode(node.string)
        if string.startswith('<!--') and string.endswith('-->'):
            string = string[4:-3]
        return _base.COMMENT, string

    elif isinstance(node, unicode):  # TextNode
        return _base.TEXT, node

    elif isinstance(node, Tag):  # Element
        return (_base.ELEMENT, namespaces["html"], node.name,
                dict(node.attrs).items(), node.contents)
    else:
        return _base.UNKNOWN, node.__class__.__name__

def __init__(self, namespaceHTMLElements):
    if namespaceHTMLElements:
        warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
    _base.TreeBuilder.__init__(self, namespaceHTMLElements)

def documentClass(self):
    self.soup = BeautifulSoup("")
    return Element(self.soup, self.soup, None)

def elementClass(self, name, namespace):
    if namespace is not None:
        warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
    return Element(Tag(self.soup, name), self.soup, namespace)

def fragmentClass(self):
    self.soup = BeautifulSoup("")
    self.soup.name = "[document_fragment]"
    return Element(self.soup, self.soup, None)

def Parse(self, song_name):
    song_name = '+'.join(song_name)
    url = "https://www.youtube.com/results?search_query="
    url = url + song_name
    file_download = FileDownload()
    html = file_download.get_html_response(url)
    soup = BeautifulSoup(html)
    download_url = soup.find('a', attrs={'class': 'yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link '})
    temp_url = 'https://www.youtube.com'
    final_url = temp_url + download_url.get('href')
    return final_url

def missing_schema(self, html, song_name):
    '''
    It will print the list of songs that can be downloaded
    '''
    #html=self.get_html_response(url)
    soup = BeautifulSoup(html)
    name = ' '.join(song_name)
    print '%s not found' % name
    print "But you can download any of the following songs :"
    a_list = soup.findAll('a', 'touch')
    for x in xrange(len(a_list) - 1):
        r = a_list[x]
        p = str(r)
        q = re.sub(r'<a.*/>|<span.*">|</span>|</a>|<a.*html">|<font.*">|</font>', '', p)
        print q

def check_if_song_name(self, html):
    '''
    Returns true if user entered artist or movie name
    '''
    soup = BeautifulSoup(html)
    a_list = soup.findAll('a', 'touch')
    #print a_list
    text = [str(x) for x in a_list]
    text = ''.join(text)
    text = text.lower()

    string1 = 'download in 48 kbps'
    string2 = 'download in 128 kbps'
    string3 = 'download in 320 kbps'
    href = ''

    if string3 in text:
        #print 'Downloading in 320 kbps'
        href = a_list[2].get('href')
    elif string2 in text:
        #print 'Downloading in 128 kbps'
        href = a_list[1].get('href')
    elif string1 in text:
        #print 'Downloading in 48 kbps'
        href = a_list[0].get('href')
    else:
        return (True, 'nothing')
    return (False, href)

async def g_search_custom(message, client, search):
    loop = asyncio.get_event_loop()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    search = search.replace(' ', '+')
    async with aiohttp.get('https://www.google.com/search?q={}&start=1&num=1'.format(search), headers=headers) as gr:
        try:
            from BeautifulSoup import BeautifulSoup
        except ImportError:
            from bs4 import BeautifulSoup
        html = await gr.text()
        results = []
        parsed_html = BeautifulSoup(html, "html.parser")
        for item in parsed_html.find_all('h3', attrs={'class': 'r'}):
            results.append(str(item.a['href']).replace('/url?q=', '').split('&sa=U&ved=')[0])
        await client.send_message(message.channel, 'Top result for `{}`: '.format(search) + ''.join(results[0]))

def getLinks():
    req = urllib2.urlopen('http://www.example.com')
    soup = BeautifulSoup(req.read())
    for link in soup.findAll('a'):
        linkArray.append(link.get('href'))
    print(len(linkArray))

def expand_html(html, cdict=None):
    if not have_soup:
        raise RuntimeError("Missing BeautifulSoup")
    soup = BeautifulSoup(html)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    for txt in soup.findAll(text=True):
        if not txt.parent.name in ('a', 'script', 'pre', 'code', 'embed',
                                   'object', 'audio', 'video'):
            ntxt = regex_link.sub(
                lambda match: expand_one(match.group(0), cdict), txt)
            txt.replaceWith(BeautifulSoup(ntxt))
    return str(soup)