The following 37 code examples, extracted from open-source Python projects, illustrate how to use bs4.SoupStrainer().
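All of the examples share one pattern: build a SoupStrainer that describes the tags or attributes you care about, then hand it to BeautifulSoup through the parse_only argument so only that part of the document is parsed. The minimal sketch below shows the pattern in isolation; the HTML snippet and variable names are illustrative and do not come from any of the projects that follow.

from bs4 import BeautifulSoup, SoupStrainer

# Illustrative document; the projects below feed in downloaded pages instead.
html = """
<html><body>
  <a href="https://example.com/a">first</a>
  <p>ignored paragraph</p>
  <a href="https://example.com/b">second</a>
</body></html>
"""

# Parse only <a> tags that carry an href attribute; everything else is skipped.
only_links = SoupStrainer('a', href=True)
soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)

for link in soup.find_all('a'):
    print(link['href'])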
def getJournalURL(jname):
    # get journal URL given the journal name for retrieving article PIIs
    urlstr = "http://api.elsevier.com/sitemap/page/sitemap/" + jname[0].lower() + ".html"
    retl = ""
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                if link.text.lower() == jname.lower():
                    #print(link["href"])
                    retl = link["href"]
                    break
            linkcnt += 1
    return retl
def __init__(self, data, encoding=None):
    """
    Initialize serializer class
    :param data: original data
    :param encoding: encoding type of your original data
    """
    self.data = data
    if not self.data:
        raise ValueError("You must input origin data to this class")
    # if no encoding is supplied, fall back to UnicodeDammit (chardet) to detect it
    self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
    self.encoding = None if self.encoding == "utf-8" else self.encoding
    # initialize beautiful soup
    # only_content_div = SoupStrainer("body")
    self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
def get_title(html):
    """
    Get the title element from a HTML document

    :param str html: The html to parse

    :Example:
        >>> Link.get_title("xxxx<title>Title</title>xxxx")
        'Title'
        >>> print(Link.get_title("xxxx<>Title</title>xxxx"))
        None
    """
    bs = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('title'))
    title = bs.find("title")
    if not title:
        return None
    if not title.string:
        return None
    return title.string.strip().replace('\n', ' ')
def get_child_urls(main_page, max_child=20):
    """retrieve urls from a given html page.
    args:
        main_page(str): html file.
        max_child(int): max number of returned urls.
    return:
        list of url strings.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
def __get_menu_items(self, url, soupstrainer_parser_selector, routing_action, video_dictionary_action=None):
    response = requests.get(url)
    tiles = SoupStrainer('a', soupstrainer_parser_selector)
    soup = BeautifulSoup(response.content, "html.parser", parse_only=tiles)
    listing = []
    for tile in soup.find_all(class_="tile"):
        link_to_video = tile["href"]
        thumbnail, title = self.__get_thumbnail_and_title(tile)
        video_dictionary = None
        if video_dictionary_action is not None:
            video_dictionary = video_dictionary_action(tile)

        item = helperobjects.TitleItem(title, {'action': routing_action, 'video': link_to_video},
                                       False, thumbnail, video_dictionary)
        listing.append(item)
    return listing
def read(self):
    with io.open(self.filename, 'rb') as dhtml_file:
        def strain(name, attrs):
            if name == 'title':
                return True
            if name == 'div' and dict(attrs).get('id', None) in self.ids:
                return True
            return False

        soup = BeautifulSoup(dhtml_file, "lxml", parse_only=SoupStrainer(strain))
        parser = html_parser.HTMLParser()
        self.title = parser.unescape(soup.title.decode_contents()) if soup.title else _('Untitled')
        for an_id in self.ids:
            found_elements = soup.find_all(id=an_id)
            if found_elements:
                [element] = found_elements
                self.elements[an_id] = element.decode_contents()
            else:
                self.elements[an_id] = ''
        self.original_encoding = soup.original_encoding
def get_lyrics_with_urls(urls):  # TODO
    ret = []
    for url in urls:
        time.sleep(3)
        print(url)
        response = urlopen(url, timeout=5)
        content = response.read()
        for lyrics in bs(content, "html.parser", parse_only=SoupStrainer('p')):
            if(lyrics.has_attr('style')):
                lyrics = re.sub('</?br/?>', '\n', str(lyrics))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                lyrics = re.sub('\n', ' \n', str(lyrics))
                ret.append(lyrics)
                print(lyrics)
                print(str(get_sentiment(lyrics)))
    return ret
def get_lyrics(artist, song):
    artist = format_artist(artist)
    song = format_song(song)
    time.sleep(1)
    url = LYRICS_URL.format(artist, song)
    content = None
    try:
        response = urlopen(url)
        content = response.read()
    except Exception as e:
        print(url)
        print(e)
        print("failed\n")
        return None
    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    for l in soup:
        for lyrics in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if "start of lyrics" in lyrics or "Usage" in lyrics:
                lyrics = re.sub('</?br/?>', '', str(lyrics.parent))
                lyrics = re.sub('<.*?>', '', str(lyrics))
                return str(lyrics)
def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml', parse_only=SoupStrainer('div'))

    ### accounts for categories with over 200 pages
    link = soup.find('a', href=True, text='next page')
    if (link != None):
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except:
            print ("Error: Unable to thread.")

    ### sends links of wikipedia articles in the category to be scraped
    pages_in_category = soup.find('div', {'id':'mw-pages'}).find('div', {'class':'mw-category'})
    for obj in pages_in_category.findAll('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print (count)
            count += 1
def get_soup(game_html):
    """
    Uses Beautiful Soup to parse the html document.

    Some parsers work for some pages but don't work for others... I'm not sure why, so I just try
    them all here in order.

    :param game_html: html doc

    :return: "soupified" html and player_shifts portion of html (it's a bunch of td tags)
    """
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})
    soup = BeautifulSoup(game_html.text, "lxml", parse_only=strainer)
    soup = soup.select('td.+.bborder')

    if len(soup) == 0:
        soup = BeautifulSoup(game_html.text, "html.parser", parse_only=strainer)
        soup = soup.select('td.+.bborder')

        if len(soup) == 0:
            soup = BeautifulSoup(game_html.text, "html5lib")
            soup = soup.select('td.+.bborder')

    return soup
def collectArticles(urlstr):
    # get article PIIs
    retl = []
    with urllib.request.urlopen(urlstr) as url:
        response = url.read()
        linkcnt = 0
        for link in BeautifulSoup(response, parse_only=SoupStrainer("a")):
            if linkcnt == 0:
                linkcnt += 1
                continue
            if link.has_attr("href"):
                #print(link["href"])
                retl.append(link["href"])
            linkcnt += 1
    return retl
def get_links(url):
    '''
    Get all the links off of the page:
        gd2.mlb.com/components/game/mlb/year/month/day/
    And finds the links for the games that have the following format:
        gid_year_mm_dd_team1mlb_team2mlb
    '''
    f = get_page(url)
    if f == False:
        return False

    # Compile the regex to match links outside of the loop for
    # performance
    links = []
    regex = re.compile("\"gid_(.*?)\"", re.IGNORECASE)

    # Find all links on page and if they are links to games then add to list
    for link in BeautifulSoup(f, "lxml", parse_only=SoupStrainer('a', href=True)):
        match = regex.findall(str(link))
        if match:
            links.extend(match)

    return links
def _check_latest_version(self, url, package, package_regex, current_version, ud, d):
    """
    Return the latest version of a package inside a given directory path
    If error or no version, return ""
    """
    valid = 0
    version = ['', '', '']

    bb.debug(3, "VersionURL: %s" % (url))
    soup = BeautifulSoup(self._fetch_index(url, ud, d), "html.parser", parse_only=SoupStrainer("a"))
    if not soup:
        bb.debug(3, "*** %s NO SOUP" % (url))
        return ""

    for line in soup.find_all('a', href=True):
        bb.debug(3, "line['href'] = '%s'" % (line['href']))
        bb.debug(3, "line = '%s'" % (str(line)))

        newver = self._parse_path(package_regex, line['href'])
        if not newver:
            newver = self._parse_path(package_regex, str(line))

        if newver:
            bb.debug(3, "Upstream version found: %s" % newver[1])
            if valid == 0:
                version = newver
                valid = 1
            elif self._vercmp(version, newver) < 0:
                version = newver

    pupver = re.sub('_', '.', version[1])

    bb.debug(3, "*** %s -> UpstreamVersion = %s (CurrentVersion = %s)" %
            (package, pupver or "N/A", current_version[1]))

    if valid:
        return pupver

    return ""
def _parse_multiple_apps(self, list_response):
    """Extracts app ids from a list's Response object, sends GET requests to
    each app, parses detailed info and returns all apps in a list.

    :param list_response: the Response object from a list request
    :return: a list of app dictionaries
    """
    list_strainer = SoupStrainer('span', {'class': 'preview-overlay-container'})
    soup = BeautifulSoup(list_response.content, 'lxml', parse_only=list_strainer)

    app_ids = [x.attrs['data-docid'] for x in soup.select('span.preview-overlay-container')]
    responses = multi_app_request(app_ids)

    app_strainer = SoupStrainer('div', {'class': 'main-content'})
    apps = []
    errors = []
    for i, r in enumerate(responses):
        if r is not None and r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content, 'lxml', parse_only=app_strainer)
            apps.append(self._parse_app_details(soup))
        else:
            errors.append(app_ids[i])

    if errors:
        self._log.error("There was an error parsing the following apps: {errors}.".format(
            errors=", ".join(errors)))

    return apps
def get_categories():
    """
    Sends a GET request to the front page (base url of the app store),
    parses and returns a list of all available categories.

    Note: May contain some promotions, e.g. "Popular Characters"
    """
    categories = {}
    strainer = SoupStrainer('a', {'class': 'child-submenu-link'})

    response = send_request('GET', s.BASE_URL)
    soup = BeautifulSoup(response.content, 'lxml', parse_only=strainer)
    category_links = soup.select('a.child-submenu-link')
    age = '?age='

    for cat in category_links:
        url = urljoin(s.BASE_URL, cat.attrs['href'])
        category_id = url.split('/')[-1]
        name = cat.string

        if age in category_id:
            category_id = 'FAMILY'
            url = url.split('?')[0]
            name = 'Family'

        if category_id not in categories:
            categories[category_id] = {
                'name': name,
                'url': url,
                'category_id': category_id}

    return categories
def resolve_title(url):
    # grab the first title if there's more than one
    try:
        pnk_log(mod, "Requesting %s" % url)
        r = pnk_request(url)
        response_text = r.text
        for title in BeautifulSoup(response_text, 'html.parser', parse_only=SoupStrainer('title')):
            return title.text.strip()
    except:
        return None
def getSingle(s):
    # load in your friends dictionary
    structDir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'Structs'))
    with open(structDir + '/friendsDict.pkl', 'rb') as input:
        friendsDict = pickle.load(input)

    # -------------- Now, let's compile a list of friends who are single ------------
    Single = []
    iteration = 1
    relatStrainer = SoupStrainer(text=re.compile("Single</div>"))
    relatExt = "/about?section=relationship&pnref=about"
    relatExtBeta = "&sk=about&section=relationship"
    fbook = "https://facebook.com"

    for friend in friendsDict:
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt
        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text, "lxml", parse_only=relatStrainer)
        comment = soup.find(text=re.compile("Single</div>"))
        if (comment != None):
            # since some names have special characters, we need to strip these
            temp = friend.encode('utf-8').strip()
            Single.append(temp + "\n")
        print friend + " is single = " + str(comment != None)
        # print iteration
        iteration += 1

    # print Single
    singleStr = ''.join(Single)
    with open(structDir + "/single.txt", "wb") as f:
        f.write(singleStr)
def getFriendsList(friends, part, s):
    ID = vanity
    if (part == 1):
        index = 0
    elif (part == 2):
        index = 24
    elif (part == 3):
        index = 24 + 36
    else:
        index = 24 + 36 + 36

    # fetch the friends page to scrape their total number of friends
    temp = s.get('https://www.facebook.com/' + ID + '/friends')
    soup = BeautifulSoup(temp.text, "lxml")
    strainer = SoupStrainer('a', href=re.compile("fref=fr_tab"))

    # iterate over the entire friends list and pull out the relevant information from
    # the html docs that display 24 or 36 friends each
    while (index < (numFriends)):
        if index == 0:
            temp = s.get('https://m.facebook.com/' + ID + '/friends')
            soup = BeautifulSoup(temp.text, "lxml", parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = 24 + 36 * 3
        else:
            temp = (s.get('https://m.facebook.com/' + ID + '/friends?startindex=' + str(index)))
            soup = BeautifulSoup(temp.text, "lxml", parse_only=strainer)
            tempLst = soup.findAll('a')
            for item in tempLst:
                friends.append(item)
            index = index + 36 * 4
    return
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
def scrape(webpage, extension=".mid"):
    # Get all the files of a given extension from a webpage
    http = httplib2.Http()
    status, response = http.request(webpage)

    files = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_attr('href'):
            linkname = link['href']
            if linkname[-len(extension):] == extension:
                files += [linkname]
    return files
def get_film_info_subhd():
    items = []
    target_url = 'http://subhd.com'
    content = urllib2.urlopen(target_url).read().decode('utf-8')
    only_hotl_tags = SoupStrainer(class_='hotl')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0
    for link in soup.find_all('a', limit=7):
        link_url = target_url + link.get('href')
        link_img = target_url + link.findChildren('img')[0].get('src')
        cover_img = 'http://img3.doubanio.com/view/movie_poster_cover/spst/public/' + \
            link_img.split('/sub/poster/l/')[1]
        link_title = link.findChildren('img')[0].get('title')
        save_path = os.path.abspath("./icons/icon-s")
        imgData = urllib2.urlopen(cover_img).read()
        fileName = save_path + str(i) + '.jpg'
        output = open(fileName, 'wb+')
        output.write(imgData)
        output.close()
        json_item = dict(title=link_title, subtitle='', arg=link_url,
                         icon='icons/icon-s' + str(i) + '.jpg')
        items.append(json_item)
        i = i + 1
    return generate_xml(items)
def __get_version(self):
    '''
    get jenkins version
    :return:
    '''
    try:
        html = urllib2.urlopen(self.url + '/login?from=%2F').read()
        links = SoupStrainer('a', href=re.compile(VERSION_TAG))
        version_text = BeautifulSoup(html, "html.parser", parse_only=links)
        if version_text.text != "":
            color_output("[+]....jenkins version is %s" % version_text.text)
            version_re = re.findall(u"ver.\s(.*)", version_text.text)
            if len(version_re) != 0:
                if version_re[0][0:4] >= self.check_version:
                    self.user_link = ASYNCH_PEOPEL_PERFIX
                else:
                    self.user_link = PEOPLE_PERFIX
        else:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
    except urllib2.URLError, e:
        color_output("[-]....can't get jenkins version!")
        sys.exit()
    except Exception, e:
        color_output("[-]....get version error:%s" % str(e))
        sys.exit()
def scrape(url):
    ### opens url so it's like a file
    try:
        link = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        return ''
    soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=SoupStrainer('p'))
    alltxt = ''
    ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        alltxt = alltxt + para.get_text() + ' '
    return alltxt
def scrape(bun):
    ### opens url so it's like a file
    link = urllib.request.urlopen(bun.URL)
    soup = None
    ### flag for retrieving categories (or not)
    if bun.categories:
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml')
    else:
        p_tags = SoupStrainer('p')
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=p_tags)

    ### dictionary of paragraphs
    doc = {}
    ### add token and count to replace paragraphs in HTML
    token = 'Waka'
    count = 0
    ### all the paragraph texts in one string
    alltxt = ''

    ### iterate thru the <p> tags
    for para in soup.find_all('p'):
        ### put raw text in dictionary
        doc[token + str(count)] = para.get_text()
        alltxt = alltxt + para.get_text() + ' '
        ### replace <p> contents with a token
        para.string = token + str(count)
        count += 1

    ### get the list of categories
    cats = []
    if bun.categories:
        for cat in soup.find('div', {'id': 'catlinks'}).find('ul').findAll('li'):
            cats.append('https://en.wikipedia.org' + cat.find('a')['href'])

    for css in soup.find_all('link', rel='stylesheet'):
        css['href'] = '//en.wikipedia.org/' + css['href']
    for js in soup.find_all('script', src=re.compile('.*')):
        js['src'] = '//en.wikipedia.org/' + js['src']

    ### update stuff in Bundle
    bun.paragraphs = doc
    bun.text = alltxt
    bun.html = str(soup.encode('ascii', 'xmlcharrefreplace').decode('utf-8'))
    bun.categories = cats
    return bun
def __init__(self, text_blob, *args, **kwargs):
    TextParser.text_strainer = SoupStrainer(TextParser.strain_through)
    self.soup = BeautifulSoup(text_blob, 'html.parser', parse_only=TextParser.text_strainer)
    self.text = self._extract_text()
def get_tuko():
    tuko_url = 'https://www.tuko.co.ke'
    if check_connection(tuko_url):
        tuko = requests.get(tuko_url)
        soup = BeautifulSoup(tuko.text, 'lxml', parse_only=SoupStrainer('a'))
        tuko = []
        for link in soup.select('a.news__link', limit=6):
            news_title = '{}({})'.format(link.get_text(), link.get('href'))
            tuko_link = requests.get(link.get('href'))
            soup_link = BeautifulSoup(tuko_link.text, 'lxml', parse_only=SoupStrainer(['p', 'meta', 'img']))
            try:
                article_date = soup_link.find("meta", itemprop="datePublished")['content']
            except (TypeError, ValueError):
                print('Tuko: No article date meta')
                continue
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='article-image__picture')['src']
                except (TypeError, ValueError):
                    print('Tuko: No image found')
            news_dict = {
                'category': 'news',
                'source': 'tuko',
                'title': link.get_text(),
                'link': link.get('href'),
                'image': image,
                'content': [link_inner.get_text().strip(' ,.-') for link_inner in
                            soup_link.select('p.align-left > strong', limit=3)
                            if not link_inner.get_text().startswith('READ ALSO')],
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link.get('href')}, news_dict, upsert=True)
            tuko.append(news_dict)
        return tuko
def get_capital():
    capital_url = 'http://www.capitalfm.co.ke/news/{}/{:02}'.format(today.year, today.month)
    if check_connection(capital_url):
        capital = requests.get(capital_url)
        soup = BeautifulSoup(capital.text, 'lxml', parse_only=SoupStrainer('div'))
        capital = []
        for article in soup.select('div.entry-information'):
            article_link = article.a
            link = article_link['href']
            title = article_link.get_text()
            capital_link = requests.get(link)
            soup_link = BeautifulSoup(capital_link.text, 'lxml', parse_only=SoupStrainer(['meta', 'img', 'div']))
            article_date = soup_link.find("meta", property="article:published_time")['content']
            image = ''
            try:
                image = soup_link.find("meta", property="og:image")['content']
            except (TypeError, ValueError):
                try:
                    image = soup_link.find('img', class_='size-full')['src']
                except (TypeError, ValueError):
                    print('Capital: No image found')
            try:
                content = get_content(soup_link, 'entry-content').split('\u2013')[1].strip()
            except IndexError:
                content = get_content(soup_link, 'entry-content').strip()
            news_dict = {
                'category': 'news',
                'source': 'capital',
                'title': title,
                'link': link,
                'image': image,
                'content': content,
                'date': article_date,
                'date_added': datetime.datetime.utcnow()
            }
            collection.update({'link': link}, news_dict, upsert=True)
            capital.append(news_dict)
        return capital
def discover_domains(subdomain_id, request_result_text):
    # retrieve subdomain object
    subdomain = Subdomain.objects.get(id=subdomain_id)

    # Create and start logger
    logger = create_logger('discover_{0}.log'.format(subdomain.id))
    logger.info('discover {0} START'.format(subdomain.id))

    # keep list of extracted subdomains to limit db queries
    extracted_subdomain = []

    for link in BeautifulSoup(request_result_text, 'html.parser',  # todo use lxml to speed things up
                              parseOnlyThese=SoupStrainer('a')):
        # todo this only saves 'href' attributes in 'a' elements, can be missing valid entries
        if link.has_attr('href'):
            href = link['href']
            extract_result = extract_subdomain(href)
            if extract_result not in extracted_subdomain:
                extracted_subdomain.append(extract_result)
                new_subdomain = import_subdomain(href, discovered_by=subdomain)
                logger.info('discover found {0}'.format(new_subdomain))

    logger.info('discover {0} DONE'.format(subdomain_id))

    # release memory
    gc.collect()
def run(self):
    while True:
        data = self._queue_data.get()
        self._index = data[0]
        html_contents = data[1]
        html_contents = re.sub('<br />', '\n', html_contents)
        only_main3 = SoupStrainer(class_="main3")
        soup_only_main3 = BeautifulSoup(
            html_contents, 'html.parser', parse_only=only_main3)

        # stop once too many consecutive empty pages have been seen
        if self._num_empty > 1000:
            break

        # skip pages whose main block only contains the "deleted" placeholder text
        if soup_only_main3.get_text(strip=True) == self._delete:
            self._num_empty += 1
            continue
        else:
            self._num_empty = 0

        title_poetry = soup_only_main3.find(class_='son1').h1.string

        soup_only_main3.find(class_='son2').p.span.decompose()
        dynasty_poetry = soup_only_main3.find(class_='son2').p.string
        soup_only_main3.find(class_='son2').p.decompose()

        soup_only_main3.find(class_='son2').p.span.decompose()
        author_poetry = soup_only_main3.find(class_='son2').p.string
        soup_only_main3.find(class_='son2').p.decompose()

        soup_only_main3.find(class_='son2').p.decompose()
        soup_only_main3.find(class_='yizhu').decompose()
        content_poetry = soup_only_main3.find(
            class_='cont', id='cont').get_text()
        content_poetry = re.sub('[\n]+', '\n', content_poetry)
        content_poetry = content_poetry.strip('\n')

        path_html, path_txt = get_output_path(dynasty_poetry, self._index)

        file_html = open(path_html, 'w')
        file_html.writelines(data[1].encode('utf-8'))
        file_html.close()

        file_txt = open(path_txt, 'w')
        file_txt.writelines(title_poetry.encode('utf-8') + '\n')
        file_txt.writelines(dynasty_poetry.encode('utf-8') + '\n')
        file_txt.writelines(author_poetry.encode('utf-8') + '\n')
        file_txt.writelines(content_poetry.encode('utf-8') + '\n')
        file_txt.close()

        print '-----------------------------------------------------------'
        print 'Parser: ', self._index
        print 'Title:', title_poetry
        print 'Dynasty:', dynasty_poetry
        print 'Author:', author_poetry
        print 'Content:\n', content_poetry
        print 'Parser finish'
def _check_latest_version_by_dir(self, dirver, package, package_regex, current_version, ud, d):
    """
    Scan every directory in order to get upstream version.
    """
    version_dir = ['', '', '']
    version = ['', '', '']

    dirver_regex = re.compile("(?P<pfx>\D*)(?P<ver>(\d+[\.\-_])+(\d+))")
    s = dirver_regex.search(dirver)
    if s:
        version_dir[1] = s.group('ver')
    else:
        version_dir[1] = dirver

    dirs_uri = bb.fetch.encodeurl([ud.type, ud.host,
            ud.path.split(dirver)[0], ud.user, ud.pswd, {}])
    bb.debug(3, "DirURL: %s, %s" % (dirs_uri, package))
    soup = BeautifulSoup(self._fetch_index(dirs_uri, ud, d), "html.parser", parse_only=SoupStrainer("a"))
    if not soup:
        return version[1]

    for line in soup.find_all('a', href=True):
        s = dirver_regex.search(line['href'].strip("/"))
        if s:
            sver = s.group('ver')

            # When prefix is part of the version directory it need to
            # ensure that only version directory is used so remove previous
            # directories if exists.
            #
            # Example: pfx = '/dir1/dir2/v' and version = '2.5' the expected
            # result is v2.5.
            spfx = s.group('pfx').split('/')[-1]

            version_dir_new = ['', sver, '']
            if self._vercmp(version_dir, version_dir_new) <= 0:
                dirver_new = spfx + sver
                path = ud.path.replace(dirver, dirver_new, True) \
                            .split(package)[0]
                uri = bb.fetch.encodeurl([ud.type, ud.host, path,
                        ud.user, ud.pswd, {}])

                pupver = self._check_latest_version(uri,
                        package, package_regex, current_version, ud, d)
                if pupver:
                    version[1] = pupver

                version_dir = version_dir_new

    return version[1]
def extract_links(response_content, unique=False, blacklist_domains=[],
                  whitelist_domains=[], regex=None, zen_path=None,
                  blacklist_extensions=[], whitelist_extensions=[]):
    """Extract links from a response content.

    Args:
        response_content (str): The HTML page received in a Response Object.
        unique (bool): A parameter defining if the list can contain duplicates.
            Defaults to False.
        blacklist_domains (list): List of domains to exclude from the result.
        whitelist_domains (list): List of domains to include from the result.
        regex (list): A regular expression filter on the link. Defaults to None.
        zen_path (list): A selector to restrict the XPath to parse with bs4.

    Returns:
        links (list): A list of extracted and filtered links.
    """
    if any([item in blacklist_domains for item in whitelist_domains]) \
            or any([item in blacklist_extensions for item in whitelist_extensions]):
        raise LinkExtractorException('blacklist_domains and whitelist_domains '
                                     'can`t contain common value(s).')

    soup = BeautifulSoup(
        response_content,
        "html.parser",
        parse_only=SoupStrainer('a')
    )
    links = [a.text for a in soup]

    if unique:
        links = list(set(links))
    if regex:
        links = filter_links(links, regex)
    if whitelist_domains:
        for domn in whitelist_domains:
            links = filter_links(links, domn.replace('.', '\.'), include=True)
    if blacklist_domains:
        for domn in blacklist_domains:
            links = filter_links(links, domn.replace('.', '\.'), include=False)
    if whitelist_extensions:
        for ext in whitelist_extensions:
            links = filter_links(links, ext.replace('.', '\.'), include=True)
    if blacklist_extensions:
        for ext in blacklist_extensions:
            links = filter_links(links, ext.replace('.', '\.'), include=False)

    return links
def getFriendsBirthdays(birthdays, friendsDict, s):
    # --------- Getting Birthday Info -----------
    relatStrainer = SoupStrainer(text=re.compile("Birthday"))
    relatExt = "/about"
    relatExtBeta = "&sk=about"
    fbook = "https://facebook.com"
    # ***** Note: will have to perform additional string methods because scraping from main page
    for friend in friendsDict:
        if (friendsDict[friend].find("php") != -1):
            relatURL = fbook + friendsDict[friend] + relatExtBeta
        else:
            relatURL = fbook + friendsDict[friend] + relatExt
        relatInfo = s.get(relatURL)
        soup = BeautifulSoup(relatInfo.text, "lxml", parse_only=relatStrainer)
        subString = soup.find(text=re.compile("Birthday"))
        if (subString != None):
            # Cut off everything before Birthday
            stringIndex = subString.find('Birthday')
            subString = subString[stringIndex:]

            # Cut off the prefix to get the birthdate and everything after
            stringIndex = subString.find('<div>')
            subString = subString[(stringIndex + 5):]

            # Get rid of everything after the birthday
            stringIndex = subString.find('</div>')
            subString = subString[:stringIndex]

            # Standardize the birthday date by cutting off the year if there is one
            commaIndex = subString.find(',')
            if (commaIndex != -1):
                subString = subString[:commaIndex]

            if (subString in birthdays):
                birthdays[subString].append(friend)
            else:
                birthdays[subString] = [friend]

            print friend + " has birthday " + subString
    return
def get_film_info_dytt():
    items = []
    target_url = 'http://www.dy2018.com/'
    content = urllib2.urlopen(target_url).read()
    content = unicode(content, 'GBK').encode('utf-8')
    only_hotl_tags = SoupStrainer(class_='co_content222')
    soup = BeautifulSoup(content, "html.parser", parse_only=only_hotl_tags)
    i = 0
    key = re.compile(r'?(.+?)?')
    for link in soup.find_all('li', limit=8):
        if i != 0:
            link_url = target_url + link.findChildren('a')[0].get('href')
            link_time = link.findChildren('span')[0].string
            link_title = link.findChildren('a')[0].get('title')[5:]
            file_name = re.findall(u'?(.*?)[?|?]', link_title)[0]
            # print file_name.encode("utf-8")
            douban_api = 'https://api.douban.com/v2/movie/search?q=' + file_name.encode("utf-8")
            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            headers = {'User-Agent': user_agent}
            req = urllib2.Request(douban_api, None, headers)
            api_content = urllib2.urlopen(req)
            json_content = json.load(api_content)['subjects'][0]['images']['small']
            img_url = json_content
            # print img_url
            save_path = os.path.abspath("./icons/icon")
            img_data = urllib2.urlopen(img_url).read()
            file_name = save_path + str(i) + '.jpg'
            output = open(file_name, 'wb+')
            output.write(img_data)
            output.close()
            json_item = dict(title=link_title, subtitle='??: ' + link_time, arg=link_url,
                             icon='icons/icon' + str(i) + '.jpg')
            items.append(json_item)
        i = i + 1
    return generate_xml(items)

# print(get_film_info_dytt())
def get_standard():
    standard_url = 'https://www.standardmedia.co.ke/'
    if check_connection(standard_url):
        standard = requests.get(standard_url)
        soup = BeautifulSoup(standard.text, 'lxml', parse_only=SoupStrainer('div'))
        standard = []
        for link in soup.select('.col-xs-8.zero a', limit=11):
            if link.get_text():
                news_title = '{}({})'.format(link.get_text().strip(), link.get('href'))
                standard_link = requests.get(link.get('href'))
                soup_link = BeautifulSoup(standard_link.text, 'lxml', parse_only=SoupStrainer(['script', 'div']))
                try:
                    data = json.loads(soup_link.find('script', type='application/ld+json').text.replace("\\", r"\\"))
                    article_date = data['dateModified']
                    image = data['image']['url']
                    if image == 'https://www.standardmedia.co.ke':
                        image = ''
                except (ValueError, AttributeError):
                    print('Standard: invalid json detected')
                    continue
                try:
                    content = get_content(soup_link, 'main-article')
                except AttributeError:
                    try:
                        content = get_content(soup_link, 'story')
                    except AttributeError:
                        print('Standard: No content found')
                        continue
                news_dict = {
                    'category': 'news',
                    'source': 'standard',
                    'title': link.get_text().strip(),
                    'link': link.get('href'),
                    'image': image,
                    'content': content,
                    'date': article_date,
                    'date_added': datetime.datetime.utcnow()
                }
                collection.update({'link': link.get('href')}, news_dict, upsert=True)
                standard.append(news_dict)
        return standard