Python
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('http://coreyms.com').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['headline', 'summary', 'video_link'])
for article in soup.find_all('article'):
headline = article.h2.a.text
print(headline)
summary = article.find('div', class_='entry-content').p.text
print(summary)
try:
vid_src = article.find('iframe', class_='youtube-player')['src']
vid_id = vid_src.split('/')[4]
vid_id = vid_id.split('?')[0]
yt_link = f'https://youtube.com/watch?v={vid_id}'
except Exception as e:
yt_link = None
print(yt_link)
print()
csv_writer.writerow([headline, summary, yt_link])
csv_file.close()
HTML
<!doctype html>
<html class="no-js" lang="">
<head>
<title>Test - A Sample Website</title>
<meta charset="utf-8">
<link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css">
</head>
<body>
<h1 id='site_title'>Test Website</h1>
<hr></hr>
<div class="article">
<h2><a href="article_1.html">Article 1 Headline</a></h2>
<p>This is a summary of article 1</p>
</div>
<hr></hr>
<div class="article">
<h2><a href="article_2.html">Article 2 Headline</a></h2>
<p>This is a summary of article 2</p>
</div>
<hr></hr>
<div class='footer'>
<p>Footer Information</p>
</div>
<script src="js/vendor/modernizr-3.5.0.min.js"></script>
<script src="js/plugins.js"></script>
<script src="js/main.js"></script>
</body>
</html>
学习更多Python教程