我是python初学者。
我正在尝试抓取Google Play商店并导出到csv文件。但是我收到一条错误消息。
UnicodeEncodeError: 'cp949' codec can't encode character '\u20a9' in position 90: illegal multibyte sequence
这是我的源代码。
当我用 print 输出结果时,一切正常;但在导出到 csv 文件时,就会显示上面的错误消息。
请帮我
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import codecs
import json
import pickle
from datetime import datetime
import sys
import csv
import os


def _stripped_text(tag):
    """Return the text content of ``tag`` with surrounding whitespace removed."""
    return tag.get_text().strip()


# Fetch and parse the search-results page that lists the apps to visit.
req = 'https://play.google.com/store/search?q=hana&c=apps&num=300'
the_page = urllib.request.urlopen(req).read()
soup = BeautifulSoup(the_page)

# Every <div class="details"> on the results page carries a link to one app.
for div in soup.findAll('div', {'class': 'details'}):
    app_url = div.find('a', {'class': 'title'}).get('href')

    # Download and parse that app's own page.
    app_page = urllib.request.urlopen('https://play.google.com' + app_url).read()
    app_soup = BeautifulSoup(app_page)

    app_details = {}

    title_div = app_soup.find('div', {'class': 'document-title'})
    app_details['title'] = _stripped_text(title_div.find('div'))

    subtitle = app_soup.find('a', {'class': 'document-subtitle primary'})
    app_details['developer'] = _stripped_text(subtitle)
    app_details['developer_link'] = subtitle.get('href').strip()

    # The price label is the last <span> nested in the "price buy" span.
    price_spans = app_soup.find('span', {'class': 'price buy'}).find_all('span')
    price = _stripped_text(price_spans[-1])
    if price == 'Install':
        price = 'Free'
    else:
        # Drop the label's last four characters (presumably a trailing
        # " Buy" -- not verifiable from this script), then re-trim.
        price = price[:-4].strip()
    app_details['price'] = price

    # Values exposed through <meta itemprop=... content=...>.
    for key, itemprop in (('rating', 'ratingValue'), ('reviewers', 'ratingCount')):
        meta = app_soup.find('meta', {'itemprop': itemprop})
        app_details[key] = meta.get('content').strip()

    # 'downloads' is only recorded when the page has a numDownloads element.
    num_downloads_div = app_soup.find('div', {'itemprop': 'numDownloads'})
    if num_downloads_div:
        app_details['downloads'] = _stripped_text(num_downloads_div)

    # Values exposed as the text of <div itemprop=...>.
    for key, itemprop in (
        ('date_published', 'datePublished'),
        ('operating_system', 'operatingSystems'),
        ('content_rating', 'contentRating'),
    ):
        app_details[key] = _stripped_text(app_soup.find('div', {'itemprop': itemprop}))

    # The category text is stored exactly as returned, without stripping.
    app_details['category'] = app_soup.find('span', {'itemprop': 'genre'}).get_text()

    # NOTE: this runs once per app. The file is reopened in 'w' mode each
    # time and no encoding is given, so the platform default codec is used.
    with open('result.csv', 'w') as csv_file:  # Just use 'w' mode in 3.x
        writer = csv.DictWriter(csv_file, app_details.keys())
        writer.writeheader()
        writer.writerow(app_details)
Python 3以语言环境默认编码打开文本文件;如果该编码无法处理您尝试写入的Unicode值,请选择其他编解码器:
with open('result.csv', 'w', encoding='UTF-8', newline='') as f:
这样会把所有 Unicode 字符串编码为 UTF-8,而 UTF-8 可以表示 Unicode 标准中的全部字符。
请注意,csv 模块建议您使用 newline='' 打开文件,以防止换行符被自动转换(否则在某些平台上会多出空行)。
csv
newline=''
您还需要在 for 循环之外只打开一次文件:
for
# Open the output file once, before the loop, so that every app is written
# to the same file instead of each iteration overwriting the previous one.
# encoding='UTF-8' lets characters such as U+20A9 (the won sign) be written
# even when the locale codec (cp949 here) cannot encode them; newline='' is
# what the csv module asks for so it can control line endings itself.
with open('result.csv', 'w', encoding='UTF-8', newline='') as f:
    # Fixed column order for the header and every row.
    fields = (
        'title', 'developer', 'developer_link', 'price', 'rating',
        'reviewers', 'downloads', 'date_published', 'operating_system',
        'content_rating', 'category',
    )
    # DictWriter requires the field names. A row that lacks a key (for
    # example 'downloads', which is only set when the page provides it)
    # gets an empty cell, because restval defaults to ''.
    w = csv.DictWriter(f, fields)
    w.writeheader()
    for div in soup.findAll('div', {'class': 'details'}):
        #
        # build app_details
        #
        w.writerow(app_details)