The following 40 code examples, extracted from open-source Python projects, illustrate how to use scrapy.loader.ItemLoader().
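Before the project examples, here is a minimal, self-contained sketch of the typical ItemLoader workflow. The spider, item, and selectors are illustrative assumptions for quotes.toscrape.com, not taken from any of the projects below:

import scrapy
from scrapy.loader import ItemLoader


class QuoteItem(scrapy.Item):
    # Hypothetical item used only for this sketch.
    text = scrapy.Field()
    author = scrapy.Field()


class QuotesSketchSpider(scrapy.Spider):
    name = "quotes_sketch"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css(".quote"):
            # Bind the loader to a selector so relative CSS/XPath queries work.
            loader = ItemLoader(item=QuoteItem(), selector=quote)
            loader.add_css("text", ".text::text")
            loader.add_css("author", ".author::text")
            # load_item() runs the configured processors and returns the item;
            # without output processors every populated field is a list.
            yield loader.load_item()

Every example below follows the same pattern: construct the loader against a response or selector, feed it values with add_css(), add_xpath(), or add_value(), and finish with load_item().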
def parse(self, response):
    mongoClient = mongodb_client('localhost', 27017)
    list = []
    print "************************"
    # print response.xpath('//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()').extract()
    player_away = response.xpath('//table[@id="J_away_content"]/tbody/tr')
    player_home = response.xpath('//table[@id="J_home_content"]/tbody/tr')
    if player_away:
        for player in player_away:
            playerName = player.xpath('td/a/text()').extract()
            if playerName:
                list.append(playerName + player.xpath('td/text()').extract())
        for player in player_home:
            playerName = player.xpath('td/a/text()').extract()
            if playerName:
                list.append(playerName + player.xpath('td/text()').extract())
    # print response.xpath('//div[@class="message"]/h2/text()').extract()
    print "************************"

    time = response.xpath('//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()').extract()
    team = response.xpath('//div[@class="message"]/p/a/text()').extract()
    score = response.xpath('//div[@class="message"]/h2/text()').extract()
    url = response.url
    g = game(time, team, score, list, url)
    print g.__dict__
    # json_g = parsejson(g)
    # print json_g
    # g = ItemLoader(game(), response=response)
    # g.add_xpath('time', '//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()')
    # g.add_xpath('team', '//div[@class="message"]/p/a/text()')
    # g.add_xpath('score', '//div[@class="message"]/h2/text()')
    # g.add_value('players', list)
    # return g.load_item()

    client = mongoClient.connect()
    db = mongoClient.useDB(client, "hupu_data")
    print mongoClient.insert_one(db, "games", g.__dict__)
def parse_question(self, response):
    # Extract the question fields from the page and load them into a question item
    question_id = response.meta.get("zhihu_id", "")
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def _extract_item(self, response):
    # Use scrapy shell to analyse the response while debugging:
    # inspect_response(response, self)
    # Or open the response Scrapy received in a browser to inspect the page:
    # open_in_browser(response)

    # Extract the fields
    l = ItemLoader(response=response, item=MyspiderItem(), type='html')
    l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
    l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
    l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
    l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
    l.add_value('url', response.url)

    # load_item() normally returns a scrapy.Item; scrapy-redis stores items as JSON
    # in Redis, so convert the item to a plain dict that can be serialized directly.
    return dict(l.load_item())
def parse_first_page(self, response):
    count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".html", '')
    for x in xrange(1, count + 1):
        suffix = ".html"
        if x > 1:
            suffix = "_" + str(x) + ".html"
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request

    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    yield l.load_item()
def parse_first_page(self, response):
    count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".shtml", '')
    # print u'', count, title, albumURL
    for x in xrange(1, count + 1):
        suffix = ".shtml"
        if x > 1:
            suffix = "_" + str(x) + ".shtml"
        # print u'', albumURL + suffix
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request

    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield l.load_item()
def parse(self, response):
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
def parse_question(self, response):
    question_pattern = re.compile('(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = re.match(question_pattern, response.url)
    question_id = match_object.group(2)

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
    item = item_loader.load_item()

    yield item
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
def parse(self, response):
    try:
        l = ItemLoader(item=MovieItem(), response=response)
        l.add_value('name', response.css('div#content h1 [property="v:itemreviewed"]::text').extract_first().strip())
        year = response.css('div#content h1 span.year::text').extract_first()
        if year.startswith('('):
            year = year[1:-1]
        l.add_value('year', year)

        newStrL = []
        for val in response.css('div#info::text').extract():
            newStr = val.strip().strip('/')
            if newStr != '':
                newStrL.append(newStr)
                if len(newStrL) == 2:
                    break
        if len(newStrL) == 2:
            l.add_value('region', newStrL[0].split('/'))
            l.add_value('language', newStrL[1].split('/'))

        l.add_value('duration', response.css('div#info [property="v:runtime"]::attr(content)').extract_first())
        l.add_value('types', response.css('div#info [property="v:genre"]::text').extract())
        l.add_value('directors', response.css('div#info [rel="v:directedBy"]::text').extract())
        l.add_value('actors', response.css('div#info [rel="v:starring"]::text').extract())
        l.add_value('runtime', response.css('div#info [property="v:initialReleaseDate"]::text').extract())
        l.add_value('detailurl', response.url)
        l.add_value('IMDburl', response.css('div#info [rel="nofollow"]::attr(href)').extract())
        l.add_value('stars', response.css('strong[property="v:average"]::text').extract_first())
        return l.load_item()
    except Exception:
        pass
def parse_item(self, response):
    url = response.url
    item_idx = self.all_urls[url]
    self.logger.info("Trying page %s %s" % (item_idx, url))
    resp_dct = json.loads(response.body)

    l = ItemLoader(item=HeatMapItem(), response=response)
    current_hour = time.strftime("%Y%m%d%H", time.localtime())
    l.add_value('cur_hour', current_hour)
    l.add_value('serial', item_idx)
    l.add_value('data', resp_dct.pop('data'))
    l.add_value('timestamp', resp_dct.pop('nt'))
    l.add_value('others', resp_dct)
    l.add_value('url', url)
    l.add_value('is_parsed', 0)

    self.finished.add(item_idx)
    self.logger.info(u"Crawling %s, %s successfully. :)" % (item_idx, url))
    self.claim_completeness()
    yield l.load_item()
    # else:
    #     if resp_dct.get("data") == "\\u8be5\\u7528\\u6237\\u8bbf\\u95ee\\u6b21\\u6570\\u8fc7\\u591a".decode(
    #             'unicode_escape'):  # "this user has made too many requests"
    #         banned_cookie = response.request.cookies
    #         self.logger.warning("%s has been BANNED today." % banned_cookie)
    #         self.cookies.remove(banned_cookie)
    #         yield {"BannedCookieToday": banned_cookie}
    #     else:
    #         yield {}
    #     self.logger.error(u"Crawling %s, %s failed. :(" % (item_idx, response.url))
def parse(self, response):
    l = ItemLoader(item=Area(), response=response)
    l.add_value('id', parse_qs(response.xpath('//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0])['area_id'][0])
    l.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
    l.add_value('updated', datetime.utcnow().isoformat())  # you can also use literal values
    return l.load_item()
    # self.log('URL: {}'.format(response.url))
def _extract_item(self, response):
    # Extract the fields
    l = ItemLoader(response=response, item=MyspiderItem(), type='html')
    l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
    l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
    l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
    l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
    l.add_value('url', response.url)

    # load_item() normally returns a scrapy.Item; scrapy-redis stores items as JSON
    # in Redis, so convert the item to a plain dict that can be serialized directly.
    return dict(l.load_item())
def get_details(self, response):
    self.log('Starting the second parsing phase')
    loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)
    # Load the values obtained in the first phase
    loader.add_value('name', response.meta['name'])
    language = response.meta['language']
    loader.add_value('stable_release', response.meta['stable_version'])
    loader.add_value('release_date', response.meta['rel_date'])

    descr = response.xpath('//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]').extract_first()
    cleaned_descr = cleanhtml(descr)
    loader.add_value('description', cleaned_descr)

    license_found = False
    for row in response\
            .xpath('//*[@id="mw-content-text"]/div/table[position()<=3]/tr'):
        header = row.xpath('./th/a/text() | ./th/text()').extract_first()
        key, value = self.get_key_value(header, row)
        if key:
            if key == 'license':
                # If we find the license in the main page, we will use it
                license_found = True
            loader.add_value(key, value)

    # If we did not find the license in the main page,
    # we will use the license found on the start page
    if not license_found:
        loader.add_value('license', response.meta['license'])

    return {
        "item": loader.load_item(),
        "language": language  # We need to return the language separately in order to manage the many-to-many relation
    }

# Given a couple (key, elem) obtained during the scraping, it returns the valid couple (key1, value1)
# to add to the db. If key is not valid, it will return the tuple (None, None)
def parse_book(self, response):
    book_loader = ItemLoader(item=BookItem(), response=response)
    book_loader.default_input_processor = MapCompose(remove_tags)
    book_loader.default_output_processor = TakeFirst()

    book_loader.add_xpath("title", "//div[@class='col-sm-6 product_main']/h1")
    book_loader.add_xpath("price", "//p[@class='price_color']")
    book_loader.add_xpath("upc", "//table[@class='table table-striped']/tr[1]/td")
    book_loader.add_xpath("product_type", "//table[@class='table table-striped']/tr[2]/td")
    book_loader.add_xpath("tax", "//table[@class='table table-striped']/tr[5]/td")
    book_loader.add_xpath("stock", "//table[@class='table table-striped']/tr[6]/td")
    book_loader.add_xpath("reviews", "//table[@class='table table-striped']/tr[7]/td")
    book_loader.add_xpath("rating", "//p[@class='instock availability']/following-sibling::p/@class")
    yield book_loader.load_item()
def parse_book(self, response):
    book_loader = ItemLoader(item=BookItem(), response=response)
    book_loader.default_input_processor = MapCompose(remove_tags)

    book_loader.add_value("image_urls", response.urljoin(response.css(".item.active > img::attr(src)").extract_first()))
    book_loader.add_css("title", ".col-sm-6.product_main > h1", TakeFirst())
    book_loader.add_css("price", ".price_color", TakeFirst())
    book_loader.add_css("upc", ".table.table-striped > tr:nth-child(1) > td", TakeFirst())
    book_loader.add_css("product_type", ".table.table-striped > tr:nth-child(2) > td", TakeFirst())
    book_loader.add_css("tax", ".table.table-striped > tr:nth-child(5) > td", TakeFirst())
    book_loader.add_css("stock", ".table.table-striped > tr:nth-child(6) > td", TakeFirst())
    book_loader.add_css("reviews", ".table.table-striped > tr:nth-child(7) > td", TakeFirst())
    book_loader.add_css("rating", ".star-rating::attr(class)", TakeFirst())
    return book_loader.load_item()
def parse(self, response):
    for quote in response.css(".quote"):
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css("text", ".text")
        loader.add_css("by", ".author")
        loader.add_css("tags", ".tag")
        yield loader.load_item()
def parse(self, response):
    for country in response.css(".col-md-4, .country"):
        item = ItemLoader(item=CountryItem(), selector=country)
        item.add_css("country", ".country-name")
        item.add_css("capital", ".country-capital::text")
        item.add_css("population", ".country-population::text")
        item.add_css("area", ".country-area::text")
        yield item.load_item()
def parse(self, response):
    # l = ItemLoader(item=LianjiaItem(), response=response)
    for i in range(0, len(response.xpath("//div[@class='info-panel']/h2/a/text()").extract())):
        l = ItemLoader(item=LianjiaItem(), response=response)
        info = response.xpath("//div[@class='info-panel']/h2/a/text()").extract()[i].encode('utf-8')
        local = response.xpath("//div[@class='info-panel']").xpath(".//span[@class='region']/text()").extract()[i].encode('utf-8')
        house_layout = response.xpath("//div[@class='info-panel']").xpath(".//span[@class='zone']//text()").extract()[i].encode('utf-8')
        house_square = response.xpath("//div[@class='info-panel']").xpath(".//span[@class='meters']/text()").extract()[i].encode('utf-8')
        house_orientation = response.xpath("//div[@class='info-panel']").xpath(".//div[@class='where']//span/text()").extract()[(i + 1) * 4 - 1].encode('utf-8')
        district = response.xpath("//div[@class='info-panel']").xpath(".//div[@class='con']/a/text()").extract()[i].encode('utf-8')[:-6]
        floor = response.xpath("//div[@class='info-panel']").xpath(".//div[@class='con']//text()").extract()[(i + 1) * 5 - 3].encode('utf-8')
        building_year = response.xpath("//div[@class='info-panel']").xpath(".//div[@class='con']//text()").extract()[(i + 1) * 5 - 1].encode('utf-8')
        price_month = response.xpath("//div[@class='info-panel']").xpath(".//span[@class='num']//text()").extract()[(i + 1) * 2 - 2].encode('utf-8')
        person_views = response.xpath("//div[@class='info-panel']").xpath(".//span[@class='num']//text()").extract()[(i + 1) * 2 - 1].encode('utf-8')

        tags = []
        for j in range(0, len(response.xpath("//div[@class='view-label left']")[i].xpath(".//span//text()").extract())):
            tags.append(response.xpath("//div[@class='view-label left']")[i].xpath(".//span//text()").extract()[j].encode("utf-8"))

        l.add_value('info', info)
        l.add_value('local', local)
        l.add_value('house_layout', house_layout)
        l.add_value('house_square', house_square)
        l.add_value('house_orientation', house_orientation)
        l.add_value('district', district)
        l.add_value('floor', floor)
        l.add_value('building_year', building_year)
        l.add_value('price_month', price_month)
        l.add_value('person_views', person_views)
        l.add_value('tags', tags)
        print l
        yield l.load_item()
def parse(self, response):
    # l = ItemLoader(item=ItjuziItem(), response=response)
    jsonresponse = json.loads(response.body_as_unicode())
    for i in range(0, len(jsonresponse['data']['list'])):
        l = ItemLoader(item=LianjiaErshouItem(), response=response)
        house_code = jsonresponse['data']['list'][i]['house_code']
        price_total = jsonresponse['data']['list'][i]['price_total']
        ctime = jsonresponse['data']['list'][i]['ctime']
        title = jsonresponse['data']['list'][i]['title']
        frame_hall_num = jsonresponse['data']['list'][i]['frame_hall_num']
        tags = jsonresponse['data']['list'][i]['tags']
        house_area = jsonresponse['data']['list'][i]['house_area']
        community_id = jsonresponse['data']['list'][i]['community_id']
        community_name = jsonresponse['data']['list'][i]['community_name']
        is_two_five = jsonresponse['data']['list'][i]['is_two_five']
        frame_bedroom_num = jsonresponse['data']['list'][i]['frame_bedroom_num']

        l.add_value('house_code', house_code)
        l.add_value('price_total', price_total)
        l.add_value('ctime', ctime)
        l.add_value('title', title)
        l.add_value('frame_hall_num', frame_hall_num)
        l.add_value('tags', tags)
        l.add_value('house_area', house_area)
        l.add_value('community_id', community_id)
        l.add_value('community_name', community_name)
        l.add_value('is_two_five', is_two_five)
        l.add_value('frame_bedroom_num', frame_bedroom_num)
        print l
        yield l.load_item()
def parse_question(self, response):
    # Extract the question fields from the page and load them into a question item
    if "QuestionHeader-title" in response.text:
        # New page layout
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
    else:
        # Old page layout: extract the item with the old selectors
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def join_tags(value):
    return ','.join([i for i in value if i])

# Used as the output_processor of an ItemLoader field.
# ItemLoader collects every field as a list of values; to turn the list of tags
# into a single comma-separated string, attach this function as the field's output_processor.
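The comments above point at the general rule: by default every field an ItemLoader produces is a list. A minimal sketch of how join_tags can be wired up as an output_processor, assuming a hypothetical ArticleItem and build_item helper (note that the processors module is scrapy.loader.processors in older Scrapy releases and itemloaders.processors in newer ones):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors in newer Scrapy


def join_tags(value):
    # Receives the full list of collected values and returns a single string.
    return ','.join([i for i in value if i])


class ArticleItem(scrapy.Item):
    # Hypothetical item for this sketch: 'title' keeps only the first value,
    # 'tags' is collapsed into one comma-separated string by join_tags.
    title = scrapy.Field(output_processor=TakeFirst())
    tags = scrapy.Field(output_processor=join_tags)


def build_item(response):
    loader = ItemLoader(item=ArticleItem(), response=response)
    loader.add_css('title', 'h1::text')
    loader.add_css('tags', '.tag::text')
    # 'title' comes out as a string and 'tags' as 'a,b,c' instead of lists.
    return loader.load_item()

The same processors can also be set on the loader class itself (default_output_processor) or passed per call, as several of the examples on this page do.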
def parse_page(self, response):
    # Parse an image page
    # print u'~~~~', unicode(response.body, "gbk").encode("utf8")
    # print(self.config["xpathImagesPath"])
    # print(response.xpath(self.config["xpathImagesPath"]))
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.config["id"])
    l.add_value('url', response.url)
    if self.config.has_key("imageUrlReplacement"):
        l.add_value('replace', self.config["imageUrlReplacement"])
    if self.config.has_key("xpathImagesPath"):
        l.add_xpath('image_urls', self.config["xpathImagesPath"])
    if self.config.has_key("xpathFilesPath"):
        l.add_xpath('file_urls', self.config["xpathFilesPath"])
    yield l.load_item()

    # TODO: if the page links to a further image page, follow it with parse_page again
    if self.config.has_key("xpathNextImageUrl"):
        nextUrls = response.xpath(self.config["xpathNextImageUrl"])
        if len(nextUrls) > 0:
            nextPage = nextUrls.extract()[0]
            if not nextPage.startswith("http"):
                if nextPage.startswith("/"):
                    nextPage = response.url[0:response.url.index("/", 10) + 1] + nextPage
                else:
                    nextPage = response.url[0:response.url.rfind("/") + 1] + nextPage
            request = scrapy.Request(nextPage, callback=self.parse_page,
                                     cookies={'title': response.request.cookies['title']})
            yield request
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', response.request.cookies['title'])
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    return l.load_item()
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        for e in response.css(u'.catType>a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()

    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors
    )
def parse_item(self, response):
    loader = ItemLoader(EolZhiyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
    yield loader.load_item()
def parse(self, response):
    l = ItemLoader(item=PlantItem(), response=response)
    l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
    l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
    l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
    l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
    # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")
    return l.load_item()
def parse_item(self, response):
    il = ItemLoader(item=ImageItem(), response=response)
    il.add_css('image_urls', 'img::attr(src)')
    return il.load_item()
def parse_question(self, response):
    # Extract the question fields from the page and load them into a question item
    if "QuestionHeader-title" in response.text:
        # New page layout
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
    else:
        # Old page layout: extract the item with the old selectors
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

    # Request the answers for this question_id
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_song_list(self, response):
    selector = Selector(response)
    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name_list[index])
        l.add_value('title', title)
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
def parse_question(self, response):
    # Extract the question fields from the page and load them into a question item
    if "QuestionHeader-title" in response.text:
        # New page layout
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
    else:
        # Old page layout: extract the item with the old selectors
        match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)

    # Initialize item loader
    # extract news title, published_at, author, content, url
    # Required: title, raw_content, published_at
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1.detailtitle::text')
    if not title_selectors:
        # If error, drop from the item pipeline
        return loader.load_item()
    title = title_selectors.extract_first().strip()
    loader.add_value('title', title)

    # Parse date information
    date_time = response.css('body > div > div.container > div.page-header > div::text').extract_first().strip()
    date_time = date_time.split(',')[-1].strip()
    date_time = ' '.join([_(w) for w in date_time.split(' ')])  # October => Oktober
    try:
        published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
    except ValueError:
        # If error, drop from the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # If multipage
    multipage_selectors = response.css('.newsPagingWrap > a')
    if multipage_selectors:
        return self.parse_indices(multipage_selectors, loader)

    # Else if not multipage
    author_name_selectors = response.css('.newsContent > p > strong::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[-1].strip()
        loader.add_value('author_name', author_name)

    # Extract the news content
    raw_content_selectors = response.css('.newsContent > p')
    if not raw_content_selectors:
        # Drop from the item pipeline
        return loader.load_item()
    raw_content = ' '.join(raw_content_selectors.extract())
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    parsed_news = json.loads(str(response.body))[0]

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])

    if not parsed_news['title']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    # Convert HTML text to a scrapy response
    html_response = HtmlResponse(url=parsed_news['url'],
                                 body=parsed_news['content'].encode('utf-8', 'ignore'))
    xpath_query = '''
        //body/node()
            [not(descendant-or-self::comment()|
                descendant-or-self::style|
                descendant-or-self::script|
                descendant-or-self::div|
                descendant-or-self::span|
                descendant-or-self::image|
                descendant-or-self::img|
                descendant-or-self::iframe
            )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    if not parsed_news['published']:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information
    # Example: 12 Oct 2016 - 05:25
    date_time_str = ' '.join([_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %b %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    if not parsed_news['author']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', parsed_news['author'])

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news_metro(self, response):
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        return self.parse_news_pilkada(loader, response)
    try:
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)

    if self.media['last_scraped_at'] >= published_at:
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])

    # Select all p which don't have an iframe inside
    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()

    # Go to the next page while there is a next page button
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content: self.parse_next_page_metro(x, loader, raw_content))

    loader.add_value('raw_content', raw_content)

    # The author name is usually inside a <strong> tag; however, some articles do not use it.
    # NOTE: this block of code may need revision in the future
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace() or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)

    author_name_selectors = response.css('a[rel="author"] > span::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)

    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information
    # Example: Selasa, 6 Oktober 2015 - 05:23 WIB
    date_time_str = date_time_str_selectors.extract()[0]
    date_time_str = date_time_str.split(',')[1].strip()[:-4]
    date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # Move scraped news to pipeline
    return loader.load_item()
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)

    try:
        url = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['MoreLink']
    except KeyError:
        return loader.load_item()
    loader.add_value('url', url)

    try:
        title = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['NewsLines']['HeadLine']
    except KeyError:
        return loader.load_item()
    if not title:
        return loader.load_item()
    loader.add_value('title', title)

    try:
        raw_content = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['NewsComponent']['ContentItem']['DataContent']['nitf']['body']['body.content']['p']
    except KeyError:
        return loader.load_item()
    if not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)

    try:
        author_name = json_response['NewsML']['NewsItem']['NewsComponent']['NewsComponent']['Author']
    except KeyError:
        return loader.load_item()
    if not author_name:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', author_name)

    try:
        date_time_str = json_response['NewsML']['NewsItem']['NewsManagement']['FirstCreated']
    except KeyError:
        return loader.load_item()
    if not date_time_str:
        return loader.load_item()

    date_time_str = date_time_str.split('T')
    date_time_str[1] = '0' * (6 - len(date_time_str[1])) + date_time_str[1]
    try:
        published_at_wib = datetime.strptime(' '.join(date_time_str), '%Y%m%d %H%M%S')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    return loader.load_item()
def parse_item(self, response):
    loader = ItemLoader(GaokaopaiZhuanyeItem(), response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=ur'?(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()')

    def parse_related():
        for e in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
            yield {
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        category = []
        for i in [u"????", u"????", u"????"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            category.append({
                'url': e.css('::attr(href)').extract_first(),
                'code': e.css('::attr(href)').re_first(ur'/zhuanye([-0-9]*)\.html').strip('-'),
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()

    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )
def parse_item(self, response):
    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'?????'),
            'environment': get_vote(u'???????'),
            'life': get_vote(u'?????'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'

        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors

        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()

    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
def parse_item(self, response): """ Parse a response into a DocumentItem. """ doc_loader = ItemLoader(item=DocumentItem(), response=response) doc_loader.add_value('url', response.url) doc_loader.add_xpath('meta', '//meta[@name=\'description\']/@content') doc_loader.add_value('domain', urlparse(response.url).hostname) doc_loader.add_xpath('title', '//title/text()') hxs = HtmlXPathSelector(response) # For HTML extractions # Extract links # For each link on this page links = [] a_links = hxs.xpath('//a') for link in a_links: link_obj = {} # Extract the link's URL link_str = " ".join(link.xpath('@href').extract()) link_obj['link'] = link_str.replace("\n", "") # Extract the links value link_name_str = " ".join(link.xpath('text()').extract()) link_name_str = link_name_str.replace("\n", "") link_name_str = link_name_str.lstrip() link_name_str = link_name_str.rstrip() link_obj['link_name'] = link_name_str links.append(link_obj) doc_loader.add_value('links', links) # Populate text field title_list = hxs.xpath('//title/text()').extract() title = ' '.join(title_list) body_text = self.html2string(response) text = title + " " + body_text doc_loader.add_value('content', text) doc_loader.add_value('raw_text', text) doc_loader.add_value('raw_title', title) doc_loader.add_value('raw_url', response.url) h1_list = hxs.xpath("//h1/text()").extract() doc_loader.add_value('h1', " ".join(h1_list)) doc_loader.add_value('content_type', response.headers['Content-type']) doc_loader.add_value('updated_on', datetime.datetime.now().strftime( "%Y-%m-%dT%H:%M:%S")) item = doc_loader.load_item() return item
def parse_item(self, response): """ Extract fields from the individual email page and load them into the item. @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html @returns items 1 1 @scrapes senderName senderEmail timeSent timeReceived subject body @scrapes replyto url """ load = ItemLoader(item=Email(), selector=response) # Take care of easy fields first load.add_value('url', response.url) pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]' pattern_replyto += '/a/@href' link = response.xpath(pattern_replyto).extract() link = [''] if not link else link load.add_value('replyto', link[0]) # Sometime in 2003, the archive changes and the email pages # require specific procedure to extract the following fields: specific_fields = { 'senderName': None, 'senderEmail': None, 'timeSent': None, 'timeReceived': None, 'subject': None } # Detect new archive system with HTML comment new_system = response.xpath('/comment()[1][contains(., "MHonArc")]') if len(new_system) >= 1: # If new archive system is detected... specific_fields = self.parse_new_system(response, specific_fields) body_before_comment = '<!--X-Body-of-Message-->' body_after_comment = '<!--X-Body-of-Message-End-->' else: # Otherwise... specific_fields = self.parse_old_system(response, specific_fields) body_before_comment = '<!-- body="start" -->' body_after_comment = '<!-- body="end" -->' # Load all the values from these specific fields for key, val in specific_fields.items(): load.add_value(key, val) if self.get_body: # Final field, the body of the email pattern_body = body_before_comment + '\n?(.*)' + body_after_comment # Ignore invalid bytes when necessary page_body = response.body.decode('utf-8', 'ignore') body = re.search(pattern_body, page_body, flags=re.S) load.add_value('body', body.group(1)) return load.load_item()