
Scrapy Images Downloading

scrapy

My spider runs without showing any errors, but the images are not being saved to the folder. Here are my scrapy files:

Spider.py:

import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["someurl.com"]
    start_urls = [
        "someurl.com"
]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item = ListResidentialItem()
        try:
            image_urls = map(unicode.strip, response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
            item['image_urls'] = [x for x in image_urls]
        except IndexError:
            item['image_urls'] = ''

        return item

settings.py:

from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'

CONCURRENT_REQUESTS = 250

DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}

items.py

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

    pass

My pipelines file is empty, and I'm not sure what I should add to the pipeline.py file.



1 Answer


What finally worked for me:

spider.py:

import scrapy
import re
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem
from production.items import ImageItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@idd="followclaslink"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseImages, meta={'item': item})

    def parseImages(self, response):
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            yield ImageItem(image_urls=[img_url])
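
One thing to watch in parseImages: the @src values can be relative, and the images pipeline only downloads absolute URLs. A small variant of the same method, sketched here using response.urljoin (a standard Scrapy Response method), would resolve them first; the selector and item class are unchanged from above:

class productionSpider(scrapy.Spider):
    # ... name / start_urls / parse exactly as above ...

    def parseImages(self, response):
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            if img_url:
                # Resolve relative src values against the page URL before
                # handing them to the images pipeline.
                yield ImageItem(image_urls=[response.urljoin(img_url)])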

Settings.py

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
IMAGES_STORE = '/Users/home/images'

DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# Disable cookies (enabled by default)
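
Note that this ITEM_PIPELINES entry enables Scrapy's stock ImagesPipeline, so the MyImagesPipeline subclass in pipelines.py (shown further down) is never actually invoked. If you want that subclass to run instead, a minimal settings sketch would be the following, assuming the project package is named production as in SPIDER_MODULES above:

# Only needed if the MyImagesPipeline subclass from pipelines.py should
# run instead of the stock ImagesPipeline.
ITEM_PIPELINES = {'production.pipelines.MyImagesPipeline': 1}
IMAGES_STORE = '/Users/home/images'  # the pipeline still needs a storage path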
items.py

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()
# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
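
If MyImagesPipeline is the pipeline you enable, note that item_completed() writes into item['image_paths'], and scrapy.Item raises a KeyError for fields that were never declared. A sketch of the extra field ImageItem would then need (everything else unchanged):

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()  # filled in by MyImagesPipeline.item_completed()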