本文主要是对上一篇文章的简单仿写,大家以后想批量下载什么图片,照着这个格式仿写就好。由于本人是TFBOYS的粉丝,平常没事爱逛贴吧欣赏我家三小只的美图,所以这次就以贴吧[小王的讨论楼]为例,批量爬取该楼的图片[1]
items.py编写
import scrapy
class WangyuantuItem(scrapy.Item):
    """Scraped item holding the image URLs found on one page."""

    # The only field required: the image URLs extracted by the spider.
    image_urls = scrapy.Field()
spider的编写
import scrapy
import requests
import os
from wangyuantu.items import WangyuantuItem
class XiaowangSpider(scrapy.Spider):
    """Spider that collects post-image URLs from one Baidu Tieba thread.

    It requests pages 21 through 44 of thread 3888309273 and yields one
    ``WangyuantuItem`` per page, carrying the ``src`` of every
    ``<img class="BDE_Image">`` element found on that page.
    """

    name = "xiaowang"
    # Host names only. An entry that includes a URL path can never match a
    # request's host, so any follow-up request would be treated as offsite.
    allowed_domains = ["tieba.baidu.com"]
    start_urls = [
        'http://tieba.baidu.com/p/3888309273?pn=%d' % i for i in range(21, 45)
    ]

    def parse(self, response):
        """Yield a single item with all post-image URLs on the page.

        :param response: the downloaded thread page.
        """
        item = WangyuantuItem()
        item['image_urls'] = response.xpath(
            "//img[@class='BDE_Image']/@src"
        ).extract()
        yield item
pipelines编写:这个部分都是可以套用的
import requests
from wangyuantu import settings
import os
#图片下载类
class ImageDownloadPipeline(object):
    """Item pipeline that saves every URL in ``item['image_urls']`` to disk.

    Files are stored under ``<settings.IMAGES_STORE>/<spider.name>/``. A file
    that already exists on disk is not downloaded again.
    """

    def process_item(self, item, spider):
        """Download the item's images and pass the item on.

        :param item: the scraped item; only its ``image_urls`` entry is used.
        :param spider: the spider that produced the item; its ``name`` is
            used as the sub-directory name.
        :returns: the same ``item``, so later pipelines keep receiving it.
        """
        # Nothing to download for items without image URLs.
        if 'image_urls' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['image_urls']:
            # Drop the "scheme://host" part and flatten the remaining URL
            # path segments into a single file name.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)
            if os.path.exists(file_path):
                continue  # already downloaded

            # Start the request before creating the file: if the request
            # raises, no empty file is left behind to be skipped by the
            # exists-check on the next run.
            response = requests.get(image_url, stream=True)
            with open(file_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)

        return item
settings编写
# Scrapy project settings for the "wangyuantu" project.
BOT_NAME = 'wangyuantu'

SPIDER_MODULES = ['wangyuantu.spiders']
NEWSPIDER_MODULE = 'wangyuantu.spiders'

# Enable the image download pipeline; the integer is its order value.
ITEM_PIPELINES = {'wangyuantu.pipelines.ImageDownloadPipeline': 1}

# Directory where downloaded images are stored.
# Raw string: in a normal Python 3 string literal "\U" starts a unicode
# escape, so the un-prefixed form of this Windows path is a SyntaxError.
IMAGES_STORE = r'C:\Users\Lenovo\Pictures'