学习网站:爬虫,整站爬取妹子图
1.item.py(定义爬取的内容)
import scrapy
class MeizituItem(scrapy.Item):
    """Container for one scraped gallery page."""

    # URL of the page the item was scraped from.
    url = scrapy.Field()
    # Gallery title text.
    name = scrapy.Field()
    # Raw tag markup taken from the post metadata block.
    tags = scrapy.Field()
    # Source URLs of the images found on the page.
    image_urls = scrapy.Field()
    # Local file paths, filled in by the download pipeline.
    images = scrapy.Field()
2.spider的编写
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
# Item Loaders 提供了一种便捷的方式来填充抓取到的 Items
from scrapy.contrib.loader import ItemLoader, Identity
from meizitu.items import MeizituItem
class MeiziSpider(scrapy.Spider):
    """Crawl meizitu.com: follow gallery links and paginate the index pages."""

    name = "meizi"
    allowed_domains = ["meizitu.com"]
    start_urls = (
        'http://www.meizitu.com/',
    )

    def parse(self, response):
        """Yield one request per gallery link, plus a request for the next index page."""
        # Follow every gallery link found under an <h2> heading.
        # urljoin handles relative hrefs; absolute URLs pass through unchanged.
        for link in response.xpath('//h2/a/@href').extract():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_item)

        # Collect the pagination hrefs of the index page.
        pages = response.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        # Use the spider logger instead of print so output respects LOG_LEVEL.
        self.logger.debug('pages: %s', pages)
        if len(pages) > 2:
            # The second-to-last entry is the "next page" link; strip the
            # leading '/a/' so it can be re-rooted below.
            page_link = pages[-2].replace('/a/', '')
            yield scrapy.Request('http://www.meizitu.com/a/%s' % page_link,
                                 callback=self.parse)

    def parse_item(self, response):
        """Populate a MeizituItem from a single gallery page."""
        l = ItemLoader(item=MeizituItem(), response=response)
        # Gallery title.
        l.add_xpath('name', '//h2/a/text()')
        # Post-metadata paragraph that carries the tags.
        l.add_xpath('tags', "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p")
        # Identity() keeps the full list of image URLs instead of collapsing it.
        l.add_xpath('image_urls', "//div[@id='picture']/p/img/@src", Identity())
        l.add_value('url', response.url)
        return l.load_item()
3.pipeline的编写(下载图片,新增图片)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#图片下载部分(自动增量)
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests
from meizitu import settings
import os
#图片下载类
class ImageDownloadPipeline(object):
    """Download each item's image_urls into <IMAGES_STORE>/<spider.name>/ (incremental)."""

    def process_item(self, item, spider):
        """Download the item's images and record their local paths.

        Items without an 'image_urls' field pass through unchanged.
        Files that already exist on disk are skipped (incremental crawl).
        """
        if 'image_urls' in item:
            images = []  # local paths of this item's images
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                # Build a flat file name from the URL path components
                # (everything after the scheme+host).
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue  # already downloaded on a previous run
                # Fetch before opening the file so a failed request does not
                # leave an empty file behind; the timeout avoids hanging, and
                # raise_for_status stops us from saving an HTML error page.
                response = requests.get(image_url, stream=True, timeout=30)
                response.raise_for_status()
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            item['images'] = images
        return item
4.settings
# Project identity and spider discovery paths.
BOT_NAME = 'meizitu'

SPIDER_MODULES = ['meizitu.spiders']
NEWSPIDER_MODULE = 'meizitu.spiders'

# Register the image-download pipeline (lower number = earlier in the chain).
ITEM_PIPELINES = {
    'meizitu.pipelines.ImageDownloadPipeline': 1,
}

# Root directory under which downloaded images are stored.
IMAGES_STORE = '.'