从“如何评价X”话题下开始抓取问题,然后抓取各问题页上的相关问题,如此循环。
对于每个问题,抓取标题、关注人数、回答数等数据。
zhihuTopicSpider.py
# -*- coding: utf-8 -*-
import scrapy
import os
import time
import re
import json
from ..items import zhihuQuestionItem
# Network mode: 1 = tencent (requests go through the web-proxy.oa.com proxy),
# 2 = free (no proxy is configured).
mode = 2
if mode == 1:
    proxy = "https://web-proxy.oa.com:8080"
else:
    proxy = ""
# Account used for the e-mail login; fill in real values before running.
email = "youremail"
password = "yourpassword"
class zhihu_topicSpider(scrapy.Spider):
    """Crawl question statistics from Zhihu, starting at one topic page.

    Request chain: home page (to read the ``_xsrf`` token) -> captcha image
    (typed in by the operator) -> e-mail login -> topic 19760570 -> question
    pages.  Every question page yields one ``zhihuQuestionItem`` and schedules
    the question links found on that page, so the crawl keeps expanding.
    """

    name = 'zhihu_topicSpider'
    zhihu_url = "https://www.zhihu.com"
    login_url = "https://www.zhihu.com/login/email"
    topic = "https://www.zhihu.com/topic"
    domain = "https://www.zhihu.com"
    # Headers sent with every request.
    headers_dict = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    def _session_meta(self, response, **extra):
        """Build request meta that reuses the proxy and cookie jar of *response*."""
        meta = {
            "proxy": proxy,
            "cookiejar": response.meta["cookiejar"],
        }
        meta.update(extra)
        return meta

    @staticmethod
    def _first_three_tags(raw_tags, filler='-'):
        """Return exactly three stripped tags, padding missing ones with *filler*."""
        tags = [tag.strip() for tag in raw_tags[:3]]
        return tags + [filler] * (3 - len(tags))

    def start_requests(self):
        """Open the home page; cookie jar ``1`` is used for the whole session."""
        yield scrapy.Request(
            url=self.zhihu_url,
            headers=self.headers_dict,
            meta={
                "proxy": proxy,
                "cookiejar": 1
            },
            callback=self.request_captcha
        )

    def request_captcha(self, response):
        """Read the ``_xsrf`` token from the home page and request a captcha."""
        # Raises IndexError when the page carries no _xsrf input.
        _xsrf = response.css('input[name="_xsrf"]::attr(value)').extract()[0]
        # Integer milliseconds: str() of the raw float would be rendered in
        # exponent notation on Python 2 (e.g. "1.45e+12").
        captcha_url = "http://www.zhihu.com/captcha.gif?r=" + str(int(time.time() * 1000))
        yield scrapy.Request(
            url=captcha_url,
            headers=self.headers_dict,
            meta=self._session_meta(response, _xsrf=_xsrf),
            callback=self.download_captcha
        )

    def download_captcha(self, response):
        """Save the captcha, ask the operator for its text and submit the login form."""
        with open("captcha.gif", "wb") as fp:
            fp.write(response.body)
        # Show the image to the operator; the `open` command is platform specific.
        os.system('open captcha.gif')
        print("请输入验证码:\n")
        try:
            read_line = raw_input
        except NameError:
            # raw_input is not defined on this interpreter; use input instead.
            read_line = input
        captcha = read_line()
        yield scrapy.FormRequest(
            url=self.login_url,
            headers=self.headers_dict,
            formdata={
                "email": email,
                "password": password,
                "_xsrf": response.meta["_xsrf"],
                "remember_me": "true",
                "captcha": captcha
            },
            meta=self._session_meta(response),
            callback=self.request_zhihu
        )

    def request_zhihu(self, response):
        """
        Logged in at this point: request the starting topic page.
        """
        yield scrapy.Request(url=self.topic + '/19760570',
                             headers=self.headers_dict,
                             meta=self._session_meta(response),
                             callback=self.get_topic_question,
                             dont_filter=True)

    def get_topic_question(self, response):
        """Schedule the question pages linked from the topic page."""
        question_urls = response.css(".question_link[target=_blank]::attr(href)").extract()
        # Only every third link (indices 2, 5, 8, ...) is followed, exactly as
        # the original index arithmetic did.
        # NOTE(review): presumably the selector matches each question several
        # times in the markup -- confirm against the live page.
        for url in question_urls[2::3]:
            yield scrapy.Request(url=self.zhihu_url + url,
                                 headers=self.headers_dict,
                                 meta=self._session_meta(response),
                                 callback=self.parse_question_data)

    def parse_question_data(self, response):
        """Yield one item for this question page, then follow its question links."""
        item = zhihuQuestionItem()
        # First run of digits in the URL is taken as the question id.
        item["qid"] = re.search(r'\d+', response.url).group()
        item["title"] = response.css(".zm-item-title::text").extract()[0].strip()
        item["answers_num"] = response.css("h3::attr(data-num)").extract()[0]
        question_nums = response.css(".zm-side-section-inner .zg-gray-normal strong::text").extract()
        item["followers_num"] = question_nums[0]
        item["visitsCount"] = question_nums[1]
        item["topic_views"] = question_nums[2]
        # Always fill all three tag fields.  A page without any tag used to
        # leave them unset, which made consumers reading item["topic_tag0"]
        # fail with KeyError.
        topic_tags = self._first_three_tags(response.css(".zm-item-tag::text").extract())
        item["topic_tag0"], item["topic_tag1"], item["topic_tag2"] = topic_tags
        question_links = response.css(".question_link::attr(href)").extract()
        yield item
        for url in question_links:
            yield scrapy.Request(url=self.zhihu_url + url,
                                 headers=self.headers_dict,
                                 meta=self._session_meta(response),
                                 callback=self.parse_question_data)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# import json
import MySQLdb
# class JsonDumpPipeline(object):
# def process_item(self, item, spider):
# with open('d.json', 'a') as fp:
# fp.write(json.dumps(dict(item), ensure_ascii = False).encode("utf-8") + '\n')
class MySQLPipeline(object):
    """Item pipeline that stores scraped question items in a MySQL table.

    ``open_spider`` connects and creates the ``questions`` table if needed,
    ``process_item`` inserts one row per item and commits in batches, and
    ``close_spider`` commits the remaining rows and closes the connection.
    """

    # Printed once, when the class body is executed.
    print("\n\n\n\n\n\n\n\n")
    # Placeholders are left unquoted: the values are passed separately to
    # cursor.execute() so the driver escapes them.  Formatting them into the
    # string broke on any value containing a quote and allowed SQL injection.
    sql_questions = (
        "INSERT INTO questions("
        "qid, title, answers_num, followers_num, visitsCount, topic_views, topic_tag0, topic_tag1, topic_tag2) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    # Number of rows inserted so far.
    count = 0
    # A commit is issued after every `commit_every` inserted rows.
    commit_every = 10

    def open_spider(self, spider):
        """Connect to MySQL, switch the session to utf8 and ensure the table exists."""
        # NOTE(review): credentials are hard-coded; consider moving them to settings.
        host = "localhost"
        user = "root"
        password = "wangqi"
        dbname = "zh"
        self.conn = MySQLdb.connect(host, user, password, dbname)
        self.cursor = self.conn.cursor()
        self.conn.set_character_set('utf8')
        self.cursor.execute('SET NAMES utf8;')
        self.cursor.execute('SET CHARACTER SET utf8;')
        self.cursor.execute('SET character_set_connection=utf8;')
        print("\n\nMYSQL DB CURSOR INIT SUCCESS!!\n\n")
        sql = (
            "CREATE TABLE IF NOT EXISTS questions ("
            "qid VARCHAR (100) NOT NULL,"
            "title varchar(100),"
            "answers_num INT(11),"
            "followers_num INT(11) NOT NULL,"
            "visitsCount INT(11),"
            "topic_views INT(11),"
            "topic_tag0 VARCHAR (600),"
            "topic_tag1 VARCHAR (600),"
            "topic_tag2 VARCHAR (600),"
            "PRIMARY KEY (qid)"
            ")")
        self.cursor.execute(sql)
        print("\n\nTABLES ARE READY!\n\n")

    def process_item(self, item, spider):
        """Insert *item* into ``questions`` and hand it on to later pipelines.

        Returns the item unchanged; Scrapy passes the return value to the
        next pipeline stage, so returning nothing would drop it.
        """
        params = (
            item["qid"], item["title"], item["answers_num"], item["followers_num"],
            item["visitsCount"], item["topic_views"],
            # Tag fields may be absent on items built without tags.
            item.get("topic_tag0", '-'),
            item.get("topic_tag1", '-'),
            item.get("topic_tag2", '-'),
        )
        self.cursor.execute(self.sql_questions, params)
        self.count += 1
        if self.count % self.commit_every == 0:
            self.conn.commit()
        print(item["qid"] + " DATA COLLECTED!")
        return item

    def close_spider(self, spider):
        """Commit rows inserted since the last batch commit and release the connection."""
        try:
            self.conn.commit()
        finally:
            self.cursor.close()
            self.conn.close()
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field
class zhihuQuestionItem(scrapy.Item):
    """Fields collected for a single question."""

    # Identity of the question.
    qid = scrapy.Field()
    title = scrapy.Field()

    # Numeric counters.
    followers_num = scrapy.Field()
    answers_num = scrapy.Field()
    visitsCount = scrapy.Field()
    topic_views = scrapy.Field()

    # Up to three topic tags.
    topic_tag0 = scrapy.Field()
    topic_tag1 = scrapy.Field()
    topic_tag2 = scrapy.Field()