在重构人人贷爬虫的过程中,需要爬取的主要数据以 JSON 格式嵌在页面的 <script> 标签中,要提取的 HTML 内容如下:
<script id="credit-info-data" type="text/x-json">
{
"data": {
"creditInfo": {
"account": "INVALID",
"album": "INVALID",
"borrowStudy": "VALID",
"car": "INVALID",
"child": "INVALID",
"credit": "FAILED",
"creditInfoId": 499250,
"detailInformation": "VALID",
"fieldAudit": "INVALID",
"graduation": "PENDING",
"house": "INVALID",
"identification": "VALID",
"identificationScanning": "VALID",
"incomeDuty": "PENDING",
"kaixin": "INVALID",
"lastUpdateTime": "Aug 1, 2014 12:00:00 AM",
"marriage": "VALID",
"mobile": "VALID",
"mobileAuth": "INVALID",
"mobileReceipt": "INVALID",
"other": "INVALID",
"renren": "INVALID",
"residence": "VALID",
"titles": "INVALID",
"user": 503971,
"version": 24,
"video": "PENDING",
"work": "OVERDUE"
},
"creditPassedTime": {
"creditPassedTimeId": 499214,
"detailInfomation": "Nov 19, 2013 10:57:21 PM",
"identification": "Nov 19, 2013 3:14:27 PM",
"identificationScanning": "Nov 21, 2013 11:36:55 AM",
"lastUpdateTime": "Aug 1, 2014 12:00:00 AM",
"marriage": "Nov 21, 2013 11:37:32 AM",
"mobile": "Nov 19, 2013 3:10:53 PM",
"residence": "Nov 21, 2013 11:37:44 AM",
"user": 503971,
"work": "Nov 21, 2013 11:37:25 AM"
},
"loan": {
"address": "\u5c71\u4e1c",
"allProtected": false,
"allowAccess": true,
"amount": 30000.0,
"amountPerShare": 50.0,
"avatar": "",
"borrowType": "\u8d2d\u8f66\u501f\u6b3e",
"borrowerId": 503971,
"borrowerLevel": "HR",
"currentIsRepaid": false,
"description": "\u672c\u4eba\u662f\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u8d44\u7a33\u5b9a\uff0c\u73b0\u5728\u4e70\u8f66\u5411\u5927\u5bb6\u501f\u6b3e\uff0c\u6bcf\u6708\u53d1\u5de5\u8d44\u6309\u65f6\u5f52\u8fd8\u3002",
"displayLoanType": "XYRZ",
"finishedRatio": 0.0,
"forbidComment": false,
"interest": 22.0,
"interestPerShare": 0.0,
"jobType": "\u5de5\u85aa\u9636\u5c42",
"leftMonths": 24,
"loanId": 123456,
"loanType": "DEBX",
"monthlyMinInterest": "[{\"month\":\"3\",\"minInterest\":\"10\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"6\",\"minInterest\":\"11\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"9\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"12\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"15\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"18\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"24\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"36\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0}]",
"months": 24,
"nickName": "sdcsqk",
"oldLoan": false,
"openTime": "Nov 19, 2013 9:11:48 PM",
"overDued": false,
"picture": "",
"principal": 0.0,
"productId": 7,
"productName": "HR",
"repaidByGuarantor": false,
"repayType": "MONTH",
"startTime": "Dec 19, 2013 9:11:48 PM",
"status": "FAILED",
"surplusAmount": 30000.0,
"title": "\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u4f5c\u7a33\u5b9a\u6309\u65f6\u5f52\u8fd8!",
"utmSource": "from-website",
"verifyState": "CANCEL"
}
},
"status": 0
}</script>
在之前的版本中,使用 re 模块进行简单粗暴的正则匹配,效率较低;因此在重构过程中,改用 BeautifulSoup 4(BS4)提取这个 <script> 标签,再用 json 库将其中的字符串(string)转换为 dict,便于后面的调用和输出。
下面简单介绍一下应用到的方法:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Fetch one loan detail page and pretty-print the JSON document embedded in
# its <script id="credit-info-data"> tag.
__author__ = 'Gao Yuhao'

# Unify Python 2 and 3: on Python 2, rebind input to raw_input so the prompt
# returns the typed text as a string instead of evaluating it.
try:
    input = raw_input
except NameError:
    # Python 3 has no raw_input; its built-in input already returns a string.
    pass

import json

import requests
from bs4 import BeautifulSoup

# Ask which loan page to test the crawler against and build its URL.
page_index = input('Pls input the page_index you want to try:')
surl = 'http://www.we.com/lend/detailPage.action?loanId=' + page_index

# Download the page with requests; fail early on an HTTP error status rather
# than trying to parse an error page.
req = requests.get(url=surl)
req.raise_for_status()
# BeautifulSoup accepts decoded text directly, so no re-encoding is needed.
html = req.text

# Extract the target tag with BeautifulSoup. The parser is named explicitly so
# the result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
tag = soup.find('script', id='credit-info-data')
if tag is None:
    # Exit with a clear message instead of an opaque IndexError.
    raise SystemExit('No <script id="credit-info-data"> tag found in ' + surl)
res = tag.get_text()

# Convert the JSON string to a dict, then print it with a 4-space indent.
# print(...) with a single argument behaves the same on Python 2 and 3.
res_json = json.loads(res)
print(json.dumps(res_json, indent=4))
2025 - 快车库 - 我的知识库 重庆启连科技有限公司 渝ICP备16002641号-10
企客连连 表单助手 企服开发 榜单123