BeautifulSoup和json库在爬虫项目中的应用

549 查看

重构人人贷爬虫的过程中，主要要爬取的数据是以 JSON 格式呈现的，要提取的 HTML 内容如下：

<script id="credit-info-data" type="text/x-json">
{
    "data": {
        "creditInfo": {
            "account": "INVALID", 
            "album": "INVALID", 
            "borrowStudy": "VALID", 
            "car": "INVALID", 
            "child": "INVALID", 
            "credit": "FAILED", 
            "creditInfoId": 499250, 
            "detailInformation": "VALID", 
            "fieldAudit": "INVALID", 
            "graduation": "PENDING", 
            "house": "INVALID", 
            "identification": "VALID", 
            "identificationScanning": "VALID", 
            "incomeDuty": "PENDING", 
            "kaixin": "INVALID", 
            "lastUpdateTime": "Aug 1, 2014 12:00:00 AM", 
            "marriage": "VALID", 
            "mobile": "VALID", 
            "mobileAuth": "INVALID", 
            "mobileReceipt": "INVALID", 
            "other": "INVALID", 
            "renren": "INVALID", 
            "residence": "VALID", 
            "titles": "INVALID", 
            "user": 503971, 
            "version": 24, 
            "video": "PENDING", 
            "work": "OVERDUE"
        }, 
        "creditPassedTime": {
            "creditPassedTimeId": 499214, 
            "detailInfomation": "Nov 19, 2013 10:57:21 PM", 
            "identification": "Nov 19, 2013 3:14:27 PM", 
            "identificationScanning": "Nov 21, 2013 11:36:55 AM", 
            "lastUpdateTime": "Aug 1, 2014 12:00:00 AM", 
            "marriage": "Nov 21, 2013 11:37:32 AM", 
            "mobile": "Nov 19, 2013 3:10:53 PM", 
            "residence": "Nov 21, 2013 11:37:44 AM", 
            "user": 503971, 
            "work": "Nov 21, 2013 11:37:25 AM"
        }, 
        "loan": {
            "address": "\u5c71\u4e1c", 
            "allProtected": false, 
            "allowAccess": true, 
            "amount": 30000.0, 
            "amountPerShare": 50.0, 
            "avatar": "", 
            "borrowType": "\u8d2d\u8f66\u501f\u6b3e", 
            "borrowerId": 503971, 
            "borrowerLevel": "HR", 
            "currentIsRepaid": false, 
            "description": "\u672c\u4eba\u662f\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u8d44\u7a33\u5b9a\uff0c\u73b0\u5728\u4e70\u8f66\u5411\u5927\u5bb6\u501f\u6b3e\uff0c\u6bcf\u6708\u53d1\u5de5\u8d44\u6309\u65f6\u5f52\u8fd8\u3002", 
            "displayLoanType": "XYRZ", 
            "finishedRatio": 0.0, 
            "forbidComment": false, 
            "interest": 22.0, 
            "interestPerShare": 0.0, 
            "jobType": "\u5de5\u85aa\u9636\u5c42", 
            "leftMonths": 24, 
            "loanId": 123456, 
            "loanType": "DEBX", 
            "monthlyMinInterest": "[{\"month\":\"3\",\"minInterest\":\"10\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"6\",\"minInterest\":\"11\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"9\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"12\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"15\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"18\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"24\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"36\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0}]", 
            "months": 24, 
            "nickName": "sdcsqk", 
            "oldLoan": false, 
            "openTime": "Nov 19, 2013 9:11:48 PM", 
            "overDued": false, 
            "picture": "", 
            "principal": 0.0, 
            "productId": 7, 
            "productName": "HR", 
            "repaidByGuarantor": false, 
            "repayType": "MONTH", 
            "startTime": "Dec 19, 2013 9:11:48 PM", 
            "status": "FAILED", 
            "surplusAmount": 30000.0, 
            "title": "\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u4f5c\u7a33\u5b9a\u6309\u65f6\u5f52\u8fd8!", 
            "utmSource": "from-website", 
            "verifyState": "CANCEL"
        }
    }, 
    "status": 0
}</script>

在之前的版本中，应用了 re 进行简单粗暴的正则匹配，效率较低。因此在重构过程中，将使用 BS4（BeautifulSoup 4）对这个标签进行提取，之后应用 json 库将 string 转为 dict，便于后面的调用和输出。

下面简单介绍一下应用到的方法:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Fetch a loan detail page and pretty-print its embedded credit-info JSON.

The page carries its data inside
``<script id="credit-info-data" type="text/x-json">``.  This script asks
for a loan id, downloads the page with requests, pulls that tag out with
BeautifulSoup, and converts its contents to a dict with the json module.
"""

__author__ = 'Gao Yuhao'

import json

import requests
from bs4 import BeautifulSoup

# Unify Python 2 and 3: on Python 2 rebind input() to raw_input() so the
# answer is returned as a plain string instead of being evaluated.
# raw_input does not exist on Python 3, which raises NameError there.
try:
    input = raw_input
except NameError:
    pass

# Choose the page to test the crawler against.
page_index = input('Pls input the page_index you want to try:').strip()
surl = 'http://www.we.com/lend/detailPage.action?loanId=' + page_index

# Fetch the page with requests.  The timeout keeps the script from hanging
# forever on an unresponsive server; raise_for_status() turns an HTTP error
# response into an exception instead of parsing an error page.
req = requests.get(url=surl, timeout=10)
req.raise_for_status()
# req.text is already decoded, so hand it to BeautifulSoup as-is rather
# than re-encoding it and making the parser guess the encoding again.
html = req.text

# Extract the tag with BeautifulSoup.  The parser is named explicitly so
# the result does not depend on which optional parsers are installed.
soup = BeautifulSoup(html, 'html.parser')
tag = soup.find('script', id='credit-info-data')
if tag is None or not tag.string:
    # Missing or empty tag (e.g. unknown loan id, or a login page was served).
    raise SystemExit('No credit-info-data JSON found at ' + surl)
res = tag.string

# Convert the JSON string to a dict and pretty-print it.
res_json = json.loads(res)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(json.dumps(res_json, indent=4))