知乎爬虫

本文介绍了一个使用Node.js实现的知乎图片爬虫程序,该程序能够抓取指定问题下的答案中包含的所有图片,并将这些图片下载到本地指定的文件夹内。通过设置合适的headers和cookie,程序能够成功绕过知乎的部分反爬机制。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

let superagent = require('superagent');
let cookie = require('cookie');
let fs = require('fs')
const request = require("request")
const cheerio = require("cheerio")
const url = require("url");
const qs = require('querystring');
const path = require("path")
let cookies = '';

// ID of the Zhihu question whose answers are crawled.
const question = 30087454;
// Your own login cookie values (fill in the part after the `+`).
// Each fragment keeps its leading space because getPic appends these
// fragments to the cookie string collected from the homepage response.
const meticket = ' capsion_ticket=' + '';
const meCookie = ' z_c0=' + '';
// Folder (relative to the working directory) that receives the images.
const background = 'background';


// Headers used for the initial homepage request that collects cookies.
const header = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36',
    "Referer": "http://www.zhihu.com/",
    'Host': 'www.zhihu.com'
};
// Question page URL, sent as the Referer of every API request.
// Zhihu question pages live under /question/<id> (singular).
const uri = `https://www.zhihu.com/question/${question}`;
// Fields the answers API is asked to include in each answer object.
const include = "data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics";
// URL of the first page of answers. `include` contains reserved characters
// (; , ? = [ ]) so it is percent-encoded before being placed in the query.
const next = `https://www.zhihu.com/api/v4/questions/${question}/answers?sort_by=default&include=${encodeURIComponent(include)}&limit=20&offset=0`;
/**
 * Request one page of the answers API and pass the result to `callback`.
 *
 * @param {string} next - Full URL of the API page to fetch.
 */
function getPic(next) {
    // Cookie pairs must be separated by "; ". The fragments are trimmed and
    // joined explicitly, because plain concatenation would glue them together
    // with spaces only and produce a malformed Cookie header.
    const cookieHeader = [cookies, meCookie, meticket]
        .map((part) => String(part).trim())
        .filter((part) => part.length > 0)
        .join('; ');

    request({
        url: next,
        headers: {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36",
            "Host": "www.zhihu.com",
            "Referer": uri,
            "cookie": cookieHeader
        }
    }, callback);
}

/**
 * Handle one page of the answers API: queue a download for every image
 * found in the answers, then request the following page if there is one.
 *
 * @param {?Error} error - Transport error reported by `request`, if any.
 * @param {object} response - Response object from `request` (unused).
 * @param {string} body - Raw response body, expected to be JSON.
 */
function callback(error, response, body) {
    if (error) {
        return console.log(error);
    }
    console.log(body);

    // The body may be an HTML error page instead of JSON; log the parse
    // failure and stop rather than throwing.
    let item;
    try {
        item = JSON.parse(body);
    } catch (parseError) {
        return console.log(parseError);
    }
    if (!item || !Array.isArray(item.data)) {
        return console.log('unexpected response: no "data" array');
    }

    // Collect the image URLs of every answer on this page.
    const imageUrls = [];
    item.data.forEach(function (answer) {
        const $ = cheerio.load(answer.content || '');
        $('img').each(function () {
            const imgUrl = $(this).attr('data-actualsrc');
            // Skip <img> tags without a usable data-actualsrc attribute.
            if (imgUrl) {
                imageUrls.push(imgUrl);
            }
        });
    });

    // Create the target folder once, synchronously, before any download
    // starts, and only when there is something to store.
    if (imageUrls.length > 0) {
        const targetDir = path.join(`./${background}/`);
        if (!fs.existsSync(targetDir)) {
            fs.mkdirSync(targetDir);
        }
    }

    imageUrls.forEach(function (imgUrl) {
        console.log(imgUrl);
        // The file name is the last segment of the URL path.
        const pathname = url.parse(imgUrl).pathname || '';
        const name = pathname.split('/').pop();
        if (!name) {
            return;
        }
        copy(imgUrl, `./${background}/` + name);
    });

    // Pagination: continue while the page was full (20 answers) and the
    // offset of the next page is still below the reported total.
    const nextUrl = item.paging && item.paging.next;
    if (!nextUrl) {
        return;
    }
    const offset = Number(qs.parse(url.parse(nextUrl).query).offset);
    if (item.data.length === 20 && offset < item.paging.totals) {
        console.log(offset);
        getPic(nextUrl);
    }
}

/**
 * Download a remote file and store it on disk.
 *
 * @param {string} form - URL of the file to download (the source).
 * @param {string} to - Destination file path; resolved against the
 *     current working directory.
 * @returns {fs.WriteStream} The destination stream, so callers can wait for
 *     its 'finish' or 'close' event if they need to.
 */
function copy(form, to) {
    const readStream = request(form);
    const writeStream = fs.createWriteStream(path.resolve(to));

    // A stream 'error' event without a listener is thrown and would take the
    // whole crawler down, so failures are logged per file instead.
    readStream.on('error', function (err) {
        console.log('download failed: ' + form, err);
        // pipe() does not close the destination when the source fails.
        writeStream.destroy();
    });
    writeStream.on('error', function (err) {
        console.log('write failed: ' + to, err);
    });
    writeStream.on("finish", function () {
        console.log('文件写入成功:' + to);
    });

    // pipe() ends the write stream itself once the source ends.
    readStream.pipe(writeStream);
    return writeStream;
}


// Entry point: request the Zhihu homepage to collect the cookies the server
// sets, store them in `cookies`, then start crawling from the first API page.
superagent
    .get('https://www.zhihu.com/')
    .set(header)
    .end(function (err, res) {
        if (err) {
            console.log(err);
        }
        // Without a response there are no headers to read; stop here
        // instead of failing on `res.headers`.
        if (!res) {
            return;
        }
        // The Set-Cookie header is absent when the server sets no cookies.
        const setCookie = (res.headers && res.headers["set-cookie"]) || [];
        // Keep only the leading "name=value" pair of each Set-Cookie line.
        cookies = setCookie
            .map(function (el) {
                return el.split(';')[0];
            })
            .join('; ');
        getPic(next);
    });

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值