当前位置: 首页 > 后端技术 > Node.js

新手 Node 爬虫

时间:2023-04-03 23:20:39 Node.js

前言:使用爬虫可以做很多事情,查找信息,分享数据,挖掘用户和粉丝。既然前端有了node,就不要再羡慕其他后端同学了。目标是找到成千上万的知乎用户,在其中找到女孩的头像并下载。准备工作:1.项目初始化,执行 npm init 初始化 package.json 2.npm install request --save 3.npm install async --save 文件中需要引入的package:var request=require('request');var fs=require('fs');var async=require('async'); 爬虫的思路:爬虫有两种类型。第一种是使用cheerio模块获取html节点,从节点中获取数据,比较适合直接由后端语言渲染的页面。第二种是通过调用目标网站的接口来请求数据。先登录知乎,然后通过F12打开控制台。找到一个有follower的用户,打开他的followlist,可以在XHR中找到获取followlist的API。我们要做的就是通过这个api获取大量的用户数据,然后对用户信息进行处理,找到我们想要的数据。开始:模拟登录,找到API var url="https://www.zhihu.com/api/v4/members/ni-ba-tie-ren/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20"; 直接请求上面的链接,会返回如下信息,不会有返回数据 {"message":"Identity has not been verified","code":100,"name":"AuthenticationException"} 把登录后的授权信息添加到请求头,请求成功。"authorization":"Bearer Mi4wQUJETTJlanBOQWtBVU1LcDk2QVlDeGNBQUFCaEFsVk5neWJhV0FCWXJueEs2bjJwcUYwdzBTdmVpYmxVS1hmWkl3|1488100019|625fed8bf4dee0970f731c7ecfba9f1886ca4a5b" 获取大量的用户数据:var url="https://www.zhihu.com/api/v4/members/ni-ba-tie-ren/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";var zurl="https://www.zhihu.com/api/v4/members/demouser/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";/* request请求的options */var options={url:url,headers:{"authorization":"Bearer Mi4wQUJETTJlanBOQWtBVU1LcDk2QVlDeGNBQUFCaEFsVk5neWJhV0FCWXJueEs2bjJwcUYwdzBTdmVpYmxVS1hmWkl3|1488100019|625fed8bf4dee0970f731c7ecfba9f1886ca4a5b"}} function getDataList(url){options.url=url;request.get(options,function(error,response,body){if(!error&&response.statusCode==200){var response=JSON.parse(response.body);var zhList=response.data;zhList.forEach(function(item){/* item.gender==0 性别判断 */if(item.gender==0){console.log(`爬取${ite
m.avatar_url}`);users.push({"name":item.name,"img":item.avatar_url.replace("_is",""),"url_token":item.url_token})}})/* is_end 当前用户关注的用户是否在最后一页 */if(response.paging.is_end){/* 这里判断抓取的条数 */if(users.length>=1000){console.log(`爬取完成`);downLoadContent(JSON.stringify(users));return;}else{console.log(`用户数据${i+1}`);getDataList(zurl.replace("demouser",users[i].url_token));i++;}}else{if(users.length>=1000){console.log(`抓取完成`);downLoadContent(JSON.stringify(users));return;}getDataList(response.paging.next);}}})} 下载并保存数据到data.js:function downLoadContent(cont){fs.appendFile('./'+'data.js',"module.exports="+cont,'utf-8',function(err){if(err){console.log(err);}else console.log('success');});} 下载图片:在下载图片之前,先使用eyekey提供的人脸识别API。之前得到的数据都是item.gender==0,但是不代表他们的头像都是女生头像。所以,为了提高下载图片的质量,还是先调用eyekey的接口进行识别比较好。#### 识别API:var eyeUrl="http://api.eyekey.com/face/Check/checking";var options={"app_id":"f89ae61fd63d4a63842277e9144a6bd2","app_key":"af1cd33549c54b27ae24aeb041865da2","url":"https://pic4.zhimg.com/43fda2d268bd17c561ab94d3cb8c80eb.jpg"} function face(item){options.url=item.img;request.post({url:eyeUrl,form:options},function(error,response,body){if(!error&&response.statusCode==200){var data=JSON.parse(body);try{if(data.face[0].attribute.gender=='Female'){console.log(`Downloading${item.img}`);downLoadImg(item)}}catch(e){console.log(`验证失败${item.img}~`);}}})} 下载图片方法:function downLoadImg(image){request.head(image.img,function(err,res,body){if(err){console.log(err);}});request(image.img).pipe(fs.createWriteStream('./image/'+image.name+Date.now()+'.'+image.img.substring(image.img.lastIndexOf(".")+1,image.img.length)));} 开始下载:function startDownLoad(imgdata){/* 控制并发,5以内 */async.eachLimit(imgdata,3,function(item,callback){face(item);callback();},function(err){if(err){console.log(err);}else{console.log('success!');}});} 1.getDataList(url) # 获取数据入口并保存数据到file 2.startDownLoad(imgdata) # 下载图片数据的入口,把数据拿出来分享下载 3.在命令行下依次执行这两个文件...... 执行结果 作者信息:原作者是力扑速LeapCloudTeam_UX成员:牛芳[原创] 相关文章推荐:多屏交互-H5中级高级前端,说我爱你不容易!