# -*- coding: utf-8 -*-
"""Wallhaven toplist wallpaper scraper.

Reconstructed from a blog post: "This afternoon I wrote a Python crawler for
desktop wallpapers. It is simple — most such sites have no anti-scraping
measures."  Rough flow: read the max page number, ask the user for a page
range, build the page URLs, then for every page visit each wallpaper detail
page and download the full-resolution image.  The author notes the retry
sleep (30 s) can be shortened or removed if it feels too slow.

Fixes applied during reconstruction:
- restored Python keywords that had been machine-translated to Chinese
  (except / as / print / if) — the original text was not valid Python;
- `makelink`: digit regex only matched single digits, so ranges like
  '1-10' were mis-parsed and a single number crashed with IndexError;
- download helpers: when all retries failed, `bgimg` was unbound and the
  file write raised NameError — now the file is simply skipped;
- network-only third-party imports (requests, lxml) moved into the
  functions that use them so the module imports without them;
- scraping no longer runs on import (guarded by __main__).
"""
import re
import time

# Base listing URL; makelink() appends '?page=N' to it.
url = 'https://wallhaven.cc/toplist'
# Desktop-Chrome user agent; the site serves normal HTML to it.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/88.0.4324.104 Safari/537.36'
}


def getmaxlistnum():
    """Scrape page 2 of the toplist and return the total page count as a string.

    The header '<h2>' on any listing page contains "<current> / <total>";
    this pulls the last number out of that text.
    """
    import requests
    from lxml import etree

    probe = 'https://wallhaven.cc/toplist?page=2'
    r = requests.get(probe, headers=headers)
    tree = etree.HTML(r.content.decode('utf8'))
    # text()[2] is the " / <total>" part of the pagination header.
    header_text = tree.xpath('/html/body/main/div[1]/section/header/h2/text()[2]').pop()
    return re.findall('([0-9]{1,4})', header_text).pop()


def _download(endlink):
    """GET *endlink* with up to 3 attempts; return the bytes or None on failure.

    Sleeps 30 s between attempts (the blog notes this can be shortened).
    """
    import requests

    for _attempt in range(3):
        try:
            return requests.get(endlink, headers=headers, timeout=5).content
        except requests.exceptions.RequestException:
            time.sleep(30)
    return None  # all retries failed; caller skips the file


def _save_image(endlink, filename):
    """Download *endlink* and write it to *filename*; skip silently on failure."""
    data = _download(endlink)
    if data is None:
        # Original code crashed with NameError here; now we just skip.
        return
    with open(filename, 'wb') as mh:
        mh.write(data)
    print('已完成' + filename)


def writefilespng(endlink, imgname, num):
    """Download a PNG wallpaper as '<imgname>.png'. (*num* kept for API compat.)"""
    _save_image(endlink, imgname + '.png')


def writefilesjpg(endlink, imgname, num):
    """Download a JPG wallpaper as '<imgname>.jpg'. (*num* kept for API compat.)"""
    _save_image(endlink, imgname + '.jpg')


def makebgimg(url, num):
    """Fetch one toplist page and download every wallpaper linked from it.

    Each thumbnail links to a detail page whose '#wallpaper' element carries
    the full-resolution image URL.
    """
    import requests
    from lxml import etree

    r = requests.get(url, headers=headers)
    tree = etree.HTML(r.content.decode('utf8'))
    detail_links = tree.xpath('//*[@id="thumbs"]/section/ul/li/figure/a/@href')
    for everylink in detail_links:
        r = requests.get(everylink, headers=headers)
        tree = etree.HTML(r.content.decode('utf8'))
        endlink = tree.xpath('//*[@id="wallpaper"]/@src').pop()
        # Split on the last dot instead of the original fragile regexes;
        # wallhaven IDs are the 6 characters before the extension.
        stem, _, ext = endlink.rpartition('.')
        imgname = stem[-6:]
        if ext == 'png':
            writefilespng(endlink, imgname, num)
        elif ext == 'jpg':
            writefilesjpg(endlink, imgname, num)


def makelink(wantget):
    """Turn a user range string like '1-10' into a list of page URLs.

    Accepts any string containing numbers: '1-10', '1.2.3', or a single
    number like '5'.  The smallest and largest numbers found bound the
    (inclusive) page range.  Returns [] when no number is present.
    """
    pages = [int(p) for p in re.findall(r'[0-9]+', wantget)]
    if not pages:
        return []
    first, last = min(pages), max(pages)
    return [url + '?page=' + str(i) for i in range(first, last + 1)]


def mainbk():
    """Interactive entry point: ask for a page range and scrape it."""
    print('*' * 30)
    print('壁纸站点:https://wallhaven.cc/toplist')
    print('只抓取toplist')
    print('注意:输入的是一个范围,如果要单独的页码,请只输入一个数字')
    wantget = input('请输入你要抓取的页数,如1-10表示抓取第1-10页,1-1表示抓取第1页\n'
                    '不要使用页码如123456,推荐1-2、1.2.3之类的 Please enter:')
    urllist = makelink(wantget)
    # Process pages in the order the user asked for (the original popped
    # from the end and walked the list backwards).
    for num, page_url in enumerate(urllist):
        makebgimg(page_url, num)


if __name__ == '__main__':
    list_num = getmaxlistnum()
    print('目录前一共有' + list_num + '页壁纸')
    mainbk()
