"""Wallpaper crawler for http://www.netbian.com.

The original text consisted of two listings that had lost all of their
whitespace and had been partly machine-translated:

* Part 1 - the crawler itself, which did ``import UserAgent``.
* Part 2 - the ``UserAgent`` helper module (a pool of User-Agent strings
  plus ``get_headers()``).

Both parts are kept here, in their original order, as one runnable module.
Because they now share a namespace the crawler calls ``get_headers()``
directly, and the ``__main__`` guard sits at the very end so that every
definition exists before ``main()`` runs.

NOTE(review): the spaces inside the three descendant CSS selectors and
inside the User-Agent strings were restored by hand - confirm the selectors
against the live pages.
"""

import os
import random
import time

# ---------------------------------------------------------------------------
# Part 1: crawler
# ---------------------------------------------------------------------------

index = 'http://www.netbian.com'  # site root address
interval = 0.1  # pause between two image downloads, in seconds
# Raw string: same value as before, without relying on the invalid "\<char>"
# escape sequence that newer Python versions warn about.
firstDir = r'E:\彼岸桌面wallpaper'  # root download directory
classificationDict = {}  # category name -> {'path': directory, 'url': page}

_TIMEOUT = 10  # seconds; requests waits forever when no timeout is given

# Characters stripped from an image title before it is used as a file name.
_FORBIDDEN_IN_NAME = str.maketrans('', '', '\t|:\\/*?"<>')

# Category names skipped by init_classification().  '4kwallpaper' is the
# literal found in the mangled source; its own comment names the category
# '4k壁纸', so both spellings are accepted (compared case-insensitively).
_SKIPPED_CATEGORIES = ('4kwallpaper', '4k壁纸')


def _fetch_soup(url):
    """Download *url* with a random User-Agent and return the parsed page."""
    # Imported lazily so the pure helpers in this module stay importable
    # when the scraping dependencies are not installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url=url, headers=get_headers(), timeout=_TIMEOUT)
    response.encoding = 'gbk'  # encoding used by the site
    return BeautifulSoup(response.text, 'lxml')


def screen(url, select):
    """Return the list of elements of page *url* matching CSS *select*."""
    return _fetch_soup(url).select(select)


def screenPage(url, select):
    """Return the text of the node following the first match of *select*.

    Used to read the last page number of a category.

    Raises:
        IndexError: nothing on the page matches *select*.
        AttributeError: the match has no following sibling.
    """
    return _fetch_soup(url).select(select)[0].next_sibling.text


def _safe_filename(title):
    """Remove tabs and the characters ``| : \\ / * ? " < >`` from *title*."""
    return (title or '').translate(_FORBIDDEN_IN_NAME)


def _unique_path(path):
    """Return *path*, or a variant with random digits appended to the stem.

    While the candidate exists, a random integer in [2, 17] is appended to
    the file stem.  ``os.path.splitext`` is used so that dots in directory
    names or in the stem do not corrupt the result.
    """
    stem, ext = os.path.splitext(path)
    while os.path.exists(path):
        stem += str(random.randint(2, 17))
        path = stem + ext
    return path


def download(src, name, path):
    """Save the image at URL *src* as ``<path>/<name>.jpg``.

    Args:
        src: image URL; anything that is not a ``str`` is ignored.
        name: file name without extension.
        path: existing target directory.

    Returns:
        True when the file was written, False when *src* is not a string
        or the server answered 404.
    """
    if not isinstance(src, str):
        return False

    import requests  # lazy, see _fetch_soup()

    # One streamed request serves both the 404 check and the download.
    with requests.get(src, stream=True, timeout=_TIMEOUT) as response:
        if response.status_code == 404:
            return False
        target = _unique_path(path + '/' + name + '.jpg')
        with open(target, 'wb') as pic:
            for chunk in response.iter_content(8192):
                pic.write(chunk)
    return True


def handleImgs(links, path):
    """Follow each thumbnail link to its full-size image and download it.

    Args:
        links: anchor elements taken from a category list page.
        path: directory receiving the images of that page.
    """
    for link in links:
        href = link.get('href')
        if not href or href == 'http://pic.netbian.com/':  # image ad
            continue

        # First hop: the picture's detail page.  A few links are already
        # absolute instead of relative.
        url = href if 'http://' in href else index + href
        found = screen(url, 'div#main div.endpage div.pic div.pic-down a')
        if not found:
            print(url + 'Noneof这张图片,爬取失败')
            continue

        # Second hop: the page holding the full-size image.
        url = index + found[0].get('href')
        found = screen(url, 'div#main table a img')
        if not found:
            print(url + "图片需要登录才能爬取,爬取失败")
            continue

        image = found[0]
        name = _safe_filename(image.get('alt'))
        print(name)  # show the file name of the image being downloaded

        if not download(image.get('src'), name, path):
            print(url + '图片下载链接404,爬取失败')
            print()
            continue
        print()
        time.sleep(interval)


def select_classification(choice):
    """Download every page of the category named *choice*.

    Raises:
        KeyError: *choice* is not a key of ``classificationDict``.
    """
    print('------------------------')
    print('-------------' + choice + '------------')
    print('--------------------------')
    secondUrl = classificationDict[choice]['url']
    secondDir = classificationDict[choice]['path']
    os.makedirs(secondDir, exist_ok=True)  # category directory

    try:
        lastPagenum = int(screenPage(secondUrl, '#main>div.page>span.slh'))
    except (IndexError, AttributeError, ValueError):
        # NOTE(review): presumably short categories render no 'span.slh'
        # marker - fall back to the first page only instead of crashing.
        lastPagenum = 1

    for page in range(1, lastPagenum + 1):
        if page == 1:
            url = secondUrl
        else:
            url = secondUrl + 'index_%d.htm' % page
        print('-------------' + choice + ':' + str(page) + '------------')
        path = secondDir + '/' + str(page)
        os.makedirs(path, exist_ok=True)  # one sub-directory per page
        handleImgs(screen(url, 'div#main div.list ul li a'), path)


def ui():
    """Prompt for a category name (or 'all') and start the download."""
    while True:  # loop instead of recursing on invalid input
        print('------------netbian------------')
        print('all', *classificationDict)
        choice = input('请Enterclassificationname:')
        if choice == 'all' or choice in classificationDict:
            break
        print("输入错误,请重新输入enter!")
        print('----')

    if choice == 'all':
        for name in list(classificationDict):
            select_classification(name)
    else:
        select_classification(choice)


def init_classification():
    """Fill ``classificationDict`` from the category menu of the home page."""
    menu = screen(index, '#header>div.head>ul>li:nth-child(1)>div>a')
    for entry in menu:
        href = entry.get('href')  # relative address of the category page
        text = entry.string  # category name
        if not href or not text:
            continue
        if text.strip().lower() in _SKIPPED_CATEGORIES:
            # 4K wallpapers cannot be crawled because of access
            # restrictions, so the category is skipped.
            continue
        classificationDict[text] = {
            'path': firstDir + '/' + text,  # category directory
            'url': index + href,  # category page URL
        }


def main():
    """Create the root directory, load the categories and run the prompt."""
    os.makedirs(firstDir, exist_ok=True)
    init_classification()
    ui()


# ---------------------------------------------------------------------------
# Part 2: User-Agent pool (originally the separate ``UserAgent`` module)
# ---------------------------------------------------------------------------

user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; zh-cn) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
    # new mobile UA
    # NOTE(review): the URL was truncated to "search/s..." in the source;
    # "spider.html" is the assumed completion - confirm.
    "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
]


def get_headers():
    """Return request headers carrying a randomly chosen User-Agent."""
    return {'User-Agent': random.choice(user_agent)}


if __name__ == '__main__':
    main()
