"""Scraper for the full-colour One Piece manga hosted on kanbook.net.

This module merges four small scripts into one file:

1. A User-Agent helper: each request carries a randomly chosen browser
   identity so the target server cannot single out the client.
2. A proxy-pool helper: each request goes through a randomly chosen proxy
   read from ``ippool.json`` so a single IP address is less likely to be
   banned.
3. ``OnePieceSpider``: builds a JSON catalogue of volumes (title, URL
   postfix, page count).
4. ``OnePieceImageSpider``: downloads every page image of every volume
   listed in that catalogue.

Third-party packages (``requests``, ``lxml``, ``urllib3``) are imported
inside the functions that need them, so the pure helpers stay importable
with only the standard library installed.
"""
import json
import os
import random
import re
import time

# Files and directories shared by the two spiders.
PROXY_POOL_PATH = "./ipfile/ippool.json"
CATALOG_DIR = "./image_url"
CATALOG_PATH = CATALOG_DIR + "/one_piece_data.json"

# Matches the JavaScript array literal that lists the image URLs of a page.
IMG_LIST_RE = re.compile(r"var\s+img_list\s*=\s*(\[.+])")

# Common prefixes of the Chromium-based identities below.
_WIN10_X64 = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) ")
_WIN10_WOW64 = ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) ")

# Pool of browser identities.  Duplicates are intentional leftovers of the
# original list and are kept so the selection weights do not change.
_USER_AGENTS = (
    _WIN10_WOW64 + "Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
    _WIN10_X64 + "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    _WIN10_WOW64 + "Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE",
    _WIN10_WOW64 + "Chrome/80.0.3987.162 Safari/537.36",
    _WIN10_X64 + "Chrome/83.0.4103.106 Safari/537.36",
    _WIN10_X64 + "Chrome/80.0.3987.163 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    _WIN10_WOW64 + "Chrome/70.0.3538.25 Safari/537.36 "
                   "Core/1.70.3722.400 QQBrowser/10.5.3751.400",
    _WIN10_X64 + "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363",
    _WIN10_X64 + "Chrome/78.0.3904.70 Safari/537.36",
    _WIN10_WOW64 + "Chrome/70.0.3538.25 Safari/537.36 "
                   "Core/1.70.3765.400 QQBrowser/10.6.4153.400",
    _WIN10_WOW64 + "Chrome/70.0.3538.25 Safari/537.36 "
                   "Core/1.70.3765.400 QQBrowser/10.6.4153.400",
    _WIN10_X64 + "Chrome/83.0.4103.106 Safari/537.36",
    _WIN10_X64 + "Chrome/64.0.3282.204 Safari/537.36",
    _WIN10_X64 + "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    _WIN10_X64 + "Chrome/81.0.4044.138 Safari/537.36",
    _WIN10_X64 + "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    _WIN10_X64 + "Chrome/80.0.3987.132 Safari/537.36",
    _WIN10_X64 + "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; ServiceUI 14) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362",
    _WIN10_X64 + "Chrome/83.0.4103.106 Safari/537.36",
    # The original entry contained a literal ellipsis, which cannot be
    # encoded as an HTTP header value; a real Firefox 77 token is used.
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) "
    "Gecko/20100101 Firefox/77.0",
)

user_agent_data = [{"User-Agent": agent} for agent in _USER_AGENTS]


def get_headers():
    """Return a header dict carrying one randomly chosen User-Agent.

    A fresh copy is returned on every call because callers add further
    headers (Referer, Accept-Encoding, ...) to the result; handing out the
    pool entry itself would leak those additions into later requests.
    """
    return dict(random.choice(user_agent_data))


def get_proxies(path=PROXY_POOL_PATH):
    """Return one randomly chosen entry of the JSON proxy pool.

    :param path: JSON file holding a list of proxy entries.
    :return: the selected entry, passed on unchanged as ``proxies=`` to
        ``requests.get`` (presumably a scheme-to-URL mapping; verify
        against the pool file).
    """
    with open(path, "r", encoding="utf-8") as pool_file:
        proxy_lists = json.load(pool_file)
    return random.choice(proxy_lists)


def _parse_page_count(label):
    """Extract the page count from a volume label.

    The count is looked up in characters 1..3 of ``label``, the same window
    the original ``int(label[1:4])`` used, but only the digits inside that
    window are converted, so counts shorter than three digits also parse.

    :raises ValueError: if the window contains no digit.
    """
    digits = re.search(r"\d+", label[1:4])
    if digits is None:
        raise ValueError("no page count found in %r" % (label,))
    return int(digits.group())


class OnePieceSpider(object):
    """Collect title, URL postfix and page count of every volume."""

    def __init__(self):
        self.url = "http://kanbook.net/328"
        self.html_data = None
        self.one_piece_data_list = []

    def get_url_html(self):
        """Download the index page and keep its source in ``html_data``."""
        import requests

        headers = get_headers()
        # Extra headers that make the request look like a browser visit.
        headers["Accept-Encoding"] = "deflate,sdch,br"
        headers["Content-Type"] = "text/html;charset=UTF-8"
        headers["Referer"] = "https://kanbook.net/328/3/1/1"
        response = requests.get(
            url=self.url,
            headers=headers,
            proxies=get_proxies(),
            timeout=30,  # same limit the image spider uses
        )
        self.html_data = response.content.decode("utf-8")

    def catch_html_data(self):
        """Parse ``html_data`` and append one record per volume."""
        from lxml import etree

        data_parse = etree.HTML(self.html_data)
        li_list = data_parse.xpath("//div[@aria-labelledby='3-tab']/ol/li")
        # The list is walked in reverse of its order on the page.
        for li_element in reversed(li_list):
            one_piece_item = {
                "title": li_element.xpath("./a/@title")[0],
                "postfix": li_element.xpath("./a/@href")[0],
                "page": _parse_page_count(
                    li_element.xpath("./a/span/text()")[0]),
            }
            self.one_piece_data_list.append(one_piece_item)
            print("添加成功!")

    def save_data_file(self):
        """Write the collected records to the JSON catalogue file."""
        os.makedirs(CATALOG_DIR, exist_ok=True)
        with open(CATALOG_PATH, "w", encoding="utf-8") as catalog_file:
            json.dump(self.one_piece_data_list, catalog_file,
                      ensure_ascii=False, indent=2)
        print("数据保存成功!")

    def run(self):
        """Run the three stages: download, parse, save."""
        self.get_url_html()
        self.catch_html_data()
        self.save_data_file()


class OnePieceImageSpider(object):
    """Download every page image of one volume.

    Requests carry a Referer header, and every step is retried in a
    ``while True`` loop until it succeeds so that no page is skipped.
    """

    def __init__(self):
        self.url = ""

    def set_url(self, out_url):
        """Set the URL template; it must contain one ``{}`` placeholder."""
        self.url = out_url

    def get_url_list(self, num):
        """Return the page URLs for pages ``1..num`` of the template."""
        return [self.url.format(page) for page in range(1, num + 1)]

    def get_url_html(self, inner_url):
        """Fetch ``inner_url`` and return the raw response body as bytes.

        Sleeps 1-6 seconds after the request to throttle the crawl.
        """
        import requests
        import urllib3

        # verify=False below would otherwise emit a warning per request.
        urllib3.disable_warnings()

        headers = get_headers()
        headers["Accept-Encoding"] = "deflate,sdch,br"
        headers["Content-Type"] = "text/html;charset=UTF-8"
        headers["Referer"] = "https://kanbook.net/328/3/6"
        response = requests.get(
            url=inner_url,
            headers=headers,
            proxies=get_proxies(),
            timeout=30,
            verify=False,
        )
        time.sleep(random.randint(1, 6))
        return response.content

    def __download_image(self, image_url, name, index):
        """Download one image and store it as ``<name>/<index>.jpg``.

        :param image_url: address of the image; nothing happens when empty
        :param name: volume title, used as the directory name
        :param index: number used as the file name
        """
        if not image_url:
            return
        path = "./onepieceimage/%s" % name
        # NOTE(review): retries forever by design; a permanently failing
        # URL never returns.
        while True:
            try:
                content = self.get_url_html(image_url)
                # makedirs also creates the missing parent directory, which
                # os.mkdir could not do.
                os.makedirs(path, exist_ok=True)
                with open(path + "/%d.jpg" % index, "wb") as wfile:
                    wfile.write(content)
                break
            except Exception as msg:
                print("发生异常,错误信息为", msg)

    def run(self, url_list, title):
        """Download the first listed image of every page in ``url_list``.

        :param url_list: page URLs of one volume
        :param title: volume title, used as the output directory name
        """
        # NOTE(review): numbering starts at 2 in the original; confirm that
        # this is intended before changing it.
        index = 2
        for url in url_list:
            while True:
                try:
                    data = self.get_url_html(url).decode("utf-8")
                    result = IMG_LIST_RE.findall(data)
                    lists = json.loads(result[0])
                    img_url = lists[0]
                    print(img_url)
                    break
                except Exception as msg:
                    print("Error message:", msg)
            self.__download_image(img_url, title, index)
            print("下载%d" % index)
            index += 1
        print("所有图片下载成功fully")


def catalog_main():
    """Build the volume catalogue (entry point of the first spider)."""
    spider = OnePieceSpider()
    spider.run()


def main():
    """Download every volume listed in the catalogue file."""
    # The catalogue is produced by the first spider; build it on demand so
    # that the module can be run in one go.
    if not os.path.exists(CATALOG_PATH):
        catalog_main()

    with open(CATALOG_PATH, "r", encoding="utf-8") as read_file:
        one_piece_data = json.load(read_file)

    for element in one_piece_data:
        href_name = element["postfix"]
        number = element["page"]
        name = element["title"]
        # Page URL template; the placeholder receives the page number.
        http_url = "http://kanbook.net" + href_name + "/{}"

        onepieceimgspider = OnePieceImageSpider()
        onepieceimgspider.set_url(http_url)
        print("%s开始下载!" % name)
        url_list = onepieceimgspider.get_url_list(number)
        onepieceimgspider.run(url_list, name)


if __name__ == '__main__':
    main()
