
Scraping a Few Chinese Comics with Python

Posted: 2023-03-26 16:05:26 · Python

Preface

To be honest, this is probably the hardest crawler I have run into so far. Before this I mostly scraped sites that serve static resources. The anti-scraping measures on this site are only basic, but for a newcomer like me they were still quite a struggle.

2. Working around the anti-scraping

With that, the chapter list can be located directly with XPath:

```python
def getLinks(html):
    chapter_link = []
    chapter_title = []
    parse = parsel.Selector(html)
    links = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]'
                        '/ul[@class="list_con_li autoHeight"]/li/a/@href').getall()
    titles = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]'
                         '/ul[@class="list_con_li autoHeight"]/li/a/span[@class="list_con_zj"]/text()').getall()
    for link in links:
        chapter_link.insert(0, link)
    for title in titles:
        chapter_title.insert(0, title)
    return chapter_link, chapter_title
```

Note that the chapters on the page are listed in descending order, so the list has to be reversed once scraping is done. That is why you cannot simply use the append method here; use insert(0, ...) instead.

Source code

My code:

```python
import requests
import parsel
import pypinyin
from bs4 import BeautifulSoup
import re
import os
import time

# Masquerade as a browser: set the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}

# Request a page and return its HTML
def askUrl(url):
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    return html

# Get all chapter links and chapter names
def getLinks(html):
    chapter_link = []
    chapter_title = []
    parse = parsel.Selector(html)
    links = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]'
                        '/ul[@class="list_con_li autoHeight"]/li/a/@href').getall()
    titles = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]'
                         '/ul[@class="list_con_li autoHeight"]/li/a/span[@class="list_con_zj"]/text()').getall()
    # The chapters are listed newest-first, so build the lists in reverse
    for link in links:
        chapter_link.insert(0, link)
    for title in titles:
        chapter_title.insert(0, title)
    return chapter_link, chapter_title

# Get all image links for one chapter
def getImgs(link):
    pic_url = []
    response = requests.get(link, headers=headers)
    html = BeautifulSoup(response.text, 'lxml')
    script_info = html.script
    one = re.findall(r"\|(\d{4})\|", str(script_info))[0]
    two = re.findall(r"\|(\d{5})\|", str(script_info))[0]
    threes = re.findall(r"\d{13,14}", str(script_info))
    # Pad 13-digit values to 14 digits so they sort correctly
    for i, three in enumerate(threes):
        if len(three) == 13:
            threes[i] = three + '0'
    threes = sorted(threes, key=lambda x: int(x))
    for three in threes:
        if three[-1] == '0':
            pic_url.append("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three[:-1] + ".jpg")
        else:
            pic_url.append("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three + ".jpg")
    return pic_url

# Download the images of one chapter
def download(url, links, dir_name):
    headers1 = {
        'Referer': url,
    }
    i = 1
    for link in links:
        pic_name = '%03d.jpg' % (i)
        new_dir_name = os.path.join(dir_name, pic_name)
        response = requests.get(link, headers=headers1)
        with open(new_dir_name, 'wb') as f:
            f.write(response.content)
            print(pic_name + " downloaded")
        i += 1

# main
def main():
    manhua_name = input("Enter the name of the comic to download: ")
    dir_name = r'D:\漫画'
    if not os.path.exists(dir_name + './' + manhua_name):
        os.makedirs(dir_name + './' + manhua_name)
    dir_name = dir_name + './' + manhua_name
    # Convert the Chinese title to pinyin to build the info-page URL
    manhuas = pypinyin.pinyin(manhua_name, style=pypinyin.NORMAL)
    name = ''
    for manhua in manhuas:
        name = name + ''.join(manhua)
    url = "https://www.dmzj.com/info/" + name + ".html"
    html = askUrl(url)
    links = getLinks(html)[0]
    names = getLinks(html)[1]
    for i, link in enumerate(links):
        if not os.path.exists(dir_name + './' + str(names[i])):
            os.makedirs(dir_name + './' + str(names[i]))
        print("Start downloading: " + names[i])
        imglinks = getImgs(link)
        download(link, imglinks, dir_name + './' + str(names[i]))
        print(names[i] + " finished")
        print("Resting for a moment before the next chapter")
        time.sleep(10)
        print("-" * 80)
    print(manhua_name + " has been fully downloaded")

# Program entry point
if __name__ == '__main__':
    main()
```
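The part of getImgs that is easiest to trip over is how the image URLs are rebuilt from numbers embedded in the chapter page's script block. Below is a small, self-contained sketch of that same logic run against a made-up script string; the values (1234, 56789, and the 13/14-digit identifiers) are invented for illustration and are not taken from a real dmzj page.

```python
import re

# A made-up stand-in for the <script> block of a chapter page.
# The numbers are invented; real pages embed different values.
sample_script = '<script>|1234|chapter|56789|pages 1657099123456|16570991234567|</script>'

one = re.findall(r"\|(\d{4})\|", sample_script)[0]    # first path segment, e.g. '1234'
two = re.findall(r"\|(\d{5})\|", sample_script)[0]    # second path segment, e.g. '56789'
threes = re.findall(r"\d{13,14}", sample_script)      # per-page identifiers

# Pad 13-digit identifiers with a trailing '0' so every entry is 14 digits,
# then sort numerically to restore the page order.
threes = [t + '0' if len(t) == 13 else t for t in threes]
threes.sort(key=int)

for three in threes:
    # A padded identifier ends in '0'; the padding is stripped again when building the URL.
    if three[-1] == '0':
        print("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three[:-1] + ".jpg")
    else:
        print("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three + ".jpg")
```

If these regular expressions come back empty, the page layout has most likely changed and the XPath and patterns above need to be re-checked.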
The original author's code:

```python
import requests
import os
import re
from bs4 import BeautifulSoup
from contextlib import closing
from tqdm import tqdm
import time

# Create the save directory
save_dir = '药神记'
if save_dir not in os.listdir('./'):
    os.mkdir(save_dir)

target_url = "https://www.dmzj.com/info/yaoshenji.html"

# Get the chapter links and chapter names
r = requests.get(url=target_url)
bs = BeautifulSoup(r.text, 'lxml')
list_con_li = bs.find('ul', class_="list_con_li")
cartoon_list = list_con_li.find_all('a')
chapter_names = []
chapter_urls = []
for cartoon in cartoon_list:
    href = cartoon.get('href')
    name = cartoon.text
    # Chapters are listed newest-first, so build the lists in reverse
    chapter_names.insert(0, name)
    chapter_urls.insert(0, href)

# Download the comic chapter by chapter
for i, url in enumerate(tqdm(chapter_urls)):
    download_header = {
        'Referer': url
    }
    name = chapter_names[i]
    # Remove '.' from chapter names before using them as directory names
    while '.' in name:
        name = name.replace('.', '')
    chapter_save_dir = os.path.join(save_dir, name)
    if name not in os.listdir(save_dir):
        os.mkdir(chapter_save_dir)
    r = requests.get(url=url)
    html = BeautifulSoup(r.text, 'lxml')
    script_info = html.script
    pics = re.findall(r'\d{13,14}', str(script_info))
    # Pad 13-digit values to 14 digits so they sort correctly
    for j, pic in enumerate(pics):
        if len(pic) == 13:
            pics[j] = pic + '0'
    pics = sorted(pics, key=lambda x: int(x))
    chapterpic_hou = re.findall(r'\|(\d{5})\|', str(script_info))[0]
    chapterpic_qian = re.findall(r'\|(\d{4})\|', str(script_info))[0]
    for idx, pic in enumerate(pics):
        # The "img/c..." link was truncated in the original post; the chapterpic
        # path below matches the one used in the code above.
        if pic[-1] == '0':
            url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic[:-1] + '.jpg'
        else:
            url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic + '.jpg'
        pic_name = '%03d.jpg' % (idx + 1)
        pic_save_path = os.path.join(chapter_save_dir, pic_name)
        with closing(requests.get(url, headers=download_header, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                with open(pic_save_path, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
            else:
                print('Bad link')
    # Rest for a moment between chapters
    time.sleep(10)
```
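One loose end in the snippet above: content_size is read from the Content-Length header but never used. As a minimal sketch of how it could be put to work (the helper name fetch_image and its signature are my own, not from the original post, and it assumes the server reports an accurate Content-Length), each image can be checked after writing:

```python
import os
import requests
from contextlib import closing

def fetch_image(url, save_path, referer, chunk_size=1024):
    """Stream one image to disk and compare its size against Content-Length."""
    headers = {'Referer': referer}
    with closing(requests.get(url, headers=headers, stream=True)) as response:
        if response.status_code != 200:
            print('Bad link: ' + url)
            return False
        expected = int(response.headers.get('content-length', 0))
        with open(save_path, 'wb') as f:
            for data in response.iter_content(chunk_size=chunk_size):
                f.write(data)
    # Compare the bytes on disk with what the server announced.
    if expected and os.path.getsize(save_path) != expected:
        print('Incomplete download: ' + save_path)
        return False
    return True
```

Returning a boolean makes it easy for the calling loop to retry or log a failed page instead of silently keeping a truncated file.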