爬虫突破：GlidedSky基础题

时间：2023-03-26 18:16:48 Python

网址：http://glidedsky.com

第一题：计算网页上所有数字的和。注册后查看要爬取的网站，发现全是数字……这第一题真的很简单，没什么好讲的。

第二题：同样的题，只是要请求 1000 个页面，每一页的做法都一样。最简单的方法就是修改第一题的代码，但是速度太慢，可以自己尝试优化，加线程或者直接用协程都可以。我觉得协程应该更快，不过没有做具体的测试。直接修改后运行，不加任何线程和协程，耗时有点长。题目非常基础，关键看你如何优化，还有继续优化的空间。我偷了个懒，直接用协程写的。

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/8/18 0:33
# @Author: zhao.jia
# @Site:
# @File: glide_test.py
# @Software: PyCharm
import requests
import tools
from lxml import etree
import aiohttp
import asyncio
import datetime
import time
from requests.adapters import HTTPAdapter


class TestGlidedsky:
    def __init__(self):
        self.headers = """Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: _ga=GA1.2.1425271689.1566058842; _gid=GA1.2.586445152.1566058842; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566058842,1566106841; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566129989; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IjM4SmpWMlwvaWxPQklreFVaMDFXVFhRPT0iLCJ2YWx1ZSI6IjdoMUFJaVF6YUVvUUNDZU1TaERsN0FVK0dRdTdORW9QUlwvNDlMXC9uXC9IdjdCZ2JCQVhiMXNEV2JKQnI5UXVIMHAiLCJtYWMiOiIyMWMyYzc1MzM3MWQyZTMxNDQwZjA5ZTUxNDZkOThmNTAyOWQwYTQzZDQyZTc4M2Q4YjNlZTI3YjYzZjgwNzA1In0%3D; glidedsky_session=eyJpdiI6Ik1rRUMrXC8yMlVkOEZlSEZja24zdmJRPT0iLCJ2YWx1ZSI6IjRoWG84K1MrM3NLbnlRVytrUVRHd1ZqWWtkdkdyeUtwOTBKdDFWTnl4THdkS1hcL2dmRzA1c1JJRDZSaHk2NlhKIiwibWFjIjoiNmQ2MmJhNWFlNzZiOWEwY2NiMDM1ZTBkZGE2MmNiNGQwNWU4OGJmOTU2OWQxNmU2NmM1MjE1ZmI0NGQ3MjllNyJ9
Host: glidedsky.com
Referer: http://glidedsky.com/login
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"""
        self.sess = requests.session()
        self.sess.headers = tools.headers_to_dict(self.headers)
        self.sum_count_2 = 0
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def basic_one(self):
        sum_count = 0
        res = self.sess.get(url="http://glidedsky.com/level/web/crawler-basic-1")
        res_html = etree.HTML(res.text)
        nums = res_html.xpath('//div[@class="col-md-1"]/text()')
        for num in nums:
            sum_count += int(num.strip())
        print("sum=" + sum_count)

    # 第二题
    def basic_two(self):
        count = 1
        sum_count = 0
        while True:
            res = self.sess.get(f"http://glidedsky.com/level/web/crawler-basic-2?page={count}")
            res_html = etree.HTML(res.text)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            for num in nums:
                sum_count += int(num.strip())
            count += 1
            if count == 1001:
                break
        print(sum_count)

    async def basic_two_2(self, url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=tools.headers_to_dict(self.headers)) as resp:
                res = await resp.text()
                res_html = etree.HTML(res)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                for num in nums:
                    self.sum_count_2 += int(num.strip())

    def sum_async_count(self):
        loop = asyncio.get_event_loop()
        tasks = [asyncio.ensure_future(self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in range(1, 500)]
        loop.run_until_complete(asyncio.gather(*tasks))
        tasks = [asyncio.ensure_future(self.basic_two_2(f"http://glidedsky.com/level/web/crawler-basic-2?page={i}")) for i in range(500, 1001)]
        loop.run_until_complete(asyncio.gather(*tasks))
        print(self.sum_count_2)


if __name__ == '__main__':
    # 第二题
    # starttime = datetime.datetime.now()
    # TestGlidedsky().basic_two()
    # endtime = datetime.datetime.now()
    # count_time_1 = (endtime - starttime).seconds
    # print(count_time_1)
    # 第二题（协程版）
    # starttime_2 = datetime.datetime.now()
    # TestGlidedsky().sum_async_count()
    # endtime_2 = datetime.datetime.now()
    # count_time_2 = (endtime_2 - starttime_2).seconds
    # print(count_time_2)
```

第三题：还是求和，不过这次会封 IP，每个 IP 只能访问一次。这个问题有点恶心，只能找代理 IP。随便找个免费的吧，想办法再试试。

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/8/27 11:00
# @Author: Andrew
# @Site:
# @File: python-abu.py
# @Software: PyCharm
#! -*- encoding:utf-8 -*-
from urllib import request
import base64
from lxml import etree
import time
import requests
from requests.adapters import HTTPAdapter


class test:
    def __init__(self):
        self.sess = requests.session()
        self.sess.mount('http://', HTTPAdapter(max_retries=3))
        self.sess.mount('https://', HTTPAdapter(max_retries=3))
        self.sess.verify = False

    def abu_test(self):
        # 代理服务器
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9020"
        # 代理隧道验证信息
        proxyUser = "H2T*****22WD"
        proxyPass = "7****10526D3F"
        proxy_dict = {'http': "http-dyn.abuyun.com:9020"}
        auth = f"{proxyUser}:{proxyPass}"
        auth = base64.b64encode(auth.encode('utf8'))
        proxy_header = {"Proxy-Authorization": 'Basic ' + auth.decode()}
        self.get_html(proxy_dict, proxy_header)

    def get_html(self, proxy_dict, proxy_header):
        count = 1
        sum_count = 0
        headers = """Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Cookie: _ga=GA1.2.1251062763.1566609395; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1566609396,1566627265; _gid=GA1.2.1809641921.1566875827; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IkNpMHk0SHlDSXIrWHU4MTBIaW96blE9PSIsInZhbHVlIjoiMXpzXC9GRmZGekxQYW5wcUt0ZU0xQ0l0MWVnNHdKWHo5XC9JNTRnZ0c0UWJlYjZlaDVhU1BNRGxENGNoWjBpdkE0IiwibWFjIjoiYTVjYmJjMzY3OTNiNTJjMDE5MjZhNmEzNDIwNGFmZDYwYzk5Yjg5ZjViYmExMzQwMjVkMTkzNDcyMmJjZmYxMyJ9; glidedsky_session=eyJpdiI6ImJ4aHA3QllGZE9PTlRnbTByZnNNOFE9PSIsInZhbHVlIjoiMGt6bUdqbDBcL2JSRERXbVFyMEdHNDArZmtOTHdQ0%3D; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=1566875832
Host: glidedsky.com
Proxy-Connection: keep-alive
Referer: http://glidedsky.com/login
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"""
        import tools
        headers = tools.headers_to_dict(headers)
        headers.update(proxy_header)
        # print(headers)
        while True:
            # if count == 37 or count == 38:
            #     continue
            try:
                res = self.sess.get(f"http://glidedsky.com/level/web/crawler-ip-block-1?page={count}", headers=headers, proxies=proxy_dict, timeout=10)
            except Exception as e:
                print("异常")
                print(e)
                continue
            file_name = f'glidedsky_{count}.html'
            if res.status_code == 200:
                with open(file_name, 'w', encoding='utf8') as f:
                    f.write(res.text)
                res_html = etree.HTML(res.text)
                nums = res_html.xpath('//div[@class="col-md-1"]/text()')
                if nums:
                    print("zhaodao")
                    # with open(file_name, 'w', encoding='utf8') as f:
                    #     f.write(res.text)
                    for num in nums:
                        sum_count += int(num.strip())
                    count += 1
                    print(sum_count)
                    if count == 1001:
                        return sum_count
            # time.sleep(3)

    def parse_html(self):
        count = 1
        sum_count = 0
        while True:
            file_name = f'glidedsky_{count}.html'
            with open(file_name, 'r', encoding='utf8') as f:
                content = f.read()
            res_html = etree.HTML(content)
            nums = res_html.xpath('//div[@class="col-md-1"]/text()')
            if nums:
                for num in nums:
                    sum_count += int(num.strip())
                print("综合计数", count, sum_count)
                if count == 1001:
                    break
                    # return sum_count
            else:
                print("Nocontent", file_name)
                continue
            count += 1
        print("Sum", sum_count)


if __name__ == '__main__':
    # test().abu_test()
    test().parse_html()
```

结果：

本文由多发平台 ArtiPub 自动发布

上一篇：用python给心爱的人做了一个520的照片墙，已经成功做了效果图

下一篇：Python面向对象——类属性《二》

爬虫突破：GlidedSky基础题相关文章