# NOTE(review): this text was recovered from a whitespace-stripped paste that
# concatenates several independent scripts.  Each banner below marks the start
# of what was originally a separate file; split them back out before running.

# ---------------------------------------------------------------------------
# Fetch proxy IPs (IPPool.py)
# ---------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import UserAgent

# Masquerade as a random browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}


def get_ip():
    """Scrape the first page of the proxy listing and return proxy URLs.

    Returns:
        list[str]: one ``"<protocol>://<ip>:<port>"`` string per table row
        matched by ``//tr[@class="odd"]``.

    Raises:
        IndexError: if a matched row lacks one of the expected cells.
    """
    ip_list = []
    # Proxies are short-lived, so only the first page is fetched.
    # NOTE(review): the host name looks misspelled in the recovered text -
    # confirm the real listing URL before use.
    url = 'https://www.xicidali.com/nt/'
    # Request the page.  The request lines were lost in the paste and are
    # restored to mirror the get_ip() of IPPool2.py further down.
    response = requests.get(url=url, headers=headers)
    # Set the encoding from the detected one before decoding the body.
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    for row in html.xpath('//tr[@class="odd"]'):
        # IP address
        host = row.xpath('./td[2]/text()')[0]
        # Port number
        port = row.xpath('./td[3]/text()')[0]
        # Protocol, lower-cased so it can be used as a URL scheme
        agreement = row.xpath('./td[6]/text()')[0].lower()
        # Assemble the complete proxy URL.
        ip_list.append(agreement + '://' + host + ':' + port)
    return ip_list


if __name__ == '__main__':
    ip_list = get_ip()
    print(ip_list)


# ---------------------------------------------------------------------------
# Testing the IPs - method 1 (multiprocessing.dummy thread pool)
# ---------------------------------------------------------------------------
import requests
from multiprocessing.dummy import Pool

# Get the list of crawled proxies.
from IPPool import get_ip

test_list = get_ip()
# Global list that collects the proxies that worked.
ip_list = []
# URL requested through each proxy to test it.
url = 'http://icanhazip.com'
# Spaces in the User-Agent were stripped by the paste and are restored here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) '
                  'Gecko/20100101 Firefox/70.0'
}


def ip_test(ip):
    """Request ``url`` through the proxy ``ip`` and record it if that succeeds.

    Args:
        ip: proxy URL of the form ``"<protocol>://<ip>:<port>"``.

    A proxy whose request completes without raising is appended to the
    module-level ``ip_list``; the outcome is printed either way.
    """
    try:
        # Key the proxies mapping by the scheme of the proxy URL itself.
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        ip_list.append(ip)
        print(ip + "available")
    except Exception:
        # Best effort: any failure marks the proxy as unusable.  Unlike a bare
        # ``except:`` this still lets KeyboardInterrupt/SystemExit propagate.
        print(ip + "unavailable")


if __name__ == '__main__':
    pool = Pool(4)
    pool.map(ip_test, test_list)
    print(ip_list)
    print("总共抓取了%s个ip,可用ip为:%s,不可用ip为:%s"
          % (len(test_list), len(ip_list), len(test_list) - len(ip_list)))


# ---------------------------------------------------------------------------
# Testing the IPs - method 2 (threading with a shared queue)
# ---------------------------------------------------------------------------
import threading
import requests
import queue
from fake_useragent import UserAgent

# Get the list of crawled proxies.
from IPPool import get_ip

test_list = get_ip()
# Global list that collects the proxies that worked.
ip_pool = []
# Random User-Agent header as a disguise.
ua = UserAgent()
headers = {'User-Agent': ua.random}

url = 'https://www.csdn.net/'
# url = 'http://icanhazip.com/'


def test_ip(queue_list):
    """Worker loop: take proxies off ``queue_list`` until it is empty.

    Args:
        queue_list: ``queue.Queue`` of proxy URL strings shared by all workers.

    Each proxy whose request returns HTTP 200 is appended to the module-level
    ``ip_pool``.
    """
    while True:
        # get_nowait() makes "check and take" one atomic step.  The previous
        # empty()-then-get() pair could block forever when another worker
        # took the last item between the two calls.
        try:
            ip = queue_list.get_nowait()
        except queue.Empty:
            break
        if ip.split(":")[0] == 'http':
            proxies = {'http': ip}
        else:
            proxies = {'https': ip}
        try:
            response = requests.get(url=url, headers=headers,
                                    proxies=proxies, timeout=3)
            if response.status_code == 200:
                print("【%s】测试%s,测试结果【可用】"
                      % (threading.current_thread().name, proxies))
                ip_pool.append(ip)
        except Exception:
            # Best effort: any failure marks the proxy as unusable.
            print("【%s】测试%s,测试结果【Unavailable]"
                  % (threading.current_thread().name, proxies))


if __name__ == '__main__':
    queue_list = queue.Queue()  # queue holding every crawled proxy
    for i in test_list:
        queue_list.put(i)

    # Create the worker threads.
    out_thread = [
        threading.Thread(target=test_ip, args=(queue_list,),
                         name="process%s" % item)
        for item in range(5)
    ]
    for thread in out_thread:
        thread.start()
    for thread in out_thread:
        thread.join()
    print('testcompleted')
    print(ip_pool)
    print("总共抓取了%s个ip,可用ip为:%s,不可用ip为:%s"
          % (len(test_list), len(ip_pool), len(test_list) - len(ip_pool)))


# ---------------------------------------------------------------------------
# IPPool2.py
# ---------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import UserAgent

# Masquerade as a random browser.
ua = UserAgent()
headers = {'User-Agent': ua.random}


def get_ip():
    """Scrape the listing page and return the text of each row's first cell.

    Returns:
        list[str]: first-column text of every row matched by
        ``//*[@id="list"]/table/tbody/tr``.

    Raises:
        IndexError: if a matched row has no text in its first cell.
    """
    ip_list = []
    # NOTE(review): this URL is truncated ("...") in the recovered text -
    # restore the full address before running.
    url = 'https://www.kuaidali.com/fre...'
    # Request the page.
    response = requests.get(url=url, headers=headers)
    # Set the encoding from the detected one before decoding the body.
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    for row in html.xpath('//*[@id="list"]/table/tbody/tr'):
        ip_list.append(row.xpath('./td[1]/text()')[0])
    return ip_list


if __name__ == '__main__':
    ip_list = get_ip()
    # print(ip_list)
