
Scraping JD Crowdfunding with Python [Project Name, Backer Count, Follower Count, Days Remaining, Topic Count, Category]


This Python crawler uses requests to fetch the listing pages, Selenium to fetch the detail pages, and lxml to parse the HTML, writing the results to a CSV file. Crawl scope: the first 100 listing pages (sorted by newest), 16 projects per page, 1,600 crowdfunding projects in total. Details are in the code below.

'''
2020/3/11  JD Crowdfunding (京东众筹)
https://z.jd.com/bigger/search.html
------------------------------
Fields: project name, backer count, follower count,
        days remaining, topic count, category
------------------------------
'''
import csv
import re
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class LoopOver(Exception):
    """Raised when a listing page yields no project links, i.e. the crawl is over."""
    pass


class Spider:

    def __init__(self):
        self.path = '.'
        self.csvfilenamegbk = 'datas-gbk.csv'
        self.csvfilename = 'datas.csv'
        self.retry_time = 1
        self.host = 'https://z.jd.com'
        self.listurl = 'https://z.jd.com/bigger/search.html'
        self.user_input_sales = 100
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        start = time.time()
        try:
            for text in self.get_list(self.listurl):
                for page_url in self.parse_list(text):
                    item = self.parse_page(self.get_page(page_url))
                    self.save_data(item)
        except LoopOver:
            pass  # a listing page came back empty, stop crawling
        self.runtime = time.time() - start

    def get_list(self, url):
        """POST the search endpoint once per listing page and yield the HTML."""
        for index in range(1, 101):
            print('Fetching listing page {}'.format(index))
            data = {
                'productEnd': '-28',
                'sort': 'zxsx',  # sort by newest
                'page': index,
            }
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/76.0.3809.87 Safari/537.36',
            }
            response = requests.post(url, data=data, headers=headers)
            yield response.text

    def parse_list(self, text):
        """Extract the detail-page URLs from one listing page."""
        html = etree.HTML(text)
        urls = html.xpath('//ul[@class="infos clearfix"]/li/a/@href')
        urls = [self.host + i for i in urls]
        if len(urls) == 0:
            self.save_html(text)  # keep the empty page for debugging
            raise LoopOver
        for url in urls:
            yield url

    def get_page(self, url):
        """Load a detail page in Selenium and wait for the topic counter to render."""
        self.browser.get(url)
        self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="topicBtn"]/span')))
        time.sleep(3)
        return self.browser.page_source

    def parse_page(self, text):
        # project name, backers, followers, days remaining, topics, category
        html = etree.HTML(text)
        title = html.xpath('//h1[@class="p-title"]/text()')[0].strip()
        typee = html.xpath('//title/text()')[0].strip().split('-')[-2].replace('crowdfunding', '')
        try:
            p_num = re.findall(r'\d+\.?\d*', html.xpath('//p[@class="p-progress"]/span[@class="fr"]/text()')[0].strip())[0]
        except IndexError:
            p_num = '0'
        like_num = re.findall(r'\d+\.?\d*', html.xpath('//*[@id="focusCount"]/text()')[0].strip())[0]
        try:
            l_day = html.xpath('//*[@id="projectMessage"]/span[last()]/text()')[0].strip()
        except IndexError:
            l_day = '0'
        try:
            talk_num = html.xpath('//*[@id="topicBtn"]/span/text()')[0].strip()
        except IndexError:
            talk_num = '0'
        return [title, typee, p_num, like_num, l_day, talk_num]

    def save_data(self, item):
        """Append one row to the CSV file."""
        print('>>>', item)
        with open('{}/{}'.format(self.path, self.csvfilename), 'a', encoding='utf_8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def save_html(self, text):
        with open('test.html', 'w', encoding='utf-8') as f:
            f.write(text)

    @property
    def time(self):
        return 'Total time: {} seconds'.format(self.runtime)


if __name__ == '__main__':
    spider = Spider()
    spider.run()
    print(spider.time)  # total running time
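A side note, not part of the original post: webdriver.Chrome() as used in __init__ opens a visible browser window for each of the 1,600 detail pages. If you don't need to watch the crawl, Selenium's Chrome options let you run headless instead. A minimal sketch, assuming a chromedriver compatible with your local Chrome is on the PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # no visible browser window
browser = webdriver.Chrome(options=options)  # drop-in replacement for webdriver.Chrome() in __init__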
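The constructor also defines self.csvfilenamegbk = 'datas-gbk.csv' but never writes it. Presumably the intent was a GBK copy of the UTF-8 CSV so the file opens cleanly in Chinese-locale Excel; below is a minimal sketch of that conversion. This is my assumption, and the helper name convert_to_gbk is not from the original:

import csv

def convert_to_gbk(src='datas.csv', dst='datas-gbk.csv'):
    """Re-encode the UTF-8 results file as GBK for Excel (hypothetical helper)."""
    with open(src, 'r', encoding='utf_8', newline='') as fin, \
         open(dst, 'w', encoding='gbk', errors='ignore', newline='') as fout:
        writer = csv.writer(fout)
        for row in csv.reader(fin):
            writer.writerow(row)  # characters with no GBK mapping are dropped

Calling convert_to_gbk() once after spider.run() finishes would produce both files the constructor names.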