当前位置: 首页 > 后端技术 > Python

爬虫项目实践:爬取星际争霸2天梯数据

时间:2023-03-26 18:58:20 Python

"""Crawl StarCraft II ladder data from rankedftw.com into a local MySQL table.

Ladder pages are fetched from
https://www.rankedftw.com/ladder/lotv/1v1/mmr/ (the offset advances in steps
of 100), every table row is parsed into a flat record, and each record is
inserted into the table named by ``config.tabel``. The original run of this
script collected roughly 320,000 rows in total.
"""

from multiprocessing import Pool

import pymysql.cursors
import requests
from bs4 import BeautifulSoup

from config import db, host, password, port, tabel, user

LADDER_URL = 'https://www.rankedftw.com/ladder/lotv/1v1/mmr/?offset={}'
PAGE_SIZE = 100
PAGE_COUNT = 3254
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks a worker forever


def _connect():
    """Open a new MySQL connection using the settings from ``config``."""
    # Keyword arguments: recent PyMySQL releases reject positional parameters.
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=db,
        port=port,
    )


# Connect to the database (kept at module level for backward compatibility).
connection = _connect()
cursor = connection.cursor()


def _init_worker():
    """Give each pool worker its own connection and cursor.

    A connection inherited through fork shares one socket between all
    processes, which interleaves their traffic; reconnecting per worker
    avoids that.
    """
    global connection, cursor
    connection = _connect()
    cursor = connection.cursor()


def get_html(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def save_to_db(values):
    """Insert one row made of *values* into the configured table.

    The values are sent as query parameters rather than formatted into the
    SQL text, so quotes in scraped player names cannot break or alter the
    statement. On a database error the transaction is rolled back and the
    failure is printed; nothing is raised to the caller.
    """
    placeholders = ', '.join(['%s'] * len(values))
    # The table name comes from local config and cannot be a bound parameter.
    insert_ = 'INSERT INTO {} VALUES ({})'.format(tabel, placeholders)
    try:
        if cursor.execute(insert_, tuple(values)):
            connection.commit()
    except pymysql.MySQLError:
        connection.rollback()
        print('insert failed', values)


def get_info(html):
    """Parse one ladder page and store every team row in the database.

    Raises:
        ValueError: if the page does not contain the ladder table.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='team-size-1')
    if table is None:
        raise ValueError('ladder table not found in page')

    # The first <tr> is skipped, as in the original (presumably the header).
    for row in table.find_all('tr')[1:]:
        numbers = row.find_all('td', class_='number')
        images = row.find_all('td', class_='img')

        league_img = images[1].find('img', class_='league')
        if league_img is not None:
            # File name of the icon minus its last 10 characters
            # (presumably a fixed-length suffix -- confirm against the site).
            league = league_img.get('src').split('/')[-1][:-10]
        else:
            league = '0'

        infomation = {
            'Rank': int(numbers[0].text),
            'Name': row.find('span', class_='name').text,
            'MMR': int(numbers[1].text),
            'Points': int(numbers[2].text),
            'Wins': int(numbers[3].text),
            'Loss': int(numbers[4].text),
            # NOTE(review): stored as a constant 0 while numbers[5] is never
            # read -- confirm whether the played count was meant to go here.
            'Played': 0,
            'WinRate': numbers[6].text,
            'Age': numbers[7].text,
            'Region': images[0].find('img').get('src')[-12:-10],
            'League': league,
            'Tier': int(images[2].text),
            'Race': row.find('img', class_='race').get('src').split('/')[-1][:-10],
        }
        # Dict insertion order defines the column order of the INSERT.
        save_to_db(list(infomation.values()))


def main(offset):
    """Fetch and store ladder page number *offset* (0-based).

    Any error is printed instead of raised so that one bad page does not
    abort the whole pool run.
    """
    try:
        url = LADDER_URL.format(offset * PAGE_SIZE)
        html = get_html(url)
        get_info(html)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    try:
        with Pool(initializer=_init_worker) as pool_:
            pool_.map(main, range(PAGE_COUNT))
    finally:
        connection.close()

猜你喜欢