"""WeChat official account article crawler.

Goal: crawl the articles of selected WeChat official accounts.

Data sources
------------
1. Sogou WeChat search: it can search official-account articles, but it
   only shows the ten most recent articles of an account.
2. The material management page of a personal WeChat official account,
   which can be used to browse the articles of other official accounts.

Steps
-----
1. Manually copy the cookies from the website and log in with them.
2. Get the token from the request URL.
3. Request https://mp.weixin.qq.com/cgi-bin/searchbiz with the assembled
   parameters to get the account's fakeid (i.e. biz).
4. Request https://mp.weixin.qq.com/cgi-bin/appmsg? with the assembled
   parameters to get the article list.
5. Crawl each article through its URL.  Read counts and like counts cannot
   be obtained this way, because an official-account article opened as a
   web page shows neither.

Code: GitHub repository (address not included in this text).
"""

import json  # not referenced below; retained from the script's import list
import re
import time

import requests


def _parseCookieString(cookieStr):
    """Convert a raw ``Cookie`` header value into a ``{name: value}`` dict.

    Pairs are separated by ``;`` and each pair is split on its first ``=``
    only, so values that themselves contain ``=`` are kept intact.
    Whitespace around names and values (including a trailing newline from
    the file) is stripped.  Pairs with no name or an empty value, such as
    ``rewardsn=``, are dropped.

    Args:
        cookieStr: Text in the form ``"a=1; b=2"``.

    Returns:
        dict mapping cookie names to cookie values.
    """
    cookies = {}
    for pair in cookieStr.split(';'):
        name, _, value = pair.partition('=')
        name, value = name.strip(), value.strip()
        if name and value:
            cookies[name] = value
    return cookies


class WeChatCrawler():
    """Print the link and title of every article of the given accounts.

    Constructing an instance reads ``cookie.txt`` from the working directory
    and performs one HTTP request to obtain the session token.

    Attributes:
        wxList: Account names to search for.
        cookies: Cookie dict parsed from ``cookie.txt``.
        token: Numeric token string taken from the post-login URL.
        headers: HTTP headers sent with the search and list requests.
        searchBizParam: Query parameters for the ``searchbiz`` endpoint.
        getMsgListParam: Query parameters for the ``appmsg`` endpoint.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Seconds to pause between two article-list pages.
    PAGE_DELAY = 3

    def __init__(self, wxList):
        """Load the cookies, fetch the token and prepare request parameters.

        Args:
            wxList: Iterable of official-account names to crawl.

        Raises:
            OSError: If ``cookie.txt`` cannot be opened.
            IndexError: If no token can be found in the response URL.
        """
        self.wxList = wxList
        self.cookies = self.__getCookiesFromText()
        self.token = self.__getToken()
        self.headers = {
            "HOST": "mp.weixin.qq.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) "
                          "Gecko/20100101 Firefox/53.0",
        }
        self.searchBizParam = {
            'action': 'search_biz',
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'query': '',
            'begin': '0',
            'count': '5',
        }
        self.getMsgListParam = {
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'action': 'list_ex',
            'begin': '0',
            'count': '5',
            'query': '',
            'fakeid': '',
            'type': '9',
        }

    def __getCookiesFromText(self):
        """Read the manually saved cookie header from ``cookie.txt``.

        Returns:
            dict of cookie names to values.
        """
        # The cookie string is copied by hand from a logged-in browser session.
        with open('cookie.txt', 'r', encoding='utf-8') as f:
            cookieStr = f.read()
        return _parseCookieString(cookieStr)

    def __getToken(self):
        """Return the ``token`` query value of the URL the home page resolves to.

        Raises:
            IndexError: If the final URL carries no ``token=<digits>`` part
                (presumably the cookies are missing or expired).
        """
        url = 'https://mp.weixin.qq.com'
        response = requests.get(url=url, cookies=self.cookies,
                                timeout=self.REQUEST_TIMEOUT)
        match = re.search(r'token=(\d+)', str(response.url))
        if match is None:
            # Same exception type as indexing an empty findall() result,
            # but with a message that says what went wrong.
            raise IndexError(
                'no token found in response URL %s; '
                'check that cookie.txt holds a valid logged-in cookie'
                % response.url)
        return match.group(1)

    def __getWXFakeid(self, wx):
        """Look up the fakeid of the first search hit for an account name.

        Args:
            wx: Account name used as the search query.

        Returns:
            The ``fakeid`` of the first result, or ``None`` when the search
            response contains no results.
        """
        searchUrl = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        self.searchBizParam['query'] = wx
        searchResponse = requests.get(searchUrl, cookies=self.cookies,
                                      headers=self.headers,
                                      params=self.searchBizParam,
                                      timeout=self.REQUEST_TIMEOUT)
        matches = searchResponse.json().get('list') or []
        if not matches:
            print('no official account found for', wx)
            return None
        return matches[0].get('fakeid')

    def __getWXMsgCnt(self, fakeId):
        """Return the ``app_msg_cnt`` value reported for an account.

        Args:
            fakeId: The account's fakeid.

        Returns:
            The ``app_msg_cnt`` field of the response, or ``None`` if absent.
        """
        self.getMsgListParam['fakeid'] = fakeId
        # Always ask from the first page so a previous account's paging
        # offset does not leak into this request.
        self.getMsgListParam['begin'] = '0'
        appmsgUrl = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        appmsgResponse = requests.get(appmsgUrl, cookies=self.cookies,
                                      headers=self.headers,
                                      params=self.getMsgListParam,
                                      timeout=self.REQUEST_TIMEOUT)
        return appmsgResponse.json().get('app_msg_cnt')

    def __getWXMsgList(self, fakeId):
        """Page through an account's article list, printing link and title.

        Args:
            fakeId: The account's fakeid.
        """
        appmsgUrl = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        wxMsgCnt = self.__getWXMsgCnt(fakeId)
        if wxMsgCnt is None:
            return

        pageSize = int(self.getMsgListParam['count'])
        # Set explicitly rather than relying on __getWXMsgCnt's side effect.
        self.getMsgListParam['fakeid'] = fakeId
        # Stepping by page size up to the total also covers a final,
        # partially filled page.
        for begin in range(0, int(wxMsgCnt), pageSize):
            print('====翻页====', begin)
            self.getMsgListParam['begin'] = str(begin)
            msgListResponse = requests.get(appmsgUrl, cookies=self.cookies,
                                           headers=self.headers,
                                           params=self.getMsgListParam,
                                           timeout=self.REQUEST_TIMEOUT)
            msgList = msgListResponse.json().get('app_msg_list')
            if msgList is None:
                print('no app_msg_list in response; stopping at offset', begin)
                break
            for item in msgList:
                # TODO: extract more fields
                print(item.get('link'))
                print(item.get('title'))
            time.sleep(self.PAGE_DELAY)

    def runCrawler(self):
        """Resolve every account name to a fakeid, then list its articles.

        Accounts for which no fakeid was found are skipped.
        """
        fakeIds = [self.__getWXFakeid(wx) for wx in self.wxList]
        for fakeId in fakeIds:
            if fakeId is not None:
                self.__getWXMsgList(fakeId)


if __name__ == '__main__':
    # Example
    wxList = ['量子位', ]
    wc = WeChatCrawler(wxList)
    wc.runCrawler()
