I'm new to Python, so I'll record a web-scraping exercise here as my introductory practice. The target website loads its content dynamically: the file listing is fetched by XHR requests rather than being embedded in the page HTML.
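Because the listing is loaded dynamically, the data never shows up in the page source; the trick is to find the XHR endpoint in the browser's network panel and call it directly with its paging parameters. Below is a minimal sketch of that idea. The endpoint EXAMPLE_API and the "offset"/"limit" parameter names are hypothetical placeholders, not the real 115.com API used later in this post.

# -*- coding: utf-8 -*-
# Sketch: query a dynamically-loaded (XHR) listing page by page.
import requests

EXAMPLE_API = "http://example.com/api/files"   # hypothetical endpoint

def fetch_page(offset, limit=24):
    # The server behind a dynamically-loaded page usually accepts paging
    # parameters and returns JSON instead of HTML.
    resp = requests.get(EXAMPLE_API, params={"offset": offset, "limit": limit})
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    page = fetch_page(0)
    print(page)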
# -*- coding: utf-8 -*-
import requests
import re
import threading

offset_for_pc = 0
# pickcodes to skip (values redacted)
forbidden = ["xxxxxxx", "xxxxxxx", "xxxxxx", "xxxxxxx"]

# Headers copied from the browser's XHR request for the file list (cookie redacted).
headers_for_pc = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'aps.115.com',
    'Referer': 'http://aps.115.com/bridge_2.0.html?xxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

# Headers for the request that resolves a pickcode to the real download URL.
headers_for_realurl = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'xxxxx',
    'Host': 'web.api.115.com',
    'Referer': 'http://web.api.115.com/bridge_2.0.html?xxxxxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

url_for_pc = "http://aps.115.com/natsort/files.php?xxxxxx"
url_for_realurl = "http://web.api.115.com/xxxxxx"

def getpc(url, offset):
    # Fetch one page of the file list and extract every pickcode ("pc") field.
    response = requests.get(url, params="offset=%s" % (offset), headers=headers_for_pc)
    if response.status_code == 200:
        html = response.text
        pickcodes = re.findall(r'"pc":"(.*?)"', html)
        return pickcodes
    else:
        print "Sorry, getting pickcodes failed, error code:", response.status_code
        return -1

def geturl(url, pickcode):
    # Exchange a pickcode for the real file URL.
    response = requests.get(url, params="pickcode=" + pickcode, headers=headers_for_realurl)
    if response.status_code == 200:
        html = response.text
        realurl = re.findall(r'"file_url":"(.*?)"', html)
        return realurl
    else:
        print "Sorry, getting real URL failed, error code:", response.status_code
        return -1

def getpic(url, name):
    # Download the file and save it locally, named after its pickcode.
    f = open("%s" % (name), "wb")
    f.write(requests.get(url).content)
    f.close()

def work(offset):
    # Process one page of results: list pickcodes, resolve each one, download.
    offset = "%s" % (offset)
    print offset
    pcs = getpc(url_for_pc, offset)
    if pcs != -1:
        for pc in pcs:
            if pc not in forbidden:
                url = geturl(url_for_realurl, pc)
                if url != -1 and url:
                    # The JSON response escapes "/" as "\/", so restore it before requesting.
                    getpic(str(url[0]).replace("\\/", "/"), pc)

# Each page holds 24 items, so step the offset by 24 and spawn one thread per page.
for i in range(0, 197, 24):
    td = threading.Thread(target=work, args=(i,))  # args must be a tuple
    td.start()  # start(), not run(), so the work actually runs in a new thread
print "done"
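One design point worth flagging: the final loop fires off a thread for every offset, never waits for them, and puts no cap on how many requests run at once, so "done" is printed while the downloads are still in flight. A small sketch of one way to wait for all workers before reporting completion (it assumes the work() function defined in the script above and replaces that script's last loop):

import threading

threads = []
for i in range(0, 197, 24):
    td = threading.Thread(target=work, args=(i,))
    td.start()
    threads.append(td)
for td in threads:
    td.join()  # block until every worker has finished its downloads
print "done"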