import time

import requests
from bs4 import BeautifulSoup

# First listing page, kept for reference (its address carries no "oa" offset):
# url = 'http://www.tripadvisor.cn/Attractions-g60763-Activities-New_York_City_New_York.html'

# The range is chosen to match the number of listing pages: the "oa" offset
# in the address runs from 0 to 1050 in steps of 30, one URL per page.
urls = [
    'http://www.tripadvisor.cn/Attractions-g60763-Activities-oa{}-New_York_City_New_York.html#ATTRACTION_LIST'.format(offset)
    for offset in range(0, 1080, 30)
]
def get_attractions(url, data=None):
    """Download one attraction-list page and print every attraction on it.

    Args:
        url: Address of the listing page to fetch.
        data: Unused. Kept only so callers that already pass it keep working.

    Returns:
        A list of the dicts that were printed, one per attraction, each with
        the keys ``'title'``, ``'img'`` and ``'cate'``.

    Raises:
        requests.exceptions.RequestException: If the download fails or does
            not complete within the timeout.
    """
    # Without a timeout a single stalled server would hang the crawl forever.
    wb_data = requests.get(url, timeout=30)
    time.sleep(4)  # fixed delay after each request

    soup = BeautifulSoup(wb_data.text, 'lxml')
    # NOTE(review): the capitalisation in "PRoperty_title" is unusual and CSS
    # class matching is normally case-sensitive -- confirm it against the
    # page markup if this selector returns nothing.
    titles = soup.select('div.PRoperty_title > a[target="_blank"]')
    imgs = soup.select('img[width="160"]')
    cates = soup.select('div.p13n_reasoning_v2 > a')

    attractions = []
    # zip() stops at the shortest of the three result lists, so an attraction
    # is reported only when a title, an image and a category all line up.
    for title, img, cate in zip(titles, imgs, cates):
        # A separate local name avoids overwriting the ``data`` parameter.
        record = {
            'title': title.get_text(),
            'img': img.get('src'),
            'cate': list(cate.stripped_strings),
        }
        print(record)
        attractions.append(record)
    return attractions
# Visit every listing page in order; each call prints the attractions it finds.
for page_url in urls:
    get_attractions(page_url)
# 新手学习 Python,了解网页元素;难点:元素定位。
新闻热点
疑难解答