新手,以下是爬取百度贴吧指定帖子的图片脚本,因为脚本主要是解析html代码,因此一旦百度修改页面前端代码,那么脚本会失效,权当爬虫入门练习吧,后续还会尝试更多的爬虫。
# coding=utf-8
# !/usr/bin/env python3
"""Download every image from a Baidu Tieba thread.

Walks each page of the thread, pulls the full-size forum image URLs out
of the raw HTML, and saves them as numbered ``.jpg`` files.  NOTE(review):
this scrapes Tieba's current front-end markup, so it will stop matching
if Baidu changes the page HTML or the image CDN URL scheme.
"""
import os
import re
import urllib.request

# Full-size forum images all share this CDN prefix; anchored on it so we
# skip avatars, emoticons and other <img> tags on the page.
IMG_URL_RE = re.compile(r'http://imgsrc\.baidu\.com/forum/w%3D580/sign=[^"\']+?\.jpg')


def extractImgUrls(html):
    """Return the list of full-size forum image URLs found in *html*.

    Pure text scan (no HTML parsing), so it is cheap and order-preserving.
    """
    return IMG_URL_RE.findall(html)


def getHtml(url):
    """Fetch *url* and return its body decoded as text.

    errors='replace' keeps the scrape going even if a page contains
    bytes that are not valid UTF-8.
    """
    # 'with' guarantees the socket is closed even on error.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8', errors='replace')


def getImg(baseUrl='http://tieba.baidu.com/p/4657665666',
           imgPath='F:/craw_tieba/',
           pages=113):
    """Crawl *pages* pages of the thread at *baseUrl* and save every image.

    Images are written into *imgPath* as 1.jpg, 2.jpg, ... in the order
    they are discovered.  Defaults reproduce the original script exactly.
    """
    if not os.path.exists(imgPath):
        os.makedirs(imgPath)
    imgList = []
    for pg in range(1, pages + 1):          # thread pages are 1-based
        url = '%s?pn=%d' % (baseUrl, pg)
        print('Craw:', url)
        imgList.extend(extractImgUrls(getHtml(url)))
    # Save under imgPath (the original hard-coded the directory a second
    # time here instead of reusing imgPath).
    for x, img in enumerate(imgList, start=1):
        urllib.request.urlretrieve(img, os.path.join(imgPath, '%s.jpg' % x))
    print('Craw tieba finish!')


if __name__ == '__main__':
    getImg()
新闻热点
疑难解答