Beautiful Soup
- A third-party Python library for extracting data from HTML or XML (see the short sketch after this list)
- Official site: http://www.crummy.com/software/BeautifulSoup/
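As a quick illustration of what "extracting data" means here, the following is a minimal sketch; the HTML string and variable names are made up for this example and are not part of the original post:

from bs4 import BeautifulSoup

# A tiny, made-up HTML snippet used only for demonstration
html = ("<html><head><title>Demo Page</title></head>"
        "<body><a href='/a'>first</a><a href='/b'>second</a></body></html>")

soup = BeautifulSoup(html, 'html.parser')   # parse with Python's built-in parser
print(soup.title.string)                    # prints: Demo Page
for link in soup.find_all('a'):             # iterate over all <a> tags
    print(link.get('href'), link.string)    # prints: /a first, then /b second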
Install and test beautifulsoup4
- Install: pip install beautifulsoup4
- Test: import bs4
Test it with code:
import bs4
print(bs4)
Steps to install the package from the Scripts directory:
cd <location of the Scripts directory>
dir
pip install beautifulsoup4
Test output after a successful install:
<module 'bs4' from 'C:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python36-32\\lib\\site-packages\\bs4\\__init__.py'>
Finally, here is a piece of code to play with:
__author__ = 'Mr'

import re
import time
import urllib.request
import urllib.error
from bs4 import BeautifulSoup

# Regular expression matching links to individual blog posts
p = re.compile('/jack_user/article/details/........')

# Blog home page
url = "http://blog.csdn.net/jack_user"

# build_opener() is used so the Python program can mimic a browser when visiting the site
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
html = opener.open(url).read().decode('utf-8')
allfinds = p.findall(html)
# print(allfinds)

# Base part that has to be joined onto each relative URL
urlBase = "http://blog.csdn.net"

# The page contains duplicate links, so use a set to de-duplicate them
mypages = list(set(allfinds))
for i in range(len(mypages)):
    mypages[i] = urlBase + mypages[i]

print('Pages to visit:')
for index, page in enumerate(mypages):
    print(str(index), page)

# Number of times to visit each page
brushNum = 200

# Visit every page
print('Starting the visits now:')
for index, page in enumerate(mypages):
    for j in range(brushNum):
        try:
            pageContent = opener.open(page).read().decode('utf-8')
            # Use BeautifulSoup to parse out the title of each blog post
            soup = BeautifulSoup(pageContent, 'html.parser')
            blogTitle = str(soup.title.string)
            blogTitle = blogTitle[0:blogTitle.find('-')]
            print(str(j), blogTitle)
        except urllib.error.HTTPError:
            print('urllib.error.HTTPError')
            time.sleep(3)  # on error, pause for a few seconds first
        except urllib.error.URLError:
            print('urllib.error.URLError')
            time.sleep(3)  # on error, pause for a few seconds first
        time.sleep(0.5)  # normal pause so the server does not refuse the connection
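An aside on the build_opener() call above: its only purpose here is to send a browser-like User-agent header. The same effect can be achieved per request with urllib.request.Request; the sketch below is not part of the original script, and the URL is just a placeholder:

import urllib.request

# Hypothetical URL used only for illustration
url = "http://example.com/"

# Attach the User-agent header to a single request instead of to an opener
req = urllib.request.Request(url, headers={'User-agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    html = resp.read().decode('utf-8')
print(len(html))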