python解析html提取数据，并生成word文档实例解析

2020-01-04 16:03:33

字体：大中小

来源：转载

供稿：网友

简介

今天试着用python做了一个抓取网页内容，并生成word文档的功能，功能很简单，做一下记录以备以后用到。

生成word用到了第三方组件python-docx，所以先进行第三方组件的安装。由于windows下安装的python默认不带setuptools这个模块，所以要先安装setuptools这个模块。

安装

1、在python官网上找到　https://bootstrap.pypa.io/ez_setup.py　　，把代码保存到本地并执行:　 python ez_setup.py

2、下载python-docx 　(https://pypi.python.org/pypi/python-docx/0.7.4)，下载完成后解压并进入到　　XXX/python-docx-0.7.4　安装python-docx :　python setup.py install

这样python-docx就安装成功了，可以用它来操作word文档了，word文档的生成参考的这里https://python-docx.readthedocs.org/en/latest/index.html

html解析用到的是sgmllib里的SGMLParser　　url内容的获取用到的是urllib、urllib2

实现代码

# -*- coding: cp936 -*- from sgmllib import SGMLParser import os import sys import urllib import urllib2 from docx import Document from docx.shared import Inches import time  ##获取要解析的url class GetUrl(SGMLParser):   def __init__(self):     SGMLParser.__init__(self)     self.start=False     self.urlArr=[]     def start_div(self,attr):     for name,value in attr:       if value=="ChairmanCont Bureau":#页面js中的固定值         self.start=True     def end_div(self):     self.start=False     def start_a(self,attr):     if self.start:       for name,value in attr:         self.urlArr.append(value)            def getUrlArr(self):     return self.urlArr    ##解析上面获取的url，获取有用数据 class getManInfo(SGMLParser):   def __init__(self):     SGMLParser.__init__(self)     self.start=False     self.p=False     self.dl=False     self.manInfo=[]     self.subInfo=[]    def start_div(self,attr):     for name,value in attr:       if value=="SpeakerInfo":#页面js中的固定值         self.start=True    def end_div(self):     self.start=False    def start_p(self,attr):     if self.dl:       self.p=True    def end_p(self):     self.p=False    def start_img(self,attr):     if self.dl:       for name,value in attr:         self.subInfo.append(value)          def handle_data(self,data):     if self.p:       self.subInfo.append(data.decode('utf-8'))     def start_dl(self,attr):     if self.start:       self.dl=True    def end_dl(self):     self.manInfo.append(self.subInfo)     self.subInfo=[]     self.dl=False    def getManInfo(self):     return self.manInfo              urlSource="http://www.XXX" sourceData=urllib2.urlopen(urlSource).read()  startTime=time.clock() ##get urls getUrl=GetUrl() getUrl.feed(sourceData) urlArr=getUrl.getUrlArr() getUrl.close() print "get url use:" + str((time.clock() - startTime)) startTime=time.clock()   ##get maninfos manInfos=getManInfo() for url in urlArr:#one url one person   data=urllib2.urlopen(url).read()   manInfos.feed(data) infos=manInfos.getManInfo() manInfos.close() print 
"get maninfos use:" + str((time.clock() - startTime)) startTime=time.clock()  #word saveFile=os.getcwd()+"//xxx.docx" doc=Document() ##word title doc.add_heading("HEAD".decode('gbk'),0) p=doc.add_paragraph("HEADCONTENT:".decode('gbk'))   ##write info for infoArr in infos:   i=0   for info in infoArr:     if i==0:##img url       arr1=info.split('.')       suffix=arr1[len(arr1)-1]       arr2=info.split('/')       preffix=arr2[len(arr2)-2]       imgFile=os.getcwd()+"//imgs//"+preffix+"."+suffix       if not os.path.exists(os.getcwd()+"//imgs"):         os.mkdir(os.getcwd()+"//imgs")       imgData=urllib2.urlopen(info).read()        try:         f=open(imgFile,'wb')         f.write(imgData)         f.close()         doc.add_picture(imgFile,width=Inches(1.25))         os.remove(imgFile)       except Exception as err:         print (err)              elif i==1:       doc.add_heading(info+":",level=1)     else:       doc.add_paragraph(info,style='ListBullet')     i=i+1     doc.save(saveFile) print "word use:" + str((time.clock() - startTime))