python批量获取html内body内容的实例

2020-01-04 13:38:47

字体：大中小

来源：转载

供稿：网友

现在有一批完整的 HTML 页面，内容是介绍各城市的美食、景点等，需要将每个页面中 body 标签内的内容提取出来。

方法：利用 Python 的第三方库 BeautifulSoup 获取 HTML 中 body 标签的内容，并对多个目录下的页面进行批量处理。

# -*- coding:utf8 -*-
"""Batch-extract the <body> content of numbered HTML pages into .txt files.

Expected layout: ``rootdir/<city>/<n>.html`` with ``n`` in 1..10.  For every
non-hidden sub-directory of ``rootdir`` the pages named ``1.html`` ..
``10.html`` are parsed with BeautifulSoup and the children of ``<body>`` are
written to ``savepath/<city>/<n>.txt``.

The script runs on both Python 2 and Python 3: pages are read as bytes
(BeautifulSoup detects the encoding) and the output is written as UTF-8 bytes.
"""
from bs4 import BeautifulSoup
import os
import os.path
import sys

if sys.version_info[0] == 2:
    # Legacy Python 2 shim kept from the original script: makes implicit
    # str <-> unicode conversions use UTF-8 instead of ASCII.
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding('utf8')

# Only these file names are extracted from each directory: 1.html .. 10.html.
PAGE_NAMES = frozenset(str(num) + '.html' for num in range(1, 11))

# Parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.  NOTE(review): switch to 'lxml'
# if the pages were originally processed with lxml and results differ.
PARSER = 'html.parser'

rootdir = '../zips2'
savepath = "../testC"

# Number of directories processed so far; incremented by getAndInsert().
file_num = 0


def printPath(level, path):
    """List the immediate sub-directories and files of *path*.

    Args:
        level: Depth label of *path*; stored as a string in the first slot
            of the returned directory list.
        path: Directory to scan (not recursive).

    Returns:
        A ``(dirList, fileList)`` tuple.  ``dirList[0]`` is ``str(level)``,
        followed by the names of the non-hidden sub-directories.
        ``fileList`` holds the names of all regular files in *path*.
    """
    # The first field is the level of this directory, not a directory name.
    dirList = [str(level)]
    fileList = []
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if os.path.isdir(full_path):
            # Hidden (dot-prefixed) directories are excluded; only the
            # visible ones are collected.
            if not name.startswith('.'):
                dirList.append(name)
        elif os.path.isfile(full_path):
            fileList.append(name)
    return (dirList, fileList)


def getAndInsert(rootdir, savepath, path):
    """Extract the <body> of each numbered page in one directory.

    Reads ``rootdir/path/<n>.html`` for every ``n`` in 1..10 that exists and
    writes the serialized children of ``<body>`` to ``savepath/path/<n>.txt``
    (UTF-8).  The output directory is created on demand.  Increments the
    module-level ``file_num`` once per call, i.e. once per directory.

    Args:
        rootdir: Root directory holding one sub-directory per city.
        savepath: Root directory for the generated .txt files.
        path: Name of the sub-directory (relative to both roots) to process.
    """
    global file_num
    src_dir = os.path.join(rootdir, path)
    out_dir = os.path.join(savepath, path)

    for name in os.listdir(src_dir):
        if name not in PAGE_NAMES:
            continue

        # BeautifulSoup reads the whole handle in its constructor, so the
        # file can be closed as soon as the soup has been built.
        with open(os.path.join(src_dir, name), 'rb') as html_file:
            soup = BeautifulSoup(html_file, PARSER)

        body = soup.body
        if body is None:
            # Nothing to extract; report it instead of crashing the batch.
            print(path + '/' + name + ' has no <body>, skipped')
            continue

        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        stem = os.path.splitext(name)[0]
        with open(os.path.join(out_dir, stem + '.txt'), 'wb') as out_file:
            for index, child in enumerate(body):
                # The child at index 1 is left out, exactly as in the
                # original script -- presumably page-specific boilerplate;
                # TODO confirm against the actual pages.
                if index != 1:
                    out_file.write(child.encode('utf-8'))
        print(path + '/' + name + ' is running')

    file_num = file_num + 1


if __name__ == '__main__':
    dirList, fileList = printPath(1, rootdir)
    # dirList[0] is the level label, not a directory: skip it by position so
    # that a real directory which happens to be named "1" is still processed.
    for fn in dirList[1:]:
        getAndInsert(rootdir, savepath, fn)
        print(fn + ' is ending')
    print('一共完成' + str(file_num) + '个城市的提取')

以上这篇python批量获取html内body内容的实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持VEVB武林网。

注：相关教程知识阅读请移步到python教程频道。

上一篇：python requests.post带head和body的实例

下一篇：如何在django里上传csv文件并进行入库处理的方法