
Scrapy + PhantomJS: Crawling Dynamic Web Page Data


Installing PhantomJS

Download the package from http://phantomjs.org/ ; Windows, Mac OS, and Linux builds are available. Pick the one for your platform, download it, and unpack it (for convenience, you can add phantomjs to your PATH environment variable). The package ships with an example folder containing many ready-made scripts. This article assumes PhantomJS is already installed and on your PATH.
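A quick smoke test from Python confirms the install. This is only a sketch: it assumes selenium is installed, and webdriver.PhantomJS exists only in the older selenium releases of this article's era (it was later deprecated and removed).

# Minimal smoke test: launch PhantomJS and fetch a page. Assumes phantomjs
# is on PATH; otherwise pass executable_path='path/to/phantomjs'.
from selenium import webdriver

driver = webdriver.PhantomJS()
driver.get('http://example.com')
print(driver.title)  # expect: Example Domain
driver.quit()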

Configuring the Scrapy settings file

# Path to the phantomjs binary; here it was copied into the spiders folder
JS_BIN = "spiders//phantomjs.exe"
LOGIN_TYPE = "myCrawl"

# Ignore robots.txt (obeying it would trip the site's anti-crawling rules)
ROBOTSTXT_OBEY = False

# Disable cookies
COOKIES_ENABLED = False

# Set a User-Agent: open any page, press F12 -> Network -> F5, click any
# request, and copy the User-Agent header shown there.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS = 100

# Disable the default user-agent middleware and register our own.
# Keys are middleware import paths, values are their order.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'javaScriptMiddleware.JavaScriptMiddleware': 543,
}
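One caveat: the relative JS_BIN path above resolves against whatever directory you launch scrapy from, not against settings.py itself. A more robust sketch (the project layout here is an assumption) builds an absolute path in settings.py:

# Build an absolute path to phantomjs.exe so the crawl works regardless
# of the current working directory (project layout is an assumption).
import os

JS_BIN = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                      'spiders', 'phantomjs.exe')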

Writing the middleware

What is a middleware? In Scrapy, a downloader middleware is a hook that sits between the engine and the downloader: its process_request method sees every outgoing request and may return None (let Scrapy download the page as usual) or a Response object, in which case Scrapy skips its own downloader entirely. That second behavior is exactly what lets us hand back PhantomJS's rendered page instead of the raw HTML.
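As a minimal sketch of that contract (the class name is illustrative, not part of this project):

# Skeleton downloader middleware (illustrative name).
from scrapy.http import HtmlResponse

class SkeletonMiddleware(object):
    def process_request(self, request, spider):
        # Returning None would let Scrapy fetch request.url itself;
        # returning a Response short-circuits the downloader, as the
        # PhantomJS middleware below does.
        return HtmlResponse(request.url, body=b'<html></html>', encoding='utf-8')

The real middleware for this project: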

# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.conf import settings
from scrapy.http import HtmlResponse
import time
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class JavaScriptMiddleware(object):

    def __init__(self):
        # Must match LOGIN_TYPE = "myCrawl" in settings.py
        if settings['LOGIN_TYPE'] == 'myCrawl':
            '''
            self.simulation = weibo_login(settings['USERNAME'], settings['PWD'], settings['COOKIE_FILE'])
            cookie_file = settings['COOKIE_FILE']
            cookie_jar = cookielib.LWPCookieJar(cookie_file)
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            for c in cookie_jar:
                self.driver.add_cookie({'name': c.name, 'value': c.value, 'path': '/', 'domain': c.domain})
            '''
            # simulate the user login process
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            # Login:
            # self.driver.get('http://login.sina.com.cn/')
            # uid = self.driver.find_element_by_id('username')
            # upw = self.driver.find_element_by_id('password')
            # loginBtn = self.driver.find_element_by_class_name('smb_btn')
            # time.sleep(1)
            # uid.send_keys(settings['USERNAME'])
            # upw.send_keys(settings['PWD'])
            # loginBtn.click()
            # time.sleep(1)
        elif settings['LOGIN_TYPE'] == 'other':
            print('add login code')
            pass
        else:
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        print("Rendering page... scrolling down automatically")
        # Scroll down 1000px at a time until the bottom of the page,
        # so lazily loaded content gets rendered.
        indexPage = 1000
        while indexPage < self.driver.execute_script("return document.body.offsetHeight"):
            self.driver.execute_script("scroll(0," + str(indexPage) + ")")
            indexPage = indexPage + 1000
            print(indexPage)
            time.sleep(1)
        rendered_body = self.driver.page_source
        # Detect the page encoding
        if r'charset="GBK"' in rendered_body or r'charset=gbk' in rendered_body:
            coding = 'gbk'
        else:
            coding = 'utf-8'
        return HtmlResponse(request.url, body=rendered_body, encoding=coding)

    # Close the browser when the spider closes
    def spider_closed(self, spider, reason):
        print('close driver......')
        self.driver.close()
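Two notes. First, because process_request returns an HtmlResponse, Scrapy never downloads request.url itself; the spider only ever sees the PhantomJS-rendered page. Second, the fixed time.sleep(1) pauses are fragile on slow pages; a hedged alternative inside process_request is to wait for a concrete element instead (the element id below is an assumption about the target page, not something PhantomJS requires):

# Sketch: block until a known element has rendered, instead of sleeping.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located((By.ID, 'article_details')))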

Finally, set the spider loose on the site

# -*- coding: utf-8 -*-
import scrapy


class DmozSpider(scrapy.Spider):
    name = "crawl007"
    redis_key = 'blog.csdn.net'
    start_urls = ["http://blog.csdn.net/u010085423/article/details/54943875"]

    def parse(self, response):
        # //*[@id="article_details"]/div[1]/h1/span/a
        content = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract()
        if content:
            print(content[0])
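With the project wired up as above, the spider runs with scrapy crawl crawl007 from the project root. If you prefer launching it from a script, here is a sketch using Scrapy's public API (it assumes you run it from the project root so the project settings are found):

# Run the spider programmatically instead of via "scrapy crawl".
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('crawl007')
process.start()  # blocks until the crawl finishes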