首页 > 编程 > JavaScript > 正文

nodejs通过phantomjs实现下载网页

2019-11-20 12:34:01
字体:
来源:转载
供稿:网友

功能其实很简单:通过 phantomjs.exe 采集 url 加载的所有资源,再通过子进程的方式启动 nodejs 下载这些资源;对于 css 资源,还会匹配 css 内容,下载其中 url() 引用的资源。

当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下

 首先当然是下载 nodejs 和 phantomjs

下面是 phantomjs.exe 执行的 down.js

/**
 * down.js - PhantomJS entry script.
 *
 * Opens the URL given on the command line, records the URL of every resource
 * the page starts receiving, then spawns `node --harmony downHtml.js <urls>`
 * (urls joined with ',') and relays the child's output until it exits.
 *
 * Usage: phantomjs down.js <some URL>
 */
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
  console.log('Usage: down.js <some URL>');
  phantom.exit(1);
} else {
  var urls = [];
  page.address = system.args[1];

  // Collect each resource once, when its response starts arriving.
  page.onResourceReceived = function (res) {
    // Only http(s) resources can be downloaded by the Node side. Skipping
    // everything else also keeps `data:` URIs (which contain commas) from
    // corrupting the comma-joined argument below.
    if (res.stage === 'start' && /^https?:\/\//i.test(res.url)) {
      urls.push(res.url);
    }
  };

  page.open(page.address, function (status) {
    if (status !== 'success') {
      console.log('FAIL to load the address');
      phantom.exit(1);
      return;
    }

    console.log('down resource ' + urls.length + ' urls.');

    // NOTE(review): a URL that itself contains ',' would still be split
    // apart by downHtml.js; the separator is part of the contract between
    // the two scripts and is kept unchanged here.
    var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

    child.stdout.on('data', function (data) {
      console.log(data);
    });
    child.stderr.on('data', function (data) {
      console.log(data);
    });
    // Keep PhantomJS alive until the downloader has finished.
    child.on('exit', function (code) {
      phantom.exit();
    });
  });
}

下面是对应的node运行的 downHtml.js

"use strict";

/**
 * downHtml.js - Node.js downloader driven by down.js.
 *
 * Takes a comma-separated list of URLs as the first command-line argument
 * and saves each one under `<cwd>/<hostname>/<pathname>`. Resources that look
 * like CSS are additionally scanned for `url(...)` references, and those
 * referenced files are downloaded as well.
 *
 * Usage: node --harmony downHtml.js <url1,url2,...>
 */
const fs = require('fs');
const http = require('http');
const https = require('https');
const path = require('path');
const r_url = require('url');

const CWD = process.cwd();

// Directories already known to exist, so repeated downloads into the same
// folder skip the filesystem check.
const dirCache = Object.create(null);

// URLs that have already been scheduled for download.
const isDownMap = Object.create(null);

/**
 * Ensure that directory `pathStr` exists, creating missing parents first.
 * Failures are logged and the callback is still invoked (best-effort), so a
 * later write error reports the real problem.
 *
 * @param {string} pathStr - directory path to create.
 * @param {Function} callback - called with no arguments when done.
 */
function makedir(pathStr, callback) {
  if (dirCache[pathStr] === 1) {
    callback();
    return;
  }

  fs.stat(pathStr, function (statErr) {
    if (!statErr) {
      dirCache[pathStr] = 1;
      callback();
      return;
    }

    const parent = path.dirname(pathStr);
    if (parent === pathStr) {
      // Reached the top of the path without finding an existing directory;
      // stop here instead of recursing forever.
      callback();
      return;
    }

    makedir(parent, function () {
      fs.mkdir(pathStr, function (mkdirErr) {
        // EEXIST just means a concurrent download created it first.
        if (mkdirErr && mkdirErr.code !== 'EEXIST') {
          console.error('mkdir failed: ' + pathStr + ' (' + mkdirErr.message + ')');
        } else {
          dirCache[pathStr] = 1;
        }
        callback();
      });
    });
  });
}

/**
 * Pick the client module matching the URL's protocol.
 *
 * @param {string} target - absolute URL.
 * @returns {Object|null} `http`, `https`, or null for any other protocol.
 */
function clientFor(target) {
  const protocol = r_url.parse(target).protocol;
  if (protocol === 'http:') {
    return http;
  }
  if (protocol === 'https:') {
    return https;
  }
  return null;
}

/**
 * Map a URL to the local file it should be saved as.
 * A pathname that is empty or ends with '/' is stored as `index.html`.
 *
 * @param {string} target - absolute URL.
 * @returns {string|null} local path, or null when the URL has no hostname or
 *   the resulting path would fall outside the working directory.
 */
function localPathFor(target) {
  const parsed = r_url.parse(target);
  if (!parsed.hostname) {
    return null;
  }

  let pathname = parsed.pathname || '/';
  if (pathname.charAt(pathname.length - 1) === '/') {
    pathname += 'index.html';
  }

  const filepath = path.join(CWD, parsed.hostname, pathname);

  // URLs come from remote pages; never write outside the working directory.
  const relative = path.relative(CWD, filepath);
  if (relative === '..' || relative.indexOf('..' + path.sep) === 0 || path.isAbsolute(relative)) {
    return null;
  }
  return filepath;
}

/**
 * Issue a GET request and log (rather than crash on) request errors.
 *
 * @param {string} target - absolute http(s) URL.
 * @param {Function} onResponse - called with the response stream.
 */
function getResource(target, onResponse) {
  const client = clientFor(target);
  if (!client) {
    return;
  }
  client.get(target, onResponse).on('error', function (err) {
    console.error('request failed: ' + target + ' (' + err.message + ')');
  });
}

/**
 * Download `target` into its local path.
 *
 * @param {string} target - absolute URL; non-http(s) URLs are skipped.
 * @param {Function} [onResponse] - optional hook called with the response
 *   before it is piped to disk.
 */
function download(target, onResponse) {
  if (!clientFor(target)) {
    // e.g. `data:` URIs embedded in CSS - nothing to fetch.
    return;
  }

  const filepath = localPathFor(target);
  if (!filepath) {
    console.error('skip url with no usable local path: ' + target);
    return;
  }

  makedir(path.dirname(filepath), function () {
    getResource(target, function (res) {
      if (onResponse) {
        onResponse(res);
      }
      const out = fs.createWriteStream(filepath);
      out.on('error', function (err) {
        console.error('write failed: ' + filepath + ' (' + err.message + ')');
        // Drain the response so the socket is released.
        res.unpipe(out);
        res.resume();
      });
      res.pipe(out);
    });
  });
}

/**
 * Extract the raw targets of every `url(...)` token in a stylesheet.
 *
 * @param {string} cssText - stylesheet source.
 * @returns {string[]} referenced URLs, quotes stripped, in document order.
 */
function extractCssUrls(cssText) {
  // Fresh regex per call: a shared /g regex would carry lastIndex state.
  const pattern = /url\(\s*(['"]?)(.*?)\1\s*\)/g;
  const found = [];
  let m = pattern.exec(cssText);
  while (m !== null) {
    if (m[2]) {
      found.push(m[2]);
    }
    m = pattern.exec(cssText);
  }
  return found;
}

/**
 * Fetch the stylesheet at `URL` and download every resource it references
 * through `url(...)`, resolved relative to the stylesheet's own URL.
 *
 * @param {string} URL - absolute URL of the CSS file.
 */
const downImgFromCss = function (URL) {
  getResource(URL, function (res) {
    let body = '';
    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      body += chunk;
    });
    res.on('end', function () {
      extractCssUrls(body).forEach(function (ref) {
        const imgUrl = r_url.resolve(URL, ref);
        if (isDownMap[imgUrl]) {
          return;
        }
        isDownMap[imgUrl] = 1;
        download(imgUrl);
      });
    });
  });
};

if (!process.argv[2]) {
  console.error('Usage: node downHtml.js <url1,url2,...>');
  process.exit(1);
}

const URLS = process.argv[2].split(',');

// Download resources
URLS.forEach(function (resourceUrl) {
  if (!resourceUrl || isDownMap[resourceUrl]) {
    return;
  }
  isDownMap[resourceUrl] = 1;

  download(resourceUrl, function (res) {
    const contentType = res.headers['content-type'] || '';
    if (resourceUrl.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
      console.log('down images form css file:' + resourceUrl + '.');
      downImgFromCss(resourceUrl);
    }
  });
});

将 down.js 和 downHtml.js 放在同一个文件夹下,然后通过下列 cmd 命令运行:

D:/phantomjs-2.0.0-windows/bin/phantomjs.exe down.js http://www.youku.com/

以上所述就是本文的全部内容了,希望大家能够喜欢。

发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表