首页 > 网站 > WEB开发 > 正文

nodejs 下载网页及相关资源文件

2024-04-27 14:10:01
字体:
来源:转载
供稿:网友

nodejs 下载网页及相关资源文件

功能其实很简单:先通过 phantomjs.exe 采集 url 加载的所有资源,再以子进程的方式启动 nodejs 下载这些资源;对于 CSS 资源,还会匹配 css 内容,下载其中 url 引用的资源

当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下

首先当然是下载 nodejs 和phantomjs

下面是phantomjs.exe 执行的down.js

/**
 * down.js - PhantomJS entry script.
 *
 * Opens the URL given on the command line, records the URL of every resource
 * the page starts receiving, then spawns `node --harmony downHtml.js` with the
 * collected URLs (comma-joined, as a single argument) so that Node downloads
 * them to disk.
 *
 * Usage: phantomjs down.js <some URL>
 *
 * NOTE: this file runs inside PhantomJS, so it deliberately sticks to ES5
 * syntax (`var`, function expressions).
 */
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
    // system.args[0] is the script name; no URL was supplied.
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    page.address = system.args[1];

    // Each resource is reported in several stages; record it once, at 'start'.
    page.onResourceReceived = function (res) {
        if (res.stage === 'start') {
            urls.push(res.url);
        }
    };

    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }

        console.log('down resource ' + urls.length + ' urls.');

        // Hand the collected URLs to the Node downloader as one argument.
        var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

        // Relay the downloader's output to the PhantomJS console.
        child.stdout.on('data', function (data) {
            console.log(data);
        });
        child.stderr.on('data', function (data) {
            console.log(data);
        });

        // Keep PhantomJS alive until the downloader has finished.
        child.on('exit', function () {
            phantom.exit();
        });
    });
}

下面是对应的node运行的downHtml.js

"use strict";

/**
 * downHtml.js - Node downloader.
 *
 * Receives a comma-separated list of resource URLs in process.argv[2] and
 * saves each one under <cwd>/<hostname>/<pathname>. For CSS resources the
 * stylesheet is additionally fetched as text and every url(...) reference in
 * it is downloaded as well.
 *
 * Usage: node --harmony downHtml.js <url1,url2,...>
 *
 * Known limitation: the argument is split on ','. A URL that itself contains
 * a comma is broken into pieces; pieces that are not http(s) URLs are skipped.
 */
var fs = require('fs');
var http = require('http');
var https = require('https');
var path = require('path');
var r_url = require('url');

var CWD = process.cwd();

// Directories already known to exist, so they are not checked again.
var dirCache = {};

// URLs referenced from CSS that have already been queued for download.
var isDownMap = {};

/**
 * Ensures that the directory `pathStr` exists, creating missing parents
 * first, then invokes `callback` (with no arguments).
 *
 * @param {string} pathStr - Directory path to create.
 * @param {Function} callback - Called once the directory has been handled.
 */
function makedir(pathStr, callback) {
    if (dirCache[pathStr] === 1) {
        callback();
        return;
    }

    fs.stat(pathStr, function (statErr) {
        if (!statErr) {
            dirCache[pathStr] = 1;
            callback();
            return;
        }

        var parent = path.dirname(pathStr);
        if (parent === pathStr) {
            // Reached the filesystem root and it cannot be stat'ed; stop
            // recursing instead of looping forever.
            console.error('cannot access ' + pathStr + ': ' + statErr.message);
            callback();
            return;
        }

        makedir(parent, function () {
            fs.mkdir(pathStr, function (mkdirErr) {
                // EEXIST means a concurrent download created it first.
                if (!mkdirErr || mkdirErr.code === 'EEXIST') {
                    dirCache[pathStr] = 1;
                } else {
                    console.error('mkdir ' + pathStr + ' failed: ' + mkdirErr.message);
                }
                callback();
            });
        });
    });
}

/** Returns true when `resourceUrl` uses the http: or https: protocol. */
function isHttpUrl(resourceUrl) {
    var protocol = r_url.parse(resourceUrl).protocol;
    return protocol === 'http:' || protocol === 'https:';
}

/** Picks the core module matching the URL's protocol (http or https). */
function clientFor(resourceUrl) {
    return r_url.parse(resourceUrl).protocol === 'https:' ? https : http;
}

/**
 * Maps a URL to the local file it is saved as. Paths that are empty or end
 * with '/' get 'index.html' appended so they never name a directory.
 */
function localPathFor(resourceUrl) {
    var uo = r_url.parse(resourceUrl);
    var pathname = uo.pathname || '/';
    if (pathname.charAt(pathname.length - 1) === '/') {
        pathname += 'index.html';
    }
    return CWD + '/' + uo.hostname + pathname;
}

/**
 * Downloads `resourceUrl` into `filepath`, creating the target directory
 * first. `onResponse`, when given, is called with the response before the
 * body is piped to disk. Failures are logged instead of crashing the process.
 */
function fetchToFile(resourceUrl, filepath, onResponse) {
    makedir(path.dirname(filepath), function () {
        clientFor(resourceUrl).get(resourceUrl, function (res) {
            if (onResponse) {
                onResponse(res);
            }
            var out = fs.createWriteStream(filepath);
            out.on('error', function (err) {
                console.error('write ' + filepath + ' failed: ' + err.message);
            });
            res.pipe(out);
        }).on('error', function (err) {
            console.error('request ' + resourceUrl + ' failed: ' + err.message);
        });
    });
}

/**
 * Fetches the stylesheet at `cssUrl` as text and downloads every resource it
 * references through url(...), resolving relative references against
 * `cssUrl`. Each referenced URL is downloaded at most once per run.
 *
 * @param {string} cssUrl - Absolute URL of the stylesheet.
 */
var downImgFromCss = function (cssUrl) {
    clientFor(cssUrl).get(cssUrl, function (res) {
        var body = '';
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            // Group 1: optional quote, group 2: the reference itself.
            // A fresh literal per call avoids shared lastIndex state.
            var pattern = /url\(\s*(['"]?)(.*?)\1\s*\)/g;
            var m;
            while ((m = pattern.exec(body)) !== null) {
                var ref = m[2].trim();
                if (!ref) {
                    continue;
                }
                var imgUrl = r_url.resolve(cssUrl, ref);
                // Skip data: URIs and the like, and anything already queued.
                if (!isHttpUrl(imgUrl) || isDownMap[imgUrl]) {
                    continue;
                }
                isDownMap[imgUrl] = 1;
                fetchToFile(imgUrl, localPathFor(imgUrl));
            }
        });
    }).on('error', function (err) {
        console.error('request ' + cssUrl + ' failed: ' + err.message);
    });
};

if (!process.argv[2]) {
    console.error('Usage: node downHtml.js <url1,url2,...>');
    process.exit(1);
}

var URLS = process.argv[2].split(',');

// Download every resource; CSS files are additionally scanned for url(...).
URLS.forEach(function (resourceUrl) {
    if (!isHttpUrl(resourceUrl)) {
        return;
    }
    fetchToFile(resourceUrl, localPathFor(resourceUrl), function (res) {
        var contentType = res.headers['content-type'] || '';
        if (resourceUrl.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
            console.log('down images form css file:' + resourceUrl + '.');
            downImgFromCss(resourceUrl);
        }
    });
});

将 down.js 和 downHtml.js 放在同一个文件夹下,通过下列 cmd 命令运行

D:/phantomjs-2.0.0-windows/bin/phantomjs.exe down.js http://www.youku.com/


发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表