I installed this and got it running (apparently).
The first time I ran it, I didn't get any feedback, logging, or console messages for so long that I quit the process with Control+C.
const scrape = require('website-scraper');
const phantomHtml = require('website-scraper-phantom');
// URL prefix used by the urlFilter option: only URLs that begin with it pass.
const siteOrigin = 'http://example.com';

// User-Agent string sent in the request headers.
const browserUserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36';

/**
 * Returns true when the URL begins with the site prefix.
 * @param {string} url - candidate URL (assumed to be a string; confirm against the library docs)
 * @returns {boolean}
 */
function isOnSite(url) {
  return url.startsWith(siteOrigin);
}

/**
 * Prints one line for a resource reported as saved.
 * The resource is interpolated into the message, so its string form is what gets printed.
 */
function logSaved(resource) {
  console.log(`Resource ${resource} was saved to fs`);
}

/**
 * Prints one line for a resource reported as failed, together with the error.
 */
function logFailed(resource, err) {
  console.log(`Resource ${resource} was not saved because of ${err}`);
}

// Options handed to website-scraper. Exact option semantics are defined by the
// library; NOTE(review): confirm `recursive`/`maxDepth` behavior in its docs.
const scrapeOptions = {
  urls: ['http://example.com/'],
  directory: '/Users/a/Sites/example/website-scraper/site',
  recursive: true,
  request: {
    headers: {
      'User-Agent': browserUserAgent
    }
  },
  prettifyUrls: true,
  urlFilter: isOnSite,
  maxDepth: 20,
  filenameGenerator: 'bySiteStructure',
  onResourceSaved: logSaved,
  onResourceError: logFailed,
  // Responses are passed to the website-scraper-phantom handler.
  httpResponseHandler: phantomHtml
};

// Print the final result; a rejection is printed through console.log as well.
scrape(scrapeOptions)
  .then((result) => console.log(result))
  .catch((error) => console.log(error));
$ node with-phantom.js
Resource { url: "http://example.com/", filename: "index.html", depth: 0 } was saved to fs
Resource { url: "http://example.com/content/images/global/logo--white.svg", filename: "content/images/global/logo--white.svg", depth: 1 } was saved to fs
Resource { url: "http://example.com/images/default-source/new/erica-neher.jpg", filename: "images/default-source/new/erica-neher.jpg", depth: 1 } was saved to fs
Resource { url: "http://example.com/images/default-source/new/mgelman.jpg?sfvrsn=2", filename: "images/default-source/new/mgelman.jpg", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-621522020/header-holistic.js", filename: "content/scripts/v-621522020/header-holistic.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/resource.axd?07IkLtzIKOUyqM8H2sHUPv81&t=636178367520", filename: "resource.axd", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/libs/modernizr-2.6.2.js", filename: "content/scripts/libs/modernizr-2.6.2.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/hotjar_include.js", filename: "content/scripts/hotjar_include.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/resource.axd?iinqIDoUO_VbuPa8B9csLfpVDCXSHdCKAoN-OP_pDFQ_0&t=636178367740", filename: "resource.axd", depth: 1 } was saved to fs
Resource { url: "http://example.com/resource.axd?PCLcor6BQmkX8d4Ln6gfp6zwZkEXw_F40hkBMSazY5i0&t=636178367740", filename: "resource.axd", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/hubspot.js", filename: "content/scripts/hubspot.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-603870980/ga.js", filename: "content/scripts/v-603870980/ga.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-625081030/holistic-header-footer.js", filename: "content/scripts/v-625081030/holistic-header-footer.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-603870980/global.js", filename: "content/scripts/v-603870980/global.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/vimeo.ga.min.js", filename: "content/scripts/vimeo.ga.min.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-6216491424941450/main.js", filename: "content/scripts/v-6216491424941450/main.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-621522020/form-validations.js", filename: "content/scripts/v-621522020/form-validations.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-6283065698575657/holistic.js", filename: "content/scripts/v-6283065698575657/holistic.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/resource.axd?zdee0I2COZBAoCu6mO5ApQ6PJv2uymlLGla6EwEYYVYUOCk4hiLpCNnwd89UU1&t=636178367740", filename: "resource.axd", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/sharethis.js", filename: "content/scripts/sharethis.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/scripts/v-6225564540/plugins.js", filename: "content/scripts/v-6225564540/plugins.js", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/styles/icon-fonts.css", filename: "content/styles/icon-fonts.css", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/styles/v-6250897590/holistic-header-footer.min.css", filename: "content/styles/v-6250897590/holistic-header-footer.min.css", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/styles/v-6256985850/holistic.min.css", filename: "content/styles/v-6256985850/holistic.min.css", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/styles/example.min.css", filename: "content/styles/example.min.css", depth: 1 } was saved to fs
Resource { url: "http://example.com/content/images/apple-touch-icon-precomposed.png", filename: "content/images/apple-touch-icon-precomposed.png", depth: 1 } was saved to fs
Resource { url: "http://example.com/favicon.ico", filename: "favicon.ico", depth: 1 } was saved to fs
Why is it taking so long to crawl/scrape/download the webpages and assets of the site?