Latest news: Welcome to the puzzle paradise for programmers! Here, well-designed puzzles await you. From code logic puzzles to algorithmic challenges, each level is built around a programmer's expertise and skills. Whether you're a novice programmer or an experienced tech guru, you'll find challenges of your own on this site. In the process of solving puzzles, you can not only exercise your thinking skills but also deepen your understanding and application of programming knowledge. Come start this puzzle journey full of wisdom and challenges, compete with many other programmers, and show off your programming wisdom! Translated with DeepL.com (free version)

javascript - nodejs web scraper for password protected website - Stack Overflow

matteradmin · 5 PV · 0 comments

I am trying to scrape a website using Node.js, and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site with a form that requires a username and password, I only get the HTML of the authentication page (that is, the HTML you would see if you clicked 'view page source' on the authentication page itself). I am able to get the desired HTML using curl:

curl -d "username=myuser&password=mypw&submit=Login" URL

Here is my code...

var express = require('express');
var fs = require('fs'); // access to the file system
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape
 *
 * Fetches `url` with a plain GET, reads the text of every `.span8 b` element
 * from the returned HTML and writes the collected fields to output.json in
 * the current working directory. The HTTP response is always the fixed
 * string 'Check your console!'; progress and errors are logged to the console.
 */
app.get('/scrape', function(req, res){
    // Declared locally so the handler does not create an implicit global.
    var url = 'myURL';

    request(url, function(error, response, html){
        // Bail out early on a failed request: there is no HTML to parse, and
        // writing the file here would overwrite earlier output with nothing.
        if(error){
            console.log("Error occurred: " + error);
            res.send('Check your console!');
            return;
        }

        // cheerio gives us a jQuery-like API over the returned HTML.
        var $ = cheerio.load(html);
        var json = { title : "", release : "", rating : ""};

        // Iterating purely for side effects, so `.each` rather than `.filter`.
        // Every match overwrites the fields, so the last matching element wins.
        $('.span8 b').each(function(){
            var data = $(this);
            json.title = data.first().text();
            json.release = data.text();
        });

        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            // Only report success when the write actually succeeded.
            if(err){
                console.log("Error writing output.json: " + err);
                return;
            }
            console.log('File successfully written! - Check your project directory for the output.json file');
        });

        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;

I have tried the following...

// NOTE(review): this is not valid JavaScript. `key: value` pairs cannot appear
// in an argument list, so the statement fails to parse and cannot pass
// credentials to the request module.
var request = require('request',
    username:'myuser',
    password:'mypw');

This just returns the authentication page's HTML

// NOTE(review): myuser, mypw, Login and myURL are bare identifiers here, not
// string literals, and the call is missing its closing `)`.
// Presumably no HTTP method is set, so this would still be sent as a GET;
// confirm against the request library documentation.
request({form: {username:myuser, password:mypw, submit:Login}, url: myURL}, function(error, response, html){
    ...
    ...
    ...
}

This also just returns the authentication page's HTML

So my question is how do I achieve this using nodejs?

I am trying to scrape a website using Node.js, and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site with a form that requires a username and password, I only get the HTML of the authentication page (that is, the HTML you would see if you clicked 'view page source' on the authentication page itself). I am able to get the desired HTML using curl:

curl -d "username=myuser&password=mypw&submit=Login" URL

Here is my code...

var express = require('express');
var fs = require('fs'); // access to the file system
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape
 *
 * Fetches `url` with a plain GET, reads the text of every `.span8 b` element
 * from the returned HTML and writes the collected fields to output.json in
 * the current working directory. The HTTP response is always the fixed
 * string 'Check your console!'; progress and errors are logged to the console.
 */
app.get('/scrape', function(req, res){
    // Declared locally so the handler does not create an implicit global.
    var url = 'myURL';

    request(url, function(error, response, html){
        // Bail out early on a failed request: there is no HTML to parse, and
        // writing the file here would overwrite earlier output with nothing.
        if(error){
            console.log("Error occurred: " + error);
            res.send('Check your console!');
            return;
        }

        // cheerio gives us a jQuery-like API over the returned HTML.
        var $ = cheerio.load(html);
        var json = { title : "", release : "", rating : ""};

        // Iterating purely for side effects, so `.each` rather than `.filter`.
        // Every match overwrites the fields, so the last matching element wins.
        $('.span8 b').each(function(){
            var data = $(this);
            json.title = data.first().text();
            json.release = data.text();
        });

        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            // Only report success when the write actually succeeded.
            if(err){
                console.log("Error writing output.json: " + err);
                return;
            }
            console.log('File successfully written! - Check your project directory for the output.json file');
        });

        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;

I have tried the following...

// NOTE(review): this is not valid JavaScript. `key: value` pairs cannot appear
// in an argument list, so the statement fails to parse and cannot pass
// credentials to the request module.
var request = require('request',
    username:'myuser',
    password:'mypw');

This just returns the authentication page's HTML

// NOTE(review): myuser, mypw, Login and myURL are bare identifiers here, not
// string literals, and the call is missing its closing `)`.
// Presumably no HTTP method is set, so this would still be sent as a GET;
// confirm against the request library documentation.
request({form: {username:myuser, password:mypw, submit:Login}, url: myURL}, function(error, response, html){
    ...
    ...
    ...
}

This also just returns the authentication page's HTML

So my question is how do I achieve this using nodejs?

Share | Improve this question | edited Jul 23, 2024 at 10:39 by VLAZ (29.2k reputation; 9 gold badges, 63 silver badges, 85 bronze badges) | asked Dec 23, 2014 at 14:03 by gthb7 (87 reputation; 2 silver badges, 5 bronze badges)
Add a comment  | 

1 Answer 1

Reset to default 3

You shouldn't use .get but .post, and pass the POST parameters (username and password) in your call:

// Submit the login form as a POST. The `form` option URL-encodes each field
// (so special characters in the credentials are escaped) and sets the
// application/x-www-form-urlencoded content type.
request.post({
  url:  url,
  form: { username: 'myuser', password: 'mypw', submit: 'Login' }
}, function(error, response, html){
    // Without this check a failed request would hand `undefined` to cheerio.
    if (error) {
        console.log("Error occurred: " + error);
        return;
    }
    //do your parsing...
    var $ = cheerio.load(html);
});
Post a comment

comment list (0)

  1. No comments so far