This tutorial describes how CasperJS can be used to scrape/test multiple pages at a time. CasperJS is a navigation scripting and testing utility. Its execution is sequential: each navigation step executes after the previous one. For a small number of steps, this behavior is perfectly fine. But as the number of steps increases, the amount of time consumed can become very large. This problem can be solved by introducing parallelism in the execution of navigation steps.

Setup guide for CasperJS: CasperJS Official setup guide

Mission of This Tutorial #

Main Problem: #

Scrape the search results from the first page of Google for a given keyword.
This problem statement can be divided into multiple parts:

  • Scrape results for a single keyword.
  • Scrape results for multiple keywords in sequential manner.
  • Introduce parallelism to the previous step.
  • Address issues, if any.

Getting Started #

Let's begin with scraping results for a single keyword.

Google Scraping : For a single keyword #

A simple script that opens the Google search results for the keyword google and then scrapes the links from the first page looks like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// Casper instance that drives every navigation step of this script.
var casper = require('casper').create();
// Scraped links, keyed by the keyword they were found for.
var results = {};
//It is important that you set the user-agent. Google uses it to render their pages.
var userAgentString = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64)",
    " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"
].join("");
casper.userAgent(userAgentString);
// Collects the href attribute of every anchor matched by the 'h3.r a'
// selector in the current document.
// Returns an array of href values, in the order querySelectorAll yields
// them; the array is empty when nothing matches.
function getLinks() {
    var anchors = document.querySelectorAll('h3.r a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Builds the navigation step that scrapes the currently loaded results page.
// query: the keyword the page was opened for; used as the key in `results`.
// Returns a function intended for casper.then(). When invoked with `this`
// set to the casper instance, it evaluates getLinks in the page, echoes what
// it found and stores the links under results[query].
function addScrapedLinksToResults(query) {
    return function() {
        // evaluate() can come back null (e.g. when the in-page function fails),
        // so fall back to an empty list instead of crashing on links.join().
        var links = this.evaluate(getLinks) || [];
        this.echo(JSON.stringify(query));
        this.echo(query + " " + links.join(","));
        results[query] = links;
    };
}
// Scrape the first results page for a single keyword and print the links.
var query = 'google';
//We open the results URL directly instead of submitting the search form.
//Form submission is now based on instant search and, as there is no event for
//html re-rendering, it would be difficult to know whether the results for the
//new keyword have rendered or not.
//encodeURIComponent keeps keywords containing spaces, '&', '#', etc. from
//corrupting the query string.
var url = 'https://www.google.co.in/search?q=' + encodeURIComponent(query);
casper.start();
casper.thenOpen(url);
casper.then(addScrapedLinksToResults(query));
casper.run(function() {
    // echo results in some pretty fashion
    this.echo('Done');
    for (var key in results) {
        this.echo(results[key].length + ' links found for ' + key + ':');
        this.echo(' - ' + results[key].join('\n - '));
    }
    this.exit();
});

Save this file as simple-google-scrape.js in the bin folder of CasperJS. Run it using the following command.

1
casperjs simple-google-scrape.js

Console will display all the scraped links from the first page.

Google Scraping : For Multiple keywords #

By simply adding more casper.thenOpen(url) calls (each followed by a scraping casper.then() step) before casper.run(), multiple result pages can be scraped.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
// Casper instance that drives every navigation step of this script.
var casper = require('casper').create();
// Scraped links, keyed by the keyword they were found for.
var results = {};
// The user-agent is sent with every request made by this casper instance.
var userAgentString = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64)",
    " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"
].join("");
casper.userAgent(userAgentString);
// Collects the href attribute of every anchor matched by the 'h3.r a'
// selector in the current document.
// Returns an array of href values, in the order querySelectorAll yields
// them; the array is empty when nothing matches.
function getLinks() {
    var anchors = document.querySelectorAll('h3.r a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Builds the navigation step that scrapes the currently loaded results page.
// query: the keyword the page was opened for; used as the key in `results`.
// Returns a function intended for casper.then(). When invoked with `this`
// set to the casper instance, it evaluates getLinks in the page, echoes what
// it found and stores the links under results[query].
function addScrapedLinksToResults(query) {
    return function() {
        // evaluate() can come back null (e.g. when the in-page function fails),
        // so fall back to an empty list instead of crashing on links.join().
        var links = this.evaluate(getLinks) || [];
        this.echo(JSON.stringify(query));
        this.echo(query + " " + links.join(","));
        results[query] = links;
    };
}
// Scrape the first results page for two keywords, one after the other, and
// print the links found for each.
// encodeURIComponent keeps keywords containing spaces, '&', '#', etc. from
// corrupting the query string.
var query = 'google';
var url = 'https://www.google.co.in/search?q=' + encodeURIComponent(query);
casper.start();
casper.thenOpen(url);
casper.then(addScrapedLinksToResults(query));
// Second keyword: reusing `query` is safe because addScrapedLinksToResults
// received the previous value as an argument when the step was queued.
query = 'facebook';
url = 'https://www.google.co.in/search?q=' + encodeURIComponent(query);
casper.thenOpen(url);
casper.then(addScrapedLinksToResults(query));
casper.run(function() {
    // echo results in some pretty fashion
    this.echo('Done');
    for (var key in results) {
        this.echo(results[key].length + ' links found for ' + key + ':');
        this.echo(' - ' + results[key].join('\n - '));
    }
    this.exit();
});

Google Scraping : For Multiple keywords, using array #

Each thenOpen and then call adds a navigation step to the execution stack of CasperJS. The code for scraping results of multiple keywords can easily be adapted to use an array of keywords. For more information on navigation steps: What Does ‘Then’ Really Mean in CasperJS

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// Casper instance that drives every navigation step of this script.
var casper = require('casper').create();
// Scraped links, keyed by the keyword they were found for.
var results = {};
// The user-agent is sent with every request made by this casper instance.
var userAgentString = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64)",
    " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36"
].join("");
casper.userAgent(userAgentString);
// Collects the href attribute of every anchor matched by the 'h3.r a'
// selector in the current document.
// Returns an array of href values, in the order querySelectorAll yields
// them; the array is empty when nothing matches.
function getLinks() {
    var anchors = document.querySelectorAll('h3.r a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Builds the navigation step that scrapes the currently loaded results page.
// query: the keyword the page was opened for; used as the key in `results`.
// Returns a function intended for casper.then(). When invoked with `this`
// set to the casper instance, it evaluates getLinks in the page, echoes what
// it found and stores the links under results[query].
function addScrapedLinksToResults(query) {
    return function() {
        // evaluate() can come back null (e.g. when the in-page function fails),
        // so fall back to an empty list instead of crashing on links.join().
        var links = this.evaluate(getLinks) || [];
        this.echo(JSON.stringify(query));
        this.echo(query + " " + links.join(","));
        results[query] = links;
    };
}
// Scrape the first results page for every keyword in the list, sequentially,
// and print the links found for each.
var query = ['google','facebook','twitter','pinterest','whatsapp','skype'];
casper.start();
for (var i = 0; i < query.length; i++) {
    // encodeURIComponent keeps keywords containing spaces, '&', '#', etc.
    // from corrupting the query string.
    var url = 'https://www.google.co.in/search?q=' + encodeURIComponent(query[i]);
    // Each iteration queues two steps: open the page, then scrape it.
    casper.thenOpen(url);
    casper.then(addScrapedLinksToResults(query[i]));
}
casper.run(function() {
    // echo results in some pretty fashion
    this.echo('Done');
    for (var key in results) {
        this.echo(results[key].length + ' links found for ' + key + ':');
        this.echo(' - ' + results[key].join('\n - '));
    }
    this.exit();
});

This code produces proper and complete results. Based on observations made in the previous run, the following conclusions could be made:

  • Keyword to URL mapping is not proper.
  • For some keywords, no results were scraped: the then() step (the scraping step) executed before the page was loaded. The mapping problem above could also be a result of this issue.
  • And apparently only one instance worked: Executing exit() on one casper instance caused all other instances to exit.

Solutions: #

To deal with the first problem, jQuery was injected into every result page and, with the help of $.ready, each page was scraped at just the right time. This solution eliminated both the mapping and the zero-results problems.
For the second problem, a status was added to each casper instance with casper.completed = false. The status was then set to true once the instance associated with it had executed all its steps. All the statuses were checked before calling exit(), and exit() was issued only if the statuses of all the casper instances were set to true.