Re-using the crawler instead of initializing it after each URL?
My scraper uses BullMQ, which retrieves jobs (URLs) from the job queue and runs them with CheerioCrawler.
Is there any way to initialize the crawler once and keep using it? I assume this would also consume fewer resources and improve performance.
If there are any best practices that I have not implemented, I would love to hear about them.
// worker.ts
import { Worker } from 'bullmq';
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';
import dotenv from 'dotenv';

dotenv.config();

console.log('REDIS_URL_JOB_QUEUE', process.env.REDIS_URL_JOB_QUEUE);

// Connect to the Redis instance backing the BullMQ job queue
const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
    maxRetriesPerRequest: null,
});

const proxy = process.env.PROXY_URL || '';
console.log('proxy', proxy);

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [proxy],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

const scraperWorker = new Worker(
    'scraper',
    async (job) => {
        const url: string = job.data.url;
        try {
            // await crawler.addRequests([url]);
            await crawler.run([
                {
                    label: 'PRODUCT',
                    url,
                },
            ]);
            // If everything went well, return a result
            return { result: 'success' };
        } catch (error) {
            // If something went wrong, rethrow so BullMQ marks the job as failed
            console.error(`Scrape of ${url} failed with error ${(error as Error).message}`);
            throw error;
        }
    },
    {
        connection,
        limiter: {
            max: 2,         // Max number of jobs to handle
            duration: 5000, // per duration in milliseconds (5,000 ms = 5 seconds)
        },
    },
);

scraperWorker.on('completed', (job, result) => {
    console.log(`Job ${job.id} completed with result ${result.result}`);
});

scraperWorker.on('failed', (job, err) => {
    if (!job) return console.log('Job not found');
    console.log(`Job ${job.id} failed with error ${err.message}`);
});

2 Replies
other-emeraldOP•2y ago
After each URL is scraped I see a log: Initializing the crawler.
deep-jade•2y ago
Use the keepAlive option. [1] It allows you to keep the crawler alive even if the RequestQueue gets empty.
[1] https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions#keepAlive
POC:
import {
    Dataset,
    CheerioCrawler,
    ProxyConfiguration,
    log,
    LogLevel,
} from 'crawlee';

log.setLevel(LogLevel.DEBUG);

// Assumes PROXY_URL is set in the environment, as in the worker above
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [process.env.PROXY_URL || ''],
});

const crawler = new CheerioCrawler({
    keepAlive: true, // keep the crawler running even when the RequestQueue is empty
    proxyConfiguration,
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);
        const title = $('title').text();
        const h1texts: { text: string }[] = [];
        $('h1').each((index, el) => {
            h1texts.push({
                text: $(el).text(),
            });
        });
        await Dataset.pushData({
            url: request.url,
            title,
            h1texts,
        });
    },
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed!`);
    },
});

/**
 * Returns a random integer between min (inclusive) and max (inclusive).
 * The value is no lower than min (or the next integer greater than min
 * if min isn't an integer) and no greater than max (or the next integer
 * lower than max if max isn't an integer).
 * Using Math.round() will give you a non-uniform distribution!
 */
function getRandomInt(min: number, max: number) {
    min = Math.ceil(min);
    max = Math.floor(max);
    return Math.floor(Math.random() * (max - min + 1)) + min;
}

async function scraperWorker() {
    const url: string = `https://example.com/${getRandomInt(1, 100)}`;
    await new Promise(resolve => setTimeout(resolve, 5000));
    log.info(`scraperWorker of ${url}`);
    await crawler.addRequests([url]);
}

// Start the crawler once; with keepAlive it stays up waiting for new requests
crawler.run();

(function loop() {
    Promise.resolve()
        .then(async () => await scraperWorker())
        .catch(e => log.error(e))
        .then(() => process.nextTick(loop));
})();
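Putting the two together, here is a minimal sketch (an assumption-laden illustration, not an official recipe) of the original BullMQ worker rewritten so that CheerioCrawler is initialized once with keepAlive and fed via addRequests. It assumes the same router, REDIS_URL_JOB_QUEUE, and PROXY_URL as the worker.ts above; the file name worker-keepalive.ts is made up. Note the trade-off: the BullMQ job resolves as soon as the request is enqueued, so per-URL success or failure would have to be reported from the crawler's own handlers instead.

// worker-keepalive.ts (hypothetical file name): a minimal sketch, not a drop-in replacement
import { Worker } from 'bullmq';
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';
import dotenv from 'dotenv';

dotenv.config();

const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
    maxRetriesPerRequest: null,
});

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [process.env.PROXY_URL || ''],
});

// Initialized once at startup; keepAlive stops the crawler from shutting
// down (and re-initializing) when its internal RequestQueue runs dry.
const crawler = new CheerioCrawler({
    keepAlive: true,
    proxyConfiguration,
    requestHandler: router,
});

// Start the crawler once and let it idle between jobs. run() only resolves
// when the crawler is torn down, so it is deliberately not awaited here.
crawler.run().catch((err) => console.error('Crawler stopped unexpectedly', err));

const scraperWorker = new Worker(
    'scraper',
    async (job) => {
        const url: string = job.data.url;
        // Feed the already-running crawler instead of calling run() per job.
        await crawler.addRequests([{ url, label: 'PRODUCT' }]);
        // The BullMQ job completes when the request is enqueued,
        // not when the page has actually been scraped.
        return { result: 'queued' };
    },
    {
        connection,
        limiter: { max: 2, duration: 5000 }, // at most 2 jobs per 5 seconds
    },
);

scraperWorker.on('completed', (job, result) => {
    console.log(`Job ${job.id} completed with result ${result.result}`);
});

scraperWorker.on('failed', (job, err) => {
    console.log(`Job ${job?.id} failed with error ${err.message}`);
});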