Re-using the crawler instead of initializing it after each URL?
My scraper uses BullMQ, which retrieves jobs (URLs) from the job queue and runs them with CheerioCrawler.
Is there any way to initialize the crawler once and keep using it? I assume this would also consume fewer resources and improve performance.
If there are any best practices that I have not implemented, I would love to hear about them.
// worker.ts
import { Worker } from 'bullmq';
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';
import dotenv from 'dotenv';

dotenv.config();

console.log('REDIS_URL_JOB_QUEUE', process.env.REDIS_URL_JOB_QUEUE);

// Connect to the Redis instance backing the BullMQ job queue
const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
    maxRetriesPerRequest: null,
});

const proxy = process.env.PROXY_URL || '';
console.log('proxy', proxy);

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [proxy],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

const scraperWorker = new Worker(
    'scraper',
    async (job) => {
        const url: string = job.data.url;
        try {
            // await crawler.addRequests([url]);
            await crawler.run([
                {
                    label: 'PRODUCT',
                    url,
                },
            ]);
            // If everything went well, return a result
            return { result: 'success' };
        } catch (error) {
            // If something went wrong, rethrow so BullMQ marks the job as failed
            console.error(`Scrape of ${url} failed with error ${(error as Error).message}`);
            throw error;
        }
    },
    {
        connection,
        limiter: {
            max: 2,         // Max number of jobs to handle
            duration: 5000, // per duration in milliseconds (5,000 ms = 5 seconds)
        },
    },
);

scraperWorker.on('completed', (job, result) => {
    console.log(`Job ${job.id} completed with result ${result.result}`);
});

scraperWorker.on('failed', (job, err) => {
    if (!job) return console.log('Job not found');
    console.log(`Job ${job.id} failed with error ${err.message}`);
});

2 Replies
other-emeraldOP•2y ago
After each URL is scraped I see a log: Initializing the crawler.
deep-jade•2y ago
Use the keepAlive option. [1] It allows you to keep the crawler alive even if the RequestQueue gets empty.
[1] https://crawlee.dev/api/basic-crawler/interface/BasicCrawlerOptions#keepAlive
POC:
import {
    Dataset,
    CheerioCrawler,
    ProxyConfiguration,
    log,
    LogLevel,
} from 'crawlee';

log.setLevel(LogLevel.DEBUG);

// Assumes PROXY_URL is set in the environment, as in the worker above
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [process.env.PROXY_URL || ''],
});

const crawler = new CheerioCrawler({
    keepAlive: true, // keep the crawler running even when the RequestQueue is empty
    proxyConfiguration,
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);
        const title = $('title').text();
        const h1texts: { text: string }[] = [];
        $('h1').each((index, el) => {
            h1texts.push({
                text: $(el).text(),
            });
        });
        await Dataset.pushData({
            url: request.url,
            title,
            h1texts,
        });
    },
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed!`);
    },
});

/**
 * Returns a random integer between min (inclusive) and max (inclusive).
 * The value is no lower than min (or the next integer greater than min
 * if min isn't an integer) and no greater than max (or the next integer
 * lower than max if max isn't an integer).
 * Using Math.round() will give you a non-uniform distribution!
 */
function getRandomInt(min: number, max: number) {
    min = Math.ceil(min);
    max = Math.floor(max);
    return Math.floor(Math.random() * (max - min + 1)) + min;
}

async function scraperWorker() {
    const url: string = `https://example.com/${getRandomInt(1, 100)}`;
    await new Promise(resolve => setTimeout(resolve, 5000));
    log.info(`scraperWorker of ${url}`);
    await crawler.addRequests([url]);
}

// Start the crawler once; with keepAlive it stays up waiting for new requests
crawler.run();

(function loop() {
    Promise.resolve()
        .then(async () => await scraperWorker())
        .catch(e => log.error(e))
        .then(() => process.nextTick(loop));
})();
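Putting the two together, here is a minimal sketch (an assumption-laden illustration, not an official recipe) of the original BullMQ worker rewritten so that CheerioCrawler is initialized once with keepAlive and fed via addRequests. It assumes the same router, REDIS_URL_JOB_QUEUE, and PROXY_URL as the worker.ts above; the file name worker-keepalive.ts is made up. Note the trade-off: the BullMQ job resolves as soon as the request is enqueued, so per-URL success or failure would have to be reported from the crawler's own handlers instead.

// worker-keepalive.ts (hypothetical file name): a minimal sketch, not a drop-in replacement
import { Worker } from 'bullmq';
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';
import Redis from 'ioredis';
import { router } from './router';
import dotenv from 'dotenv';

dotenv.config();

const connection = new Redis(process.env.REDIS_URL_JOB_QUEUE || '', {
    maxRetriesPerRequest: null,
});

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [process.env.PROXY_URL || ''],
});

// Initialized once at startup; keepAlive stops the crawler from shutting
// down (and re-initializing) when its internal RequestQueue runs dry.
const crawler = new CheerioCrawler({
    keepAlive: true,
    proxyConfiguration,
    requestHandler: router,
});

// Start the crawler once and let it idle between jobs. run() only resolves
// when the crawler is torn down, so it is deliberately not awaited here.
crawler.run().catch((err) => console.error('Crawler stopped unexpectedly', err));

const scraperWorker = new Worker(
    'scraper',
    async (job) => {
        const url: string = job.data.url;
        // Feed the already-running crawler instead of calling run() per job.
        await crawler.addRequests([{ url, label: 'PRODUCT' }]);
        // The BullMQ job completes when the request is enqueued,
        // not when the page has actually been scraped.
        return { result: 'queued' };
    },
    {
        connection,
        limiter: { max: 2, duration: 5000 }, // at most 2 jobs per 5 seconds
    },
);

scraperWorker.on('completed', (job, result) => {
    console.log(`Job ${job.id} completed with result ${result.result}`);
});

scraperWorker.on('failed', (job, err) => {
    console.log(`Job ${job?.id} failed with error ${err.message}`);
});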