PlaywrightCrawler: results from new requests are bleeding into old requests. RequestQueue issue?

Hello, first some code. The crawl function:
async function crawl (jobId, websiteURL, cb) {

  const crawler = new crawlee.PlaywrightCrawler({
    // Use the requestHandler to process each of the crawled pages.
    async requestHandler({ request, page, enqueueLinks, log }) {
      // Collect the src of every <img> on the page and de-duplicate into cb.
      const imgSrcs = await page.$$eval('img', imgs => imgs.map(img => img.src));
      if (imgSrcs.length > 0) {
        for (const img of imgSrcs) {
          if (cb.indexOf(img) === -1) {
            cb.push(img);
          }
        }
      }

      // Extract links from the current page
      // and add them to the crawling queue.
      await enqueueLinks();
    },
    sessionPoolOptions: { persistStateKey: jobId, persistStateKeyValueStoreId: jobId },
  });

  await crawler.run([websiteURL]);
  await crawler.teardown();

  return cb;
}
setInterval calls this function:

async function fetchImagesUrls (uid, jobId, websiteURL) {
  console.log("Fetching images...");

  // Each call passes a fresh array to collect its results.
  const results = await crawl(jobId, websiteURL, cb = []);
  console.log(results);

  return results;
}

Background: I'm calling fetchImagesUrls from a setInterval callback to simulate a cron job. I purposely have setInterval pick up Job #1 (details are fetched from a DB), and once Job #1 starts I make Job #2 available for processing. Behavior: Job #1 and Job #2 are now running from two different calls, but their results are getting mixed into each other. I've tried useState() and my own callback (as shown here). Is there a way to keep each new call isolated to its own results set? I understand I might be missing something regarding JS fundamentals, but some guidance would be much appreciated. Thanks!
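Roughly, the scheduler looks like this (the DB lookup is simplified to a hypothetical getNextPendingJob() helper, so treat it as a sketch of the calling pattern rather than my exact code):

setInterval(async () => {
  // Hypothetical helper: returns the next pending job row from the DB, or null.
  const job = await getNextPendingJob();
  if (!job) return;

  // Each tick can kick off a new crawl while a previous one is still running,
  // so two crawlers may be active at the same time.
  const results = await fetchImagesUrls(job.uid, job.jobId, job.websiteURL);
  console.log(job.jobId, results.length, "images found");
}, 60000);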
fair-roseOP•3y ago
Other stuff I tried: injecting the jobId as a key into cb and pushing each job's results under that key, then returning the results for the corresponding key, like: { 'jobId': ['url1', 'url2', 'url3'] }
genetic-orange•3y ago
You need to create multiple request queues or request lists, one for each crawler. Then the results won't mix
fair-roseOP•3y ago
thanks! that seemed easy, and I think it worked. I can see that storage -> request_queues now contains the assigned jobId (a uuid), so I added this:
const rQueue = await crawlee.RequestQueue.open(jobId);
and passed it into my crawl function, then into the crawler init object as requestQueue: rQueue, and I think it worked! I will do more testing, but thanks again for your guidance!
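For reference, here is a minimal sketch of how the adjusted crawl function fits together, assuming crawlee is already imported and that each job passes its own jobId (this is my current understanding, not a verified final version):

async function crawl (jobId, websiteURL, cb) {
  // One named queue per job, so concurrent crawls never share pending requests.
  const rQueue = await crawlee.RequestQueue.open(jobId);

  const crawler = new crawlee.PlaywrightCrawler({
    requestQueue: rQueue,
    async requestHandler({ page, enqueueLinks }) {
      // Collect image URLs into this job's own results array.
      const imgSrcs = await page.$$eval('img', imgs => imgs.map(img => img.src));
      for (const img of imgSrcs) {
        if (!cb.includes(img)) cb.push(img);
      }
      await enqueueLinks();
    },
    sessionPoolOptions: { persistStateKey: jobId, persistStateKeyValueStoreId: jobId },
  });

  await crawler.run([websiteURL]);
  await crawler.teardown();

  return cb;
}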
genetic-orange•3y ago
You will just need to clean up the named queues afterwards: await rQueue.drop()
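For example, the tail of crawl() could look like this (assuming rQueue was opened at the top of the function, as in the sketch above):

  await crawler.run([websiteURL]);
  await crawler.teardown();
  // Remove the per-job queue so storage does not accumulate one named queue per jobId.
  await rQueue.drop();
  return cb;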
fair-roseOP•3y ago
ok thanks Lukas!
