sunny-green

To eliminate duplicates caused by request retries, do I need to set a timeout between them?

The issue is that when a job fails, it gets restarted up to the number of times specified in maxRequestRetries. However, if the restarted jobs succeed, I end up with multiple identical results in the output, whereas I only need one. For example: the first job fails and gets restarted (which is intended), but because the restarts succeed, say, two times, I receive two identical results when I actually need only one.
import { Dataset, PuppeteerCrawler, log } from 'crawlee';

export const puppeteerCrawler = async (cbRouterHandler, links) => {
    const crawler = new PuppeteerCrawler({
        minConcurrency: 4,
        maxConcurrency: 20,
        maxRequestRetries: 3,
        requestHandlerTimeoutSecs: 30,
        headless: false,
        requestHandler: cbRouterHandler,
        preNavigationHooks: [
            async (crawlingContext, gotoOptions) => {
                gotoOptions.timeout = 15_000;
                gotoOptions.waitUntil = 'networkidle2';
            },
        ],
        failedRequestHandler({ request, error }) {
            log.error(`Request ${request.url} failed too many times.`);
        },
    });

    await crawler.run(links);

    await Dataset.exportToJSON('TEST');
};
2 Replies
sunny-green (OP) · 2y ago
import { puppeteerCrawler } from '../../../crawlers/index.js';
import { Dataset, createHttpRouter, log } from 'crawlee';
import links from './input/pdp-urls_TEST.json' assert { type: 'json' };
import { labelLinksList, getHTML } from '../../utils/index.js';

const router = createHttpRouter();

const LABEL = 'mex.reviews';
const FORMATTED_INPUT_LINKS = labelLinksList(links, LABEL, { uniqueKey: true }, {});

const SELECTORS = {
    scheme: '[type="application/ld+json"]',
    productName: 'h1.is-title',
    waitElem: '[class="thumbs-container"]',
    archiveElement: '[class="content"] [class="heading"]',
    rating: '.reviews-average-grade .rating .container .title',
    reviewCount: '.reviews-average-grade .rating .container .count',
};

const getData = async ($) => {
    const productName = $(SELECTORS.productName).text().trim() || null;
    const rating =
        $(SELECTORS.rating)
            .text()
            .split('/')[0]
            .replace(/[^0-9.]+/g, '') || null;
    const reviewCount =
        $(SELECTORS.reviewCount)
            .text()
            .replace(/[^0-9.]+/g, '') || null;

    const fullData = {
        productName,
        rating,
        reviewCount,
    };
    return fullData;
};

const reviewsHandler = async ({ request, page }) => {
    log.debug(`Processing ${request.url}...`);
    await page.waitForSelector(SELECTORS.scheme);
    await page.waitForTimeout(3000);
    let result;
    const $ = await getHTML(page);

    if (!result) {
        const pageResult = await getData($);
        result = {
            url: request.url,
            ...pageResult,
            ...request.userData,
        };
    }
    await Dataset.pushData(result);
};

(async () => {
    await puppeteerCrawler(router, FORMATTED_INPUT_LINKS);
})();

router.addHandler(LABEL, reviewsHandler);
export const labelLinksList = (initialList, label, options = null, userData = null) =>
    initialList.map((url, index) => ({
        url,
        label,
        ...(options?.uniqueKey && { uniqueKey: `${index + 1}` }),
        userData: {
            ...(userData ? userData : null),
            index: index + 1,
        },
    }));
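For illustration only (this snippet is not part of the original post), and assuming links is a plain array of URL strings: because uniqueKey is derived from the list index rather than the URL, two identical URLs at different positions end up as two distinct requests:

// Hypothetical example of what the helper produces for a duplicated URL.
const sample = labelLinksList(
    ['https://example.com/product/1', 'https://example.com/product/1'],
    'mex.reviews',
    { uniqueKey: true },
    {},
);
// sample[0] => { url: '.../product/1', label: 'mex.reviews', uniqueKey: '1', userData: { index: 1 } }
// sample[1] => { url: '.../product/1', label: 'mex.reviews', uniqueKey: '2', userData: { index: 2 } }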
For example: input length = 200 links, output = 215 objects (it should be 200).
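A quick sanity check (a hypothetical snippet, not from the original post, assuming pdp-urls_TEST.json is a flat array of URL strings) can show whether those extra 15 objects correspond to repeated URLs in the input:

import links from './input/pdp-urls_TEST.json' assert { type: 'json' };

// Count repeated URLs in the input list; each repeat becomes its own request
// here because the helper assigns an index-based uniqueKey.
const uniqueUrls = new Set(links);
console.log(`total: ${links.length}, unique: ${uniqueUrls.size}, duplicates: ${links.length - uniqueUrls.size}`);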
rival-black · 2y ago
Hey there! The request queue deduplicates URLs, but I see you're explicitly setting the uniqueKey for the requests; why? From what I can see, the problem is that there are probably duplicate URLs being fed to the crawler as different requests, so when they succeed they expectedly produce duplicates in the dataset.
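A minimal sketch of one way to act on this, assuming the goal is a single result per unique URL: drop the index-based uniqueKey (and optionally dedupe the input list), so the request queue can deduplicate by URL. The name labelLinksListDeduped is made up for illustration:

// Hypothetical variant of the helper: no explicit uniqueKey, so Crawlee's
// request queue falls back to deduplicating requests by URL.
export const labelLinksListDeduped = (initialList, label, userData = null) =>
    [...new Set(initialList)].map((url, index) => ({
        url,
        label,
        userData: {
            ...(userData ?? {}),
            index: index + 1,
        },
    }));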
