Blocking certain requests

I'm trying to block some requests in Puppeteer, but it doesn't seem to work when I run the script in headed (non-headless) mode:
// Values compared against `request.resourceType()` in the interception handler below.
// NOTE(review): most of these entries look like file formats/extensions. Puppeteer's
// `resourceType()` presumably reports coarse categories (e.g. 'image', 'media', 'font'),
// in which case only 'font' could ever match — confirm against the Puppeteer docs.
const blockedResourceTypes = ['webp', 'svg', 'mp4', 'jpeg', 'gif', 'avif', 'font']
// Crawler configured to launch a headed browser with DevTools open and to
// intercept every request before navigation.
const crawler = new PuppeteerCrawler({
launchContext: {
launchOptions: {
// Headed run: a visible browser window with DevTools enabled.
headless: false,
devtools: true,
defaultViewport:{ width: 1920, height: 6000 },
args: [
'--disable-dev-shm-usage',
]
},
useIncognitoPages: true,
},
proxyConfiguration,
requestHandler: router,
maxConcurrency: 16,
maxRequestRetries: 15,
maxRequestsPerMinute: 2,
navigationTimeoutSecs: 120,
useSessionPool: true,
// Logs the failed URL at debug level. The "15" in the message is hard-coded;
// it matches `maxRequestRetries` above and must be kept in sync by hand.
failedRequestHandler({ request }) {
log.debug(`Request ${request.url} failed 15 times.`);
},

preNavigationHooks: [
// Registers an interception handler: requests whose resource type is listed in
// `blockedResourceTypes` get a stub 200 response; every other request continues.
async ({ addInterceptRequestHandler }) => {
await addInterceptRequestHandler((request) => {
if (blockedResourceTypes.includes(request.resourceType())) {
return request.respond({
status: 200,
body: 'useless shit',
});
}
return request.continue();
});
},
],
});
// Values compared against `request.resourceType()` in the interception handler below.
// NOTE(review): most of these entries look like file formats/extensions. Puppeteer's
// `resourceType()` presumably reports coarse categories (e.g. 'image', 'media', 'font'),
// in which case only 'font' could ever match — confirm against the Puppeteer docs.
const blockedResourceTypes = ['webp', 'svg', 'mp4', 'jpeg', 'gif', 'avif', 'font']
// Crawler configured to launch a headed browser with DevTools open and to
// intercept every request before navigation.
const crawler = new PuppeteerCrawler({
launchContext: {
launchOptions: {
// Headed run: a visible browser window with DevTools enabled.
headless: false,
devtools: true,
defaultViewport:{ width: 1920, height: 6000 },
args: [
'--disable-dev-shm-usage',
]
},
useIncognitoPages: true,
},
proxyConfiguration,
requestHandler: router,
maxConcurrency: 16,
maxRequestRetries: 15,
maxRequestsPerMinute: 2,
navigationTimeoutSecs: 120,
useSessionPool: true,
// Logs the failed URL at debug level. The "15" in the message is hard-coded;
// it matches `maxRequestRetries` above and must be kept in sync by hand.
failedRequestHandler({ request }) {
log.debug(`Request ${request.url} failed 15 times.`);
},

preNavigationHooks: [
// Registers an interception handler: requests whose resource type is listed in
// `blockedResourceTypes` get a stub 200 response; every other request continues.
async ({ addInterceptRequestHandler }) => {
await addInterceptRequestHandler((request) => {
if (blockedResourceTypes.includes(request.resourceType())) {
return request.respond({
status: 200,
body: 'useless shit',
});
}
return request.continue();
});
},
],
});
Any ideas?
7 Replies
MEE6
MEE6•3y ago
@NeoNomade just advanced to level 8! Thanks for your contributions! 🎉
Pepa J
Pepa J•3y ago
@NeoNomade I mostly go for something like this (it should be almost identical for JS and Puppeteer):
/**
 * Pre-navigation hook that keeps heavy assets from loading.
 *
 * Installs a catch-all route on the page: requests whose resource type is
 * image, media, font or stylesheet are aborted; all other requests continue.
 */
const abortAssets: PlaywrightHook = async ({ page }) => {
  const skippedTypes = ['image', 'media', 'font', 'stylesheet'];

  await page.route('**/*', (route) => {
    const resourceType = route.request().resourceType();
    const shouldSkip = skippedTypes.includes(resourceType);
    return shouldSkip ? route.abort() : route.continue();
  });
};


// Crawler wiring: `abortAssets` runs as a pre-navigation hook.
// The remaining options are elided in this example.
const playwrightCrawler = new PlaywrightCrawler({
  // ...
  preNavigationHooks: [abortAssets],
  // ...
});
/**
 * Pre-navigation hook that keeps heavy assets from loading.
 *
 * Installs a catch-all route on the page: requests whose resource type is
 * image, media, font or stylesheet are aborted; all other requests continue.
 */
const abortAssets: PlaywrightHook = async ({ page }) => {
  const skippedTypes = ['image', 'media', 'font', 'stylesheet'];

  await page.route('**/*', (route) => {
    const resourceType = route.request().resourceType();
    const shouldSkip = skippedTypes.includes(resourceType);
    return shouldSkip ? route.abort() : route.continue();
  });
};


// Crawler wiring: `abortAssets` runs as a pre-navigation hook.
// The remaining options are elided in this example.
const playwrightCrawler = new PlaywrightCrawler({
  // ...
  preNavigationHooks: [abortAssets],
  // ...
});
NeoNomade
NeoNomadeOP•3y ago
Routes certainly don't work with Puppeteer.
Pepa J
Pepa J•3y ago
@NeoNomade Rewrote it for Puppeteer/JS:
const abortAssets = async ({ page }) => {
const RESOURCE_EXCLUSIONS = ['image', 'media', 'font', 'stylesheet'];
await page.setRequestInterception(true);

await page.on('request', (request) => {
if (RESOURCE_EXCLUSIONS.includes(request.resourceType())) {
return request.abort();
}
return request.continue();
});
};


// Crawler wiring: `abortAssets` runs before every navigation, and the browser
// is launched headed (`headless: false`).
const crawler = new PuppeteerCrawler({
  preNavigationHooks: [abortAssets],
  headless: false,
  proxyConfiguration,
  requestHandler: router,
});
const abortAssets = async ({ page }) => {
const RESOURCE_EXCLUSIONS = ['image', 'media', 'font', 'stylesheet'];
await page.setRequestInterception(true);

await page.on('request', (request) => {
if (RESOURCE_EXCLUSIONS.includes(request.resourceType())) {
return request.abort();
}
return request.continue();
});
};


// Crawler wiring: `abortAssets` runs before every navigation, and the browser
// is launched headed (`headless: false`).
const crawler = new PuppeteerCrawler({
  preNavigationHooks: [abortAssets],
  headless: false,
  proxyConfiguration,
  requestHandler: router,
});
Actually, I found a decent article about it on Google: https://www.scrapingbee.com/blog/block-requests-puppeteer/
Block ressources with Puppeteer
This article will show you how to intercept and block requests with Puppeteer using the request interception API and the puppeteer extra plugin.
NeoNomade
NeoNomadeOP•3y ago
Thanks, I will try it immediately. It works absolutely great, also in conjunction with the Puppeteer ad-blocker. Thank you very much! Can I ask another thing here?
Pepa J
Pepa J•3y ago
If it is a different topic please create another thread 🙂
NeoNomade
NeoNomadeOP•3y ago
new thread created

Did you find this page helpful?