running numerous scrapers from one start file with speed
alr have a web scraper for amazon outputting to a rawData.json file able to successfully to scrape product links and then go through each of those product links to get the data i need
but i want to scale up to many many scrapers and im having trouble running multiple scrapers at once
i essentially made a new router to handle the other site and want to know how i can make sure that only the url with the same label will run the router handler with the same label but it wont let me define both routes like
it didn't work and i had to combine both routers in a weird way to get it to work and there weren't any errors but I keep getting no scraped data from the second site (ebay) and it it sometimes shows objects that have the eBay site name instead of amazon but still have an amazon link with an amazon product in it
i want to be able to run both scrapes at the same time, get rid of the combinedRouter, and define them as different routes and also make the scrapes happen faster and also make it so that it is easy to add routes on and scale up the process and keep adding on new scrapers daily
here are my codes:
requestHandler: [router, router2]
requestHandler: [router, router2]
here are my codes:
1 Reply
national-goldOP•2y ago
main.js:
combinedRouter.js:
amazon.js:
ebay.js:
i get outputs as such in my data file (data is incomplete bc some tags are not accurate):
and weird ones too:
import { CheerioCrawler, ProxyConfiguration, AutoscaledPool, SessionPool } from 'crawlee';
import { combinedRouter } from './combinedRouter.js';
const searchKeywords = 'hydroflasks'; // Replace with desired search keywords
const startUrls = [
{ url: `https://www.amazon.com/s?k=${searchKeywords}`, label: 'AMAZON' },
{ url: `https://www.ebay.com/sch/i.html?_nkw=${searchKeywords}`, label: 'EBAY' },
];
const crawler = new CheerioCrawler({
useSessionPool: true,
sessionPoolOptions: { maxPoolSize: 100 },
// Set to true if you want the crawler to save cookies per session,
// and set the cookie header to request automatically (default is true).
persistCookiesPerSession: true,
// Start the crawler right away and ensure there will always be 20 concurrent requests ran at any time
minConcurrency: 20,
// Ensure the crawler doesn't exceed 15 concurrent requests ran at any time
maxConcurrency: 40,
// ...but also ensure the crawler never exceeds 250 requests per minute
maxRequestsPerMinute: 250,
// Define router to run crawl
requestHandler: combinedRouter
});
export { crawler }
await crawler.run(startUrls);
import { CheerioCrawler, ProxyConfiguration, AutoscaledPool, SessionPool } from 'crawlee';
import { combinedRouter } from './combinedRouter.js';
const searchKeywords = 'hydroflasks'; // Replace with desired search keywords
const startUrls = [
{ url: `https://www.amazon.com/s?k=${searchKeywords}`, label: 'AMAZON' },
{ url: `https://www.ebay.com/sch/i.html?_nkw=${searchKeywords}`, label: 'EBAY' },
];
const crawler = new CheerioCrawler({
useSessionPool: true,
sessionPoolOptions: { maxPoolSize: 100 },
// Set to true if you want the crawler to save cookies per session,
// and set the cookie header to request automatically (default is true).
persistCookiesPerSession: true,
// Start the crawler right away and ensure there will always be 20 concurrent requests ran at any time
minConcurrency: 20,
// Ensure the crawler doesn't exceed 15 concurrent requests ran at any time
maxConcurrency: 40,
// ...but also ensure the crawler never exceeds 250 requests per minute
maxRequestsPerMinute: 250,
// Define router to run crawl
requestHandler: combinedRouter
});
export { crawler }
await crawler.run(startUrls);
import { router as amazonRouter } from './amazon.js';
import { router2 as ebayRouter } from './ebay.js';
const combinedRouter = (request, crawler) => {
amazonRouter(request, crawler);
ebayRouter(request, crawler);
};
export { combinedRouter };
import { router as amazonRouter } from './amazon.js';
import { router2 as ebayRouter } from './ebay.js';
const combinedRouter = (request, crawler) => {
amazonRouter(request, crawler);
ebayRouter(request, crawler);
};
export { combinedRouter };
import { createCheerioRouter } from 'crawlee';
import fs, { link } from 'fs';
import { crawler } from './main.js';
export const router = createCheerioRouter();
router.addHandler('AMAZON', async ({ $, crawler }) => {
console.log('starting link scrape')
// Scrape product links from search results page
const productLinks = $('h2 a').map((_, el) => 'https://www.amazon.com' + $(el).attr('href')).get();
console.log(`Found ${productLinks.length} product links for Amazon`);
console.log(productLinks)
// Add each product link to request queue
for (const link of productLinks) {
const result = await crawler.addRequests([link]);
await result.waitForAllRequestsToBeAdded;
}
});
router.addDefaultHandler(async ({ $, request, crawler }) => {
const productInfo = {};
productInfo.link = request.url;
productInfo.storeName = 'Amazon';
productInfo.productTitle = $('span#productTitle').text().trim();
productInfo.productDescription = $('div#productDescription').text().trim();
productInfo.salePrice = $('span#priceblock_ourprice').text().trim();
productInfo.originalPrice = $('span.priceBlockStrikePriceString').text().trim();
productInfo.reviewScore = $('span#acrPopover').attr('title');
productInfo.shippingInfo = $('div#ourprice_shippingmessage').text().trim();
// Write product info to JSON file
if (Object.keys(productInfo).length > 0) {
const rawData = JSON.stringify(productInfo, null, 2);
fs.appendFile('rawData.json', rawData, (err) => {
if (err) throw err;
});
}
console.log(`Product info written to rawData.json for amazon`);
});
import { createCheerioRouter } from 'crawlee';
import fs, { link } from 'fs';
import { crawler } from './main.js';
export const router = createCheerioRouter();
router.addHandler('AMAZON', async ({ $, crawler }) => {
console.log('starting link scrape')
// Scrape product links from search results page
const productLinks = $('h2 a').map((_, el) => 'https://www.amazon.com' + $(el).attr('href')).get();
console.log(`Found ${productLinks.length} product links for Amazon`);
console.log(productLinks)
// Add each product link to request queue
for (const link of productLinks) {
const result = await crawler.addRequests([link]);
await result.waitForAllRequestsToBeAdded;
}
});
router.addDefaultHandler(async ({ $, request, crawler }) => {
const productInfo = {};
productInfo.link = request.url;
productInfo.storeName = 'Amazon';
productInfo.productTitle = $('span#productTitle').text().trim();
productInfo.productDescription = $('div#productDescription').text().trim();
productInfo.salePrice = $('span#priceblock_ourprice').text().trim();
productInfo.originalPrice = $('span.priceBlockStrikePriceString').text().trim();
productInfo.reviewScore = $('span#acrPopover').attr('title');
productInfo.shippingInfo = $('div#ourprice_shippingmessage').text().trim();
// Write product info to JSON file
if (Object.keys(productInfo).length > 0) {
const rawData = JSON.stringify(productInfo, null, 2);
fs.appendFile('rawData.json', rawData, (err) => {
if (err) throw err;
});
}
console.log(`Product info written to rawData.json for amazon`);
});
import { createCheerioRouter } from 'crawlee';
import fs, { link } from 'fs';
import { crawler } from './main.js';
export const router2 = createCheerioRouter();
router2.addHandler('EBAY', async ({ $, crawler }) => {
console.log('starting link scrape')
// Scrape product links from search results page
const productLinks = $('a.item__info-link').map((_, el) => $(el).attr('href')).get();
console.log(`Found ${productLinks.length} product links for eBay`);
// Add each product link to request queue
for (const link of productLinks) {
const result = await crawler.addRequests([link]);
await result.waitForAllRequestsToBeAdded;
}
});
router2.addDefaultHandler(async ({ $, request, crawler }) => {
const productInfo = {};
productInfo.link = request.url;
productInfo.storeName = 'eBay';
productInfo.productTitle = $('h3.s-item__title').text().trim();
productInfo.productDescription = $('div.a-section.a-spacing-small.span.a-size-base-plus').text().trim();
productInfo.salePrice = $('span.s-item__price').text().trim();
productInfo.originalPrice = $('span.s-item__price--original').text().trim();
productInfo.reviewScore = $('div.s-item__reviews').text().trim();
productInfo.shippingInfo = $('span.s-item__shipping').text().trim();
// Write product info to JSON file
if (Object.keys(productInfo).length > 0) {
const rawData = JSON.stringify(productInfo, null, 2);
fs.appendFile('rawData.json', rawData, (err) => {
if (err) throw err;
});
}
});
import { createCheerioRouter } from 'crawlee';
import fs, { link } from 'fs';
import { crawler } from './main.js';
export const router2 = createCheerioRouter();
router2.addHandler('EBAY', async ({ $, crawler }) => {
console.log('starting link scrape')
// Scrape product links from search results page
const productLinks = $('a.item__info-link').map((_, el) => $(el).attr('href')).get();
console.log(`Found ${productLinks.length} product links for eBay`);
// Add each product link to request queue
for (const link of productLinks) {
const result = await crawler.addRequests([link]);
await result.waitForAllRequestsToBeAdded;
}
});
router2.addDefaultHandler(async ({ $, request, crawler }) => {
const productInfo = {};
productInfo.link = request.url;
productInfo.storeName = 'eBay';
productInfo.productTitle = $('h3.s-item__title').text().trim();
productInfo.productDescription = $('div.a-section.a-spacing-small.span.a-size-base-plus').text().trim();
productInfo.salePrice = $('span.s-item__price').text().trim();
productInfo.originalPrice = $('span.s-item__price--original').text().trim();
productInfo.reviewScore = $('div.s-item__reviews').text().trim();
productInfo.shippingInfo = $('span.s-item__shipping').text().trim();
// Write product info to JSON file
if (Object.keys(productInfo).length > 0) {
const rawData = JSON.stringify(productInfo, null, 2);
fs.appendFile('rawData.json', rawData, (err) => {
if (err) throw err;
});
}
});
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8",
"storeName": "Amazon",
"productTitle": "Hydro Flask Standard Mouth Bottle with Flex Cap",
"productDescription": "The Standard Mouth bottle is ideal for sipping, while still accommodating ice cubes. Featuring the insulated Flex Cap, designed for ultimate portability and comfort.",
"salePrice": "",
"originalPrice": "",
"reviewScore": "4.8 out of 5 stars",
"shippingInfo": ""
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8",
"storeName": "Amazon",
"productTitle": "Hydro Flask Standard Mouth Bottle with Flex Cap",
"productDescription": "The Standard Mouth bottle is ideal for sipping, while still accommodating ice cubes. Featuring the insulated Flex Cap, designed for ultimate portability and comfort.",
"salePrice": "",
"originalPrice": "",
"reviewScore": "4.8 out of 5 stars",
"shippingInfo": ""
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8",
"storeName": "eBay",
"productTitle": "",
"productDescription": "",
"salePrice": "",
"originalPrice": "",
"reviewScore": "",
"shippingInfo": ""
"link": "https://www.amazon.com/Hydro-Flask-Standard-Flex-RAIN/dp/B08WWLPYKC/ref=sr_1_8?keywords=hydroflasks&qid=1684641714&sr=8-8",
"storeName": "eBay",
"productTitle": "",
"productDescription": "",
"salePrice": "",
"originalPrice": "",
"reviewScore": "",
"shippingInfo": ""