Crawlee does not scrape a second time

I am scraping the same Amazon products at a fixed interval, but when I run the program, Crawlee scrapes the first time and after that it does not make any requests.
rising-crimson (OP) · 3y ago
[image attachment]
rising-crimson (OP) · 3y ago
// Assumes: import { CheerioCrawler, ProxyConfiguration } from "crawlee";
this.crawler = new CheerioCrawler({
    proxyConfiguration: new ProxyConfiguration({ proxyUrls: proxies }),
    requestQueue,
    requestHandler: async ({ request, $ }) => {
        const product = request.userData.product;

        // Scrape the basic product details from the page.
        const title = $("#productTitle").text().trim();
        const price = $(".a-price span").first().text();
        const image = $("#imgTagWrapperId img").attr("src");

        // The add-to-cart button only carries a value/text when the product is in stock.
        const inventory = Boolean($("#add-to-cart-button").attr("value") || $("#add-to-cart-button").text());

        // "Ships from" and "Sold by" rows of the buy box.
        const ship = $("div.tabular-buybox-container div.tabular-buybox-text:nth(0) span").text();
        const sold = $("div.tabular-buybox-container div.tabular-buybox-text:nth(1) span").text();

        // Treat the product as available only when Amazon.com both ships and sells it.
        const available = ship === sold && inventory && sold === "Amazon.com";

        if (product.new) {
            await AmazonProduct.findOneAndUpdate({ id: product.id }, {
                title, price, image, available,
                new: false,
                updated_at: new Date()
            });
        } else if (product.available !== available) {
            // Notify only when the product flips to available.
            if (available) this.sendNotification({ title, price, image, id: product.id });

            await AmazonProduct.findOneAndUpdate({ id: product.id }, {
                price: price || product.price,
                available,
                new: false,
                updated_at: new Date()
            });
        }

        console.log({ title, price, image, available });
    }
});
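
For reference, a minimal sketch of driving such a crawler on a fixed interval. The `products` array, the 10-minute interval, and the scheduling approach are assumptions for illustration, not from the original post, and it assumes a Crawlee version where crawler.run() can be invoked repeatedly:

// Sketch: re-run the crawl on a fixed interval.
// A real scheduler should also guard against overlapping rounds.
const runRound = async () => {
    await this.crawler.run(products.map((product) => ({
        url: product.url,
        userData: { product },
    })));
};

await runRound();
setInterval(runRound, 10 * 60 * 1000); // re-scrape every 10 minutes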
continuing-cyan · 3y ago
I guess you are using the same requestQueue for all runs, so all the requests are considered already handled and the crawler does not do anything.
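
You can see this deduplication directly: RequestQueue.addRequest() reports whether a request was already present, and by default the uniqueKey is derived from the URL, so re-adding the same product URL is a no-op. A minimal sketch (the example URL is a placeholder):

import { RequestQueue } from "crawlee";

const queue = await RequestQueue.open();
await queue.addRequest({ url: "https://www.amazon.com/dp/EXAMPLE" });

// The second add is deduplicated by uniqueKey (which defaults to the URL),
// and once handled, the request will not be processed again.
const info = await queue.addRequest({ url: "https://www.amazon.com/dp/EXAMPLE" });
console.log(info.wasAlreadyPresent); // true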
rising-crimson (OP) · 3y ago
Is there a way to stop this behaviour, or should I create a new crawler for every round?
continuing-cyan · 3y ago
It is not the default behaviour; by default the storage is purged automatically. See https://crawlee.dev/api/core/function/purgeDefaultStorages
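
A minimal sketch of calling that helper between rounds (assuming it is invoked before the next round's requests are added):

import { purgeDefaultStorages } from "crawlee";

// Purge the default storages (including the default request queue)
// so the next round starts from a clean slate.
await purgeDefaultStorages();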
rising-crimson (OP) · 3y ago
I tried this function but still got the same behaviour. I also tried not using a requestQueue, and still the same behaviour.
rising-crimson (OP) · 3y ago
I was able to fix it by giving a unique URL every time.
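
A cleaner variant of that workaround is to keep the URL stable and vary the request's uniqueKey instead, so the queue treats each round's request as new. A sketch, where the round counter is hypothetical:

// `round` is a hypothetical per-run counter; the URL itself stays unchanged.
await requestQueue.addRequest({
    url: product.url,
    uniqueKey: `${product.url}#round-${round}`,
    userData: { product },
});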
[image attachment]
rising-crimson (OP) · 3y ago
But if it is caching the URLs in storage, then it will end up storing each and every URL, which is not good.
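
One way to avoid that unbounded growth is to open a short-lived named queue per round and drop it once the round finishes. A sketch using RequestQueue.open() and drop():

import { RequestQueue } from "crawlee";

// Use a throwaway queue per round and delete it afterwards,
// so handled requests do not pile up in storage across rounds.
const queue = await RequestQueue.open(`round-${Date.now()}`);
// ...add this round's requests and run the crawler with `queue`...
await queue.drop();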
