Crawlee doesn't process newly enqueued links via enqueueLinks

Hi folks, I'm trying to build a crawler that retrieves a body (Buffer), and later enqueues the next "page" to be crawled, if it exists (has_next === true). The problem is that ?page=1 gets processed but the enqueued page (via enqueueLinks) doesn't; Crawlee states that it has processed all links (1 of 1). I have confirmed that has_next is indeed true and that enqueueLinks gets called. Am I missing something obvious?
// Seed the queue with the first page; the router's 'quotes' handler
// processes every request carrying that label.
await crawler.run([
  { url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// ...

routerRef.addHandler('quotes', async (context) => {
  // `crawler` is pulled from the context so the handler can enqueue the
  // follow-up request directly via crawler.addRequests(). The original
  // version used enqueueLinks({ urls, strategy }), which runs the supplied
  // URLs through its internal filtering and — as observed in this thread —
  // silently dropped the next page, so it was never crawled.
  const { request, body, log, crawler } = context

  log.info(request.url)

  // The HTTP crawler delivers the raw payload as a Buffer; bail out early
  // on anything else rather than attempt to parse it.
  if (!(body instanceof Buffer)) {
    log.error(`Expected a Buffer instance.`)
    return
  }

  const json: QuotesResponse = JSON.parse(body.toString())

  // Persist the page only when it actually carries quotes.
  if ((json.quotes?.length ?? 0) > 0) {
    await datasetRef.pushData(json)
  }

  if (json.has_next) {
    // crawler.addRequests() enqueues the request as-is (no strategy/glob
    // filtering), so the pagination reliably continues.
    await crawler.addRequests([
      { url: `http://quotes.toscrape.com/api/quotes?page=${json.page + 1}`, label: 'quotes' },
    ])
  } else {
    log.warning('No next was found')
  }

})
// Seed the queue with the first page of the quotes API; the router's
// 'quotes' handler processes every request carrying that label.
await crawler.run([
{ url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// ...

routerRef.addHandler('quotes', async (context) => {

const { request, sendRequest, enqueueLinks, response, body, log } = context

log.info(request.url)
// The HTTP crawler delivers the raw response payload as a Buffer.
if (!(body instanceof Buffer)) {
log.error(`Expected a Buffer instance.`)
return
}

const json: QuotesResponse = JSON.parse(body.toString())

// Persist the page only when it actually carries quotes.
if ((json.quotes?.length ?? 0) > 0) {
await datasetRef.pushData(json)
}

if (json.has_next) {
// NOTE(review): enqueueLinks({ urls }) runs the supplied URLs through its
// internal filtering; per this thread the next page was never processed
// this way — the accepted fix was crawler.addRequests() instead.
const urls = [`http://quotes.toscrape.com/api/quotes?page=${json.page + 1}`]
await enqueueLinks({
urls,
label: 'quotes',
strategy: EnqueueStrategy.All,
})
} else {
log.warning('No next was found')
}

})
4 Replies
exotic-emerald
exotic-emeraldOP3y ago
Hi again, I just cleaned up the code example, to make it easier on the eyes.
// Seed the crawler with the first page of the quotes API.
await crawler.run([
{ url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// Simplified repro. NOTE(review): the has_next guard from the original
// snippet is omitted here, so this version always enqueues a next page.
routerRef.addHandler('quotes', async ({ enqueueLinks, body }) => {

const json: QuotesResponse = JSON.parse(body.toString())

// Do something useful with retrieved data.
// ...

// Enqueue next page
await enqueueLinks({
urls: [`http://quotes.toscrape.com/api/quotes?page=${json.page + 1}`],
label: 'quotes',
strategy: EnqueueStrategy.All,
})

})
// Seed the crawler with the first page of the quotes API.
await crawler.run([
{ url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// Simplified repro. NOTE(review): the has_next guard from the original
// snippet is omitted here, so this version always enqueues a next page.
routerRef.addHandler('quotes', async ({ enqueueLinks, body }) => {

const json: QuotesResponse = JSON.parse(body.toString())

// Do something useful with retrieved data.
// ...

// Enqueue next page
await enqueueLinks({
urls: [`http://quotes.toscrape.com/api/quotes?page=${json.page + 1}`],
label: 'quotes',
strategy: EnqueueStrategy.All,
})

})
solid-orange
solid-orange3y ago
you can use crawler.addRequests function like this:
// Kick the crawl off with the first API page; the 'quotes' label routes
// every request to the handler registered below.
await crawler.run([
  { url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// ...

routerRef.addHandler('quotes', async (context) => {
  const { request, sendRequest, enqueueLinks, response, body, log, crawler } = context

  log.info(request.url)

  // Anything other than a Buffer means we cannot parse the payload.
  if (!(body instanceof Buffer)) {
    log.error(`Expected a Buffer instance.`)
    return
  }

  const parsed: QuotesResponse = JSON.parse(body.toString())

  // Store the page only if it contains at least one quote.
  const quoteCount = parsed.quotes?.length ?? 0
  if (quoteCount > 0) {
    await datasetRef.pushData(parsed)
  }

  // Guard clause: stop the pagination once the API reports no further page.
  if (!parsed.has_next) {
    log.warning('No next was found')
    return
  }

  // Hand the follow-up page straight to the request queue.
  await crawler.addRequests([
    { url: `http://quotes.toscrape.com/api/quotes?page=${parsed.page + 1}`, label: 'quotes' },
  ])
})
// Seed the queue with the first page; the 'quotes' label routes the
// request to the handler registered below.
await crawler.run([
{ url: 'http://quotes.toscrape.com/api/quotes?page=1', label: 'quotes' },
])

// ...

routerRef.addHandler('quotes', async (context) => {

// `crawler` is destructured so the handler can enqueue follow-ups directly.
const { request, sendRequest, enqueueLinks, response, body, log, crawler } = context

log.info(request.url)
// The HTTP crawler delivers the raw response payload as a Buffer.
if (!(body instanceof Buffer)) {
log.error(`Expected a Buffer instance.`)
return
}

const json: QuotesResponse = JSON.parse(body.toString())

// Persist the page only when it actually carries quotes.
if ((json.quotes?.length ?? 0) > 0) {
await datasetRef.pushData(json)
}

if (json.has_next) {
// crawler.addRequests() enqueues the request objects as-is (no
// strategy/glob filtering), which is why it works where enqueueLinks
// did not in this thread.
const urls = [{url:`http://quotes.toscrape.com/api/quotes?page=${json.page + 1}`, label:'quotes'}]
await crawler.addRequests(urls);
} else {
log.warning('No next was found')
}

})
foreign-sapphire
foreign-sapphire3y ago
Agree with Honza on crawler.addRequests; enqueueLinks's internal logic is quite complex – I often run the crawler in debug mode to see what is going on, or check its return value to see if it behaves as expected
const res = await enqueueLinks(...)
// res contains { processedRequests: [...], unprocessedRequests: [...] }
const res = await enqueueLinks(...)
// res contains { processedRequests: [...], unprocessedRequests: [...] }
exotic-emerald
exotic-emeraldOP3y ago
Thank you both @HonzaS and @strajk for answering, I've got things working by using crawler.addRequests().

Did you find this page helpful?