Routers not working as expected

Hello everyone First of all, thanks for this project — it looks really good and promising! I'm considering using Crawlee as an alternative to Scrapy. I'm trying to use a router to run different processes based on the URL. But the request is never captured by the handler . I’d appreciate any insights — am I missing something here? Here’s my crawl.py:
import asyncio
from crawlee.crawlers import AdaptivePlaywrightCrawler
from crawlee import service_locator
from routes import router

async def main() -> None:
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False

crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=5,
)

await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])

if __name__ == '__main__':
asyncio.run(main())
import asyncio
from crawlee.crawlers import AdaptivePlaywrightCrawler
from crawlee import service_locator
from routes import router

async def main() -> None:
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False

crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=5,
)

await crawler.run(['https://investor.agenusbio.com/news/default.aspx'])

if __name__ == '__main__':
asyncio.run(main())
and here my routes:
from __future__ import annotations

from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router
from crawlee import RequestOptions, RequestTransformAction

router = Router[AdaptivePlaywrightCrawlingContext]()

def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
url = request_options.get('url', '')

if url.endswith('.pdf'):
print(f"Request options: {request_options} before")
request_options['label'] = 'pdf_handler'
print(f"Request options: {request_options} after")
return request_options

return request_options

@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
await context.enqueue_links(
transform_request_function=transform_request,
)

@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
context.log.info('Processing PDF: %s', context.request.url)
from __future__ import annotations

from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router
from crawlee import RequestOptions, RequestTransformAction

router = Router[AdaptivePlaywrightCrawlingContext]()

def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
url = request_options.get('url', '')

if url.endswith('.pdf'):
print(f"Request options: {request_options} before")
request_options['label'] = 'pdf_handler'
print(f"Request options: {request_options} after")
return request_options

return request_options

@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
await context.enqueue_links(
transform_request_function=transform_request,
)

@router.handler(label='pdf_handler')
async def pdf_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
context.log.info('Processing PDF: %s', context.request.url)
3 Replies
Hall
Hall2mo ago
Someone will reply to you shortly. In the meantime, this might help:
fair-rose
fair-roseOP2mo ago
Request options: {'url': 'https://s202.q4cdn.com/949764554/files/doc_news/Agenus-Reports-Third-Quarter-2023-Results-2023.pdf', 'user_data': {}, 'label': None} before
Request options: {'url': 'https://s202.q4cdn.com/949764554/files/doc_news/Agenus-Reports-Third-Quarter-2023-Results-2023.pdf', 'user_data': {}, 'label': 'pdf_handler'} after
Request options: {'url': 'https://s202.q4cdn.com/949764554/files/doc_news/Agenus-Reports-Third-Quarter-2023-Results-2023.pdf', 'user_data': {}, 'label': None} before
Request options: {'url': 'https://s202.q4cdn.com/949764554/files/doc_news/Agenus-Reports-Third-Quarter-2023-Results-2023.pdf', 'user_data': {}, 'label': 'pdf_handler'} after
Mantisus
Mantisus2mo ago
Hey @Matheus Rossi Thank you for your interest in the framework! Try using
await context.enqueue_links(transform_request_function=transform_request, strategy='all')
await context.enqueue_links(transform_request_function=transform_request, strategy='all')
default - same-hostname However, the links to the PDF in your case are on a different host

Did you find this page helpful?