How to handle TimeoutErrors

Hey, we've just started using Crawlee 0.1.2 for some basic web scraping tasks, and it isn't clear to me from the docs how to handle cases where the HTTP request times out. Below is a simple script that scrapes a webpage and prints the number of anchor tags on the page. This particular site blocks the request (it never responds), so the script hangs indefinitely. How can I handle this case so that the script doesn't hang? Thanks in advance!
from datetime import timedelta
from crawlee.beautifulsoup_crawler import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
import asyncio


async def task():
    crawler = BeautifulSoupCrawler(
        request_handler_timeout=timedelta(seconds=10),
        max_request_retries=0,
    )

    @crawler.router.default_handler
    async def _request_handler(context: BeautifulSoupCrawlingContext):
        url = context.request.url
        links = context.soup.find_all("a")
        print(f"Found {len(links)} links")

    await crawler.run(["https://www.unicreditgroup.eu/"])


if __name__ == "__main__":
    asyncio.run(task())
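
One stopgap that does not depend on any Crawlee option is to put an overall deadline on the whole run with asyncio.wait_for. The sketch below uses only the standard library, reuses the task() coroutine defined above, and picks an arbitrary 60-second limit; it would replace the entry point above.

async def main():
    try:
        # Bound the entire crawl so the process can never hang forever.
        await asyncio.wait_for(task(), timeout=60)
    except asyncio.TimeoutError:
        print("Crawl did not finish within 60 seconds; giving up.")


if __name__ == "__main__":
    asyncio.run(main())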
2 Replies
criminal-purple•10mo ago
from datetime import timedelta
from crawlee.beautifulsoup_crawler import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)
import asyncio


async def task():
    crawler = BeautifulSoupCrawler(
        request_handler_timeout=timedelta(seconds=10),  # Timeout for the request handler
        max_request_retries=1,  # Allow one retry in case of failure
    )

    @crawler.router.default_handler
    async def _request_handler(context: BeautifulSoupCrawlingContext):
        url = context.request.url
        links = context.soup.find_all("a")
        print(f"Found {len(links)} links at {url}")

    # Handle requests that still fail after all retries. The handler is
    # registered on the crawler (the router has no error decorator) and
    # receives the exception as a second argument.
    @crawler.failed_request_handler
    async def _failed_request_handler(context, error):
        print(f"Error occurred while processing {context.request.url}: {error}")

    await crawler.run(["https://www.unicreditgroup.eu/"])


if __name__ == "__main__":
    asyncio.run(task())

I think this is the solution. Please retry with this.
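Note that request_handler_timeout bounds each call to the request handler; if the goal is to also cap the HTTP request itself, the timeout can be pushed down to the HTTP client. The snippet below is only a sketch under two assumptions: that HttpxHttpClient is importable from crawlee.http_clients in this Crawlee version, and that it forwards extra keyword arguments to the underlying httpx.AsyncClient so httpx's own timeout applies.

from datetime import timedelta

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_clients import HttpxHttpClient  # assumed import path; may differ across versions

# Assumption: extra keyword arguments are forwarded to httpx.AsyncClient,
# so timeout=10 caps every individual HTTP request at 10 seconds.
http_client = HttpxHttpClient(timeout=10)

crawler = BeautifulSoupCrawler(
    http_client=http_client,
    request_handler_timeout=timedelta(seconds=10),
    max_request_retries=1,
)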
conscious-sapphireOP•9mo ago
Thanks for the help! Will revisit this when I get the chance 🤙
