enqueue_links only on a match in the URL path? Cancel a request in a pre_navigation_hook?

I have set up my handler so that it only enqueues links that match certain keywords. The problem is that I want the code to check only the URL path, not the full URL. To give an example: let's say I only want to enqueue links where the keyword "team" or "about" is part of the URL path. When crawling www.example.com and finding a URL like www.example.com/team, I want that URL to be queued. But when crawling www.my-team.com, it would match on every URL on that website, because "team" is part of the domain, and that is not the behaviour I want. I thought of using a pre_navigation_hook and checking again there with the following code, but I don't think it's possible to cancel a request that is already queued?
parsed_url = urlparse(context.request.url)
path_name = parsed_url.path

results = _get_regex_matches(path_name)

if not results:
    context.log.info(
        f'No match found for URL: {context.request.url} in path: '
        f'{path_name}'
    )
    # TODO: CANCEL REQUEST
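(For reference, _get_regex_matches is a small helper that searches the path for my keywords; a minimal sketch of it:)

import re

# Sketch of the helper called above: returns the keyword matches
# found in the URL path, or an empty list if nothing matches.
_KEYWORD_PATTERN = re.compile(r'team|about')

def _get_regex_matches(path_name: str) -> list[str]:
    return _KEYWORD_PATTERN.findall(path_name)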
In the docs I found something like await request_list.mark_request_as_handled(request), but I don't think I have access to a request_list or anything similar in the PlaywrightPreNavCrawlingContext. It would be great if someone could point me in the right direction!
optimistic-gold (4mo ago)
Hey @ROYOSTI, a PR is now in the works that will allow you to easily customize this behavior: https://github.com/apify/crawlee-python/pull/923. Prior to its release, there are several ways to solve it.

1. You can try setting up a selector that selects only the links you need:
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'The title of {context.request.url} ...')
        await context.enqueue_links(selector='a[href*="changelog"], a[href*="quick-start"]')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
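For the keywords from your question this could be selector='a[href*="team"], a[href*="about"]'. Note, though, that [href*=...] is a substring match on the whole href attribute, so an absolute link such as https://www.my-team.com/contact would still match; the two approaches below let you inspect only the path.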
2. You do not necessarily need to use enqueue_links:
import asyncio

from yarl import URL

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'The title of {context.request.url} ...')
        next_requests = []
        for link in context.parsed_content.select('a'):
            href = link.get('href')
            # Skip anchors without an href attribute.
            if not href:
                continue
            if 'changelog' in href or 'quick-start' in href:
                # Resolve relative links against the current page URL.
                url = URL(context.request.url).join(URL(href))
                next_requests.append(str(url))
        await context.add_requests(next_requests)

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
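To address the original question of matching only the URL path, the same manual approach can resolve the link first and then test only its path component, so a keyword in the domain (e.g. www.my-team.com) no longer matches. A sketch, with the keywords "team" and "about" taken from the question:

import asyncio

from yarl import URL

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

# Keywords from the question; adjust as needed.
KEYWORDS = ('team', 'about')


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        next_requests = []
        for link in context.parsed_content.select('a'):
            href = link.get('href')
            if not href:
                continue
            # Resolve the link, then check only its path component
            # (yarl's URL.path), not the full URL.
            url = URL(context.request.url).join(URL(href))
            if any(keyword in url.path for keyword in KEYWORDS):
                next_requests.append(str(url))
        await context.add_requests(next_requests)

    await crawler.run(['https://www.example.com/'])


if __name__ == '__main__':
    asyncio.run(main())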
3. I noticed you're using Playwright. You can use page.route so you don't have to make a real request:
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        page_content = await context.page.content()
        if '||skip||' in page_content:
            context.log.info(f'Skip {context.request.url} ...')
            return

        await context.enqueue_links()

    @crawler.pre_navigation_hook
    async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
        if context.request.url == 'https://crawlee.dev/':
            return
        if 'changelog' not in context.request.url and 'quick-start' not in context.request.url:
            await context.page.route(
                context.request.url,
                lambda route, _: route.fulfill(
                    status=200,
                    body=b'||skip||',
                ),
            )

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
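The trick here is that the hook fulfills the intercepted request with the sentinel body ||skip||, so the browser never makes a real network request, and the request handler sees the sentinel and returns without enqueueing anything. To filter on the URL path only, as in your question, the hook could be adapted along these lines (a sketch, using the keywords "team" and "about" from the question):

from urllib.parse import urlparse

# Drop-in replacement for navigation_hook in the example above:
# fulfill with the sentinel body unless a keyword appears in the path.
@crawler.pre_navigation_hook
async def navigation_hook(context: PlaywrightPreNavCrawlingContext) -> None:
    if context.request.url == 'https://crawlee.dev/':
        return  # always let the start URL through
    path = urlparse(context.request.url).path
    if not any(keyword in path for keyword in ('team', 'about')):
        await context.page.route(
            context.request.url,
            lambda route, _: route.fulfill(status=200, body=b'||skip||'),
        )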
