async def main() -> None:
async with Actor:
# Read the Actor input
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get("start_urls", [])
max_depth = actor_input.get("max_depth", 1)
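        # Create an Apify Proxy configuration and get a proxy URL for the browser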
proxy_configuration = await Actor.create_proxy_configuration()
proxy_url = await proxy_configuration.new_url()
if not start_urls:
Actor.log.info("No start URLs specified in Actor input, exiting...")
await Actor.exit()
# Enqueue the starting URLs in the default request queue
default_queue = await Actor.open_request_queue()
for start_url in start_urls:
url = start_url.get("url")
Actor.log.info(f"Enqueuing {url} ...")
await default_queue.add_request({"url": url, "userData": {"depth": 0}})
Actor.log.info("Launching Playwright...")
async with async_playwright() as playwright:
            # Launch Chromium and route its traffic through the proxy URL obtained above
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                proxy={"server": proxy_url},
            )
context = await browser.new_context()
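            # Keep pulling requests from the queue until it is empty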
while request := await default_queue.fetch_next_request():
url = request["url"]
depth = request["userData"]["depth"]
Actor.log.info(f"Scraping {url} ...")
Actor.log.info("Launching Playwright...")
                try:
                    # Open a new page in the browser context and navigate to the URL
                    page = await context.new_page()
                    await page.goto(url)

                    # Parse the rendered HTML and extract the seller information
                    content = await page.content()
                    soup = BeautifulSoup(content, "html.parser")
                    content_seller = AmzonSellerInfosParser(url=url, soup=soup).parse()

                    # Store the extracted record in the default dataset
                    await Actor.push_data(content_seller)
                except Exception:
                    Actor.log.exception(f"Cannot extract data from {url}.")
                finally:
                    await page.close()
                    # Mark the request as handled so it is not fetched again
                    await default_queue.mark_request_as_handled(request)