How can i use proxy with playwright apify

hi I'm trying to make a scraper and i don't now how to implement a proxy hosted by apify in my script i share you a code to see why i'm trying to do
3 Replies
Hall
Hall8mo ago
View post on community site
This post has been pushed to the community knowledgebase. Any replies in this thread will be synced to the community site.
Apify Community
jolly-crimson
jolly-crimsonOP8mo ago
async def main() -> None:
async with Actor:
# Read the Actor input

actor_input = await Actor.get_input() or {}
start_urls = actor_input.get("start_urls", [])
max_depth = actor_input.get("max_depth", 1)
proxy_configuration = await Actor.create_proxy_configuration()
proxy_url = await proxy_configuration.new_url()

if not start_urls:
Actor.log.info("No start URLs specified in Actor input, exiting...")
await Actor.exit()

# Enqueue the starting URLs in the default request queue
default_queue = await Actor.open_request_queue()

for start_url in start_urls:
url = start_url.get("url")
Actor.log.info(f"Enqueuing {url} ...")
await default_queue.add_request({"url": url, "userData": {"depth": 0}})

Actor.log.info("Launching Playwright...")
async with async_playwright() as playwright:
browser = await playwright.chromium.launch(headless=Actor.config.headless)
context = await browser.new_context()

while request := await default_queue.fetch_next_request():
url = request["url"]
depth = request["userData"]["depth"]

Actor.log.info(f"Scraping {url} ...")

Actor.log.info("Launching Playwright...")

try:
page = await context.new_page()
await page.goto(url)
content = await page.content()

soup = BeautifulSoup(content, "html.parser")
content_seller = AmzonSellerInfosParser(url=url, soup=soup).parse()
await Actor.push_data(content_seller)

except Exception:
Actor.log.exception(f"Cannot extract data from {url}.")
finally:
await page.close()
async def main() -> None:
async with Actor:
# Read the Actor input

actor_input = await Actor.get_input() or {}
start_urls = actor_input.get("start_urls", [])
max_depth = actor_input.get("max_depth", 1)
proxy_configuration = await Actor.create_proxy_configuration()
proxy_url = await proxy_configuration.new_url()

if not start_urls:
Actor.log.info("No start URLs specified in Actor input, exiting...")
await Actor.exit()

# Enqueue the starting URLs in the default request queue
default_queue = await Actor.open_request_queue()

for start_url in start_urls:
url = start_url.get("url")
Actor.log.info(f"Enqueuing {url} ...")
await default_queue.add_request({"url": url, "userData": {"depth": 0}})

Actor.log.info("Launching Playwright...")
async with async_playwright() as playwright:
browser = await playwright.chromium.launch(headless=Actor.config.headless)
context = await browser.new_context()

while request := await default_queue.fetch_next_request():
url = request["url"]
depth = request["userData"]["depth"]

Actor.log.info(f"Scraping {url} ...")

Actor.log.info("Launching Playwright...")

try:
page = await context.new_page()
await page.goto(url)
content = await page.content()

soup = BeautifulSoup(content, "html.parser")
content_seller = AmzonSellerInfosParser(url=url, soup=soup).parse()
await Actor.push_data(content_seller)

except Exception:
Actor.log.exception(f"Cannot extract data from {url}.")
finally:
await page.close()
dependent-tan
dependent-tan8mo ago
Hi. I don't see in your code that you pass the proxy_url to playwright. You can find out how to do this in the official playwright documentation - https://playwright.dev/python/docs/network#http-proxy

Did you find this page helpful?