async def main() -> None:
async with Actor:
# Read the Actor input
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get("start_urls", [])
max_depth = actor_input.get("max_depth", 1)
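        # Create an Apify Proxy configuration and get a proxy URL for the browser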
proxy_configuration = await Actor.create_proxy_configuration()
proxy_url = await proxy_configuration.new_url()
if not start_urls:
Actor.log.info("No start URLs specified in Actor input, exiting...")
await Actor.exit()
# Enqueue the starting URLs in the default request queue
default_queue = await Actor.open_request_queue()
for start_url in start_urls:
url = start_url.get("url")
Actor.log.info(f"Enqueuing {url} ...")
await default_queue.add_request({"url": url, "userData": {"depth": 0}})
Actor.log.info("Launching Playwright...")
async with async_playwright() as playwright:
            # Launch Chromium and route its traffic through the proxy URL obtained above
            browser = await playwright.chromium.launch(
                headless=Actor.config.headless,
                proxy={"server": proxy_url},
            )
context = await browser.new_context()
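            # Keep pulling requests from the queue until it is empty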
while request := await default_queue.fetch_next_request():
url = request["url"]
depth = request["userData"]["depth"]
Actor.log.info(f"Scraping {url} ...")
Actor.log.info("Launching Playwright...")
                try:
                    # Open a new page in the browser context and navigate to the URL
                    page = await context.new_page()
                    await page.goto(url)

                    # Parse the rendered HTML and extract the seller information
                    content = await page.content()
                    soup = BeautifulSoup(content, "html.parser")
                    content_seller = AmzonSellerInfosParser(url=url, soup=soup).parse()

                    # Store the extracted record in the default dataset
                    await Actor.push_data(content_seller)
                except Exception:
                    Actor.log.exception(f"Cannot extract data from {url}.")
                finally:
                    await page.close()
                    # Mark the request as handled so it is not fetched again
                    await default_queue.mark_request_as_handled(request)