Cookies and other inputs
Hello everyone,
I am new to Crawlee.
I used the Apify API version, and now I want to apply the same logic with the Python version.
My pain point is the input values from the API version: I couldn't find out where to set them in the Python version (a rough mapping sketch follows the JSON below).
The inputs are:
```json
{
"startUrls": [
{
"url": "https://celo.org"
}
],
"useSitemaps": false,
"crawlerType": "playwright:firefox",
"includeUrlGlobs": [],
"excludeUrlGlobs": [],
"ignoreCanonicalUrl": false,
"maxCrawlDepth": 20,
"maxCrawlPages": 9999999,
"initialConcurrency": 0,
"maxConcurrency": 200,
"initialCookies": [],
"proxyConfiguration": {
"useApifyProxy": true
},
"maxSessionRotations": 10,
"maxRequestRetries": 5,
"requestTimeoutSecs": 60,
"minFileDownloadSpeedKBps": 128,
"dynamicContentWaitSecs": 10,
"waitForSelector": "",
"maxScrollHeightPixels": 5000,
"removeElementsCssSelector": "nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]",
"removeCookieWarnings": true,
"expandIframes": true,
"clickElementsCssSelector": "[aria-expanded="false"]",
"htmlTransformer": "readableText",
"readableTextCharThreshold": 100,
"aggressivePrune": false,
"debugMode": false,
"debugLog": false,
"saveHtml": false,
"saveHtmlAsFile": false,
"saveMarkdown": true,
"saveFiles": false,
"saveScreenshots": false,
"maxResults": 9999999,
"clientSideMinChangePercentage": 15,
"renderingTypeDetectionPercentage": 10
}
```
Thanks in advance!
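For reference, a few of these inputs map fairly directly onto `PlaywrightCrawler` constructor options in Crawlee for Python. The sketch below is a minimal, unverified mapping: the parameter names (`browser_type`, `max_request_retries`, `max_session_rotations`, `request_handler_timeout`, `ConcurrencySettings`) are taken from the Crawlee for Python docs rather than from this thread, so check them against the version you have installed.

```python
from datetime import timedelta

from crawlee import ConcurrencySettings
from crawlee.playwright_crawler import PlaywrightCrawler

# Hypothetical mapping of a few actor inputs onto PlaywrightCrawler options.
crawler = PlaywrightCrawler(
    browser_type='firefox',                         # "crawlerType": "playwright:firefox"
    max_requests_per_crawl=9_999_999,               # "maxCrawlPages"
    max_request_retries=5,                          # "maxRequestRetries"
    max_session_rotations=10,                       # "maxSessionRotations"
    request_handler_timeout=timedelta(seconds=60),  # "requestTimeoutSecs"
    concurrency_settings=ConcurrencySettings(max_concurrency=200),  # "maxConcurrency"
)
```

Actor-specific inputs such as `htmlTransformer`, `saveMarkdown`, or `removeElementsCssSelector` have no constructor equivalent; they describe processing you would implement yourself inside the request handler. `initialCookies` is covered in a separate sketch after the code below.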
funny-blueOP•10mo ago
My code:
```python
import asyncio

from bs4 import BeautifulSoup
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=20,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the rendered HTML and strip it down to plain text.
        content_html = await context.page.content()
        soup = BeautifulSoup(content_html, 'html.parser')
        text_content = soup.get_text(separator=' ', strip=True)

        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'context': text_content,
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Enqueue only links matching the '.morelink' CSS selector
        # (drop the selector to enqueue all links found on the page).
        await context.enqueue_links(selector='.morelink')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://celo.org'])


if __name__ == '__main__':
    asyncio.run(main())
```
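Regarding the cookies from the thread title: the `initialCookies` input has no direct counterpart in the snippet above. One possible approach is to add the cookies to the Playwright browser context in a pre-navigation hook, registered inside `main()` after the crawler is created. The `@crawler.pre_navigation_hook` decorator name is taken from the Crawlee for Python docs and the cookie values are placeholders, so treat this as a sketch rather than a verified recipe:

```python
# Sketch: inject "initialCookies" before each navigation.
# The pre_navigation_hook decorator is assumed from the Crawlee for Python docs;
# verify it exists in your installed version. The cookie below is a placeholder.
@crawler.pre_navigation_hook
async def set_initial_cookies(context) -> None:
    await context.page.context.add_cookies([
        {
            'name': 'example_cookie',   # placeholder name
            'value': 'example_value',   # placeholder value
            'domain': 'celo.org',
            'path': '/',
        },
    ])
```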