Cookies and other inputs
Hello everyone,
I am new to Crawlee.
I used the Apify API version, and now I want to apply the same logic with the Python version.
My pain point is the input values from the API version: I couldn't find out where to set them in the Python version (a rough mapping sketch follows the JSON below).
The inputs are:
```json
{
"startUrls": [
{
"url": "https://celo.org"
}
],
"useSitemaps": false,
"crawlerType": "playwright:firefox",
"includeUrlGlobs": [],
"excludeUrlGlobs": [],
"ignoreCanonicalUrl": false,
"maxCrawlDepth": 20,
"maxCrawlPages": 9999999,
"initialConcurrency": 0,
"maxConcurrency": 200,
"initialCookies": [],
"proxyConfiguration": {
"useApifyProxy": true
},
"maxSessionRotations": 10,
"maxRequestRetries": 5,
"requestTimeoutSecs": 60,
"minFileDownloadSpeedKBps": 128,
"dynamicContentWaitSecs": 10,
"waitForSelector": "",
"maxScrollHeightPixels": 5000,
"removeElementsCssSelector": "nav, footer, script, style, noscript, svg,\n[role="alert"],\n[role="banner"],\n[role="dialog"],\n[role="alertdialog"],\n[role="region"][aria-label*="skip" i],\n[aria-modal="true"]",
"removeCookieWarnings": true,
"expandIframes": true,
"clickElementsCssSelector": "[aria-expanded="false"]",
"htmlTransformer": "readableText",
"readableTextCharThreshold": 100,
"aggressivePrune": false,
"debugMode": false,
"debugLog": false,
"saveHtml": false,
"saveHtmlAsFile": false,
"saveMarkdown": true,
"saveFiles": false,
"saveScreenshots": false,
"maxResults": 9999999,
"clientSideMinChangePercentage": 15,
"renderingTypeDetectionPercentage": 10
}
```
Thanks in advance!
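For reference, a few of these inputs map fairly directly onto `PlaywrightCrawler` constructor options in Crawlee for Python. The sketch below is a minimal, unverified mapping: the parameter names (`browser_type`, `max_request_retries`, `max_session_rotations`, `request_handler_timeout`, `ConcurrencySettings`) are taken from the Crawlee for Python docs rather than from this thread, so check them against the version you have installed.

```python
from datetime import timedelta

from crawlee import ConcurrencySettings
from crawlee.playwright_crawler import PlaywrightCrawler

# Hypothetical mapping of a few actor inputs onto PlaywrightCrawler options.
crawler = PlaywrightCrawler(
    browser_type='firefox',                         # "crawlerType": "playwright:firefox"
    max_requests_per_crawl=9_999_999,               # "maxCrawlPages"
    max_request_retries=5,                          # "maxRequestRetries"
    max_session_rotations=10,                       # "maxSessionRotations"
    request_handler_timeout=timedelta(seconds=60),  # "requestTimeoutSecs"
    concurrency_settings=ConcurrencySettings(max_concurrency=200),  # "maxConcurrency"
)
```

Actor-specific inputs such as `htmlTransformer`, `saveMarkdown`, or `removeElementsCssSelector` have no constructor equivalent; they describe processing you would implement yourself inside the request handler. `initialCookies` is covered in a separate sketch after the code below.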
funny-blueOP•10mo ago
My code:
```python
import asyncio

from bs4 import BeautifulSoup
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=20,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the rendered HTML and strip it down to plain text.
        content_html = await context.page.content()
        soup = BeautifulSoup(content_html, 'html.parser')
        text_content = soup.get_text(separator=' ', strip=True)

        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'context': text_content,
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Enqueue only links matching the '.morelink' CSS selector
        # (drop the selector to enqueue all links found on the page).
        await context.enqueue_links(selector='.morelink')

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://celo.org'])


if __name__ == '__main__':
    asyncio.run(main())
```
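Regarding the cookies from the thread title: the `initialCookies` input has no direct counterpart in the snippet above. One possible approach is to add the cookies to the Playwright browser context in a pre-navigation hook, registered inside `main()` after the crawler is created. The `@crawler.pre_navigation_hook` decorator name is taken from the Crawlee for Python docs and the cookie values are placeholders, so treat this as a sketch rather than a verified recipe:

```python
# Sketch: inject "initialCookies" before each navigation.
# The pre_navigation_hook decorator is assumed from the Crawlee for Python docs;
# verify it exists in your installed version. The cookie below is a placeholder.
@crawler.pre_navigation_hook
async def set_initial_cookies(context) -> None:
    await context.page.context.add_cookies([
        {
            'name': 'example_cookie',   # placeholder name
            'value': 'example_value',   # placeholder value
            'domain': 'celo.org',
            'path': '/',
        },
    ])
```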