Scrape/crawl transactional rather than batch
Hi, I'm looking to introduce website crawling into an existing workflow that doesn't suit batch processing, i.e. I want to scrape each website, get the result, and do some further processing downstream. I do have this working with the attached code, however I imagine there's a better way to achieve it, given I'll be processing up to 500 websites concurrently and my concern is memory allocation.
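Since the attached code isn't shown here, a minimal sketch of the general pattern, assuming asyncio and aiohttp (both assumptions): a semaphore caps how many fetches are in flight, and each page is handed to a (hypothetical) downstream step as soon as it finishes, so you never hold all 500 responses in memory at once.

```python
import asyncio
import aiohttp

MAX_CONCURRENCY = 20  # tune to taste; 500 sockets in flight is rarely necessary

async def fetch(session: aiohttp.ClientSession, sem: asyncio.Semaphore, url: str) -> tuple[str, str]:
    # The semaphore bounds how many requests are in flight, which also
    # bounds how many response bodies are held in memory at any moment.
    async with sem:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
            resp.raise_for_status()
            return url, await resp.text()

async def process(url: str, html: str) -> None:
    # Placeholder for the downstream processing step.
    print(f"{url}: {len(html)} bytes")

async def main(urls: list[str]) -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, sem, u) for u in urls]
        # as_completed yields each result as it arrives, so pages are
        # processed (and freed) one by one rather than collected in a batch.
        for fut in asyncio.as_completed(tasks):
            try:
                url, html = await fut
            except Exception as exc:
                print(f"fetch failed: {exc}")
                continue
            await process(url, html)

if __name__ == "__main__":
    asyncio.run(main(["https://example.com", "https://example.org"]))
```

With this shape, memory scales with MAX_CONCURRENCY rather than with the total number of websites, so raising the site count to 500 mostly just lengthens the run rather than growing the footprint.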