Request queue with id: [id] does not exist

I created an API with Express that runs a Crawlee crawler when an endpoint is called. Weirdly, it works completely fine on the first request I make to the API, but fails on the next ones with the error: Request queue with id: [id] does not exist. I think I'm making some JavaScript mistake, to be honest; I don't have much experience with it. Here is how I'm setting up the API:
import { crawler } from './main.js'; // Import the exported crawler from the main file
import express from "express";

const app = express();
app.use(express.json());

const BASE_URL = "https.....";

app.post("/scrape", async (req, res) => {
  if (!req.body || !req.body.usernames) {
    return res.status(400).json({ error: "Invalid input" });
  }

  const { usernames } = req.body;
  const urls = usernames.map(username => `${BASE_URL}${username}`);

  try {
    await crawler.run(urls);
    const dataset = await crawler.getData();

    return res.status(200).json({ data: dataset });
  } catch (error) {
    console.error("Scraping error:", error);
    return res.status(500).json({ error: "Scraping failed" });
  }
});

const PORT = parseInt(process.env.PORT) || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
Here is how my crawler looks:
import { CheerioCrawler, ProxyConfiguration, Configuration, Dataset, log } from 'crawlee';

const proxies = [...]; // my proxy list

const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: proxies,
});

export const crawler = new CheerioCrawler({
  proxyConfiguration,

  requestHandler: async ({ request, json, proxyInfo }) => {
    log.info(JSON.stringify(proxyInfo, null, 2));

    // Scraping logic

    await Dataset.pushData({
      // pushing data
    });
  },
}, new Configuration({
  persistStorage: false,
}));
wise-white · 2mo ago
Maybe the error happens because Crawlee's components (like request queues) are designed for single use within one crawl. When you try to reuse the same crawler instance for a second request, it is trying to access resources that were already cleaned up. So you probably need to create a new crawler instance for each request, e.g.:
import { CheerioCrawler, ProxyConfiguration, Configuration, Dataset } from 'crawlee';

// Define a function that creates a new crawler instance for each request
const createCrawler = () => {
  const proxies = [...]; // your proxy list

  const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: proxies,
  });

  return new CheerioCrawler({
    proxyConfiguration,
    // Disable the session pool so no session state is shared between runs
    useSessionPool: false,

    requestHandler: async ({ request, json, proxyInfo }) => {
      console.log(JSON.stringify(proxyInfo, null, 2));

      // Scraping logic
      await Dataset.pushData({
        // pushing data
      });
    },
  }, new Configuration({
    persistStorage: false,
  }));
};

// Create a new crawler instance for each request
const crawler = createCrawler();

// Run the crawler with the provided URLs
await crawler.run(urls);
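For completeness, here is a minimal sketch of how your /scrape handler could use that factory instead of the shared crawler instance. It assumes the same Express setup, BASE_URL, and input validation from your question, plus the createCrawler helper above:

// Sketch: build a fresh crawler (and thus a fresh request queue) per incoming request
app.post("/scrape", async (req, res) => {
  if (!req.body || !req.body.usernames) {
    return res.status(400).json({ error: "Invalid input" });
  }

  const { usernames } = req.body;
  const urls = usernames.map(username => `${BASE_URL}${username}`);

  try {
    const crawler = createCrawler(); // new instance for this request only
    await crawler.run(urls);
    const dataset = await crawler.getData();

    return res.status(200).json({ data: dataset });
  } catch (error) {
    console.error("Scraping error:", error);
    return res.status(500).json({ error: "Scraping failed" });
  }
});

Since nothing is persisted and each request gets its own crawler, the second call no longer tries to reuse a request queue that was already dropped after the first run.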
