Robots.txt

Hey, do you have any idea how to respect robots.txt? Do we have to code that ourselves?
2 Replies
afraid-scarlet (OP) • 10mo ago
"""
Class to respect robot.txt file
"""

import urllib.parse

import aiohttp
from protego import Protego


class RobotTXT:
"""Class to respect robot.txt file"""

def __init__(self):
self._robots = {}
self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

async def __call__(self, url: str) -> bool:
"""Check if the url is allowed to be crawled

Args:
url (str): url to be checked

Returns:
bool: True if the url is allowed to be crawled, False otherwise
"""

url_parse = urllib.parse.urlparse(url)
robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

if robots_url not in self._robots:
async with aiohttp.ClientSession() as session:
async with session.get(robots_url) as response:
robots_content = await response.text()
self._robots[robots_url] = Protego.parse(robots_content)

authorize = []
for agent in self._user_agent:
authorize.append(self._robots[robots_url].can_fetch(url, agent))

if len(self._robots) > 1000:
self._robots.popitem(last=False)

return all(authorize)
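One thing the class assumes is that every host serves a readable robots.txt. A possible refinement is to fall back to an empty (allow-all) ruleset when the file is missing or the request fails; the helper below is only a sketch of that idea, and the name fetch_robots and the fallback policy are my additions, not part of the original script:

import aiohttp
from protego import Protego


async def fetch_robots(robots_url: str) -> Protego:
    """Fetch and parse robots.txt, treating a missing file as allow-all.

    Sketch only: the fallback policy is an assumption, not from the original post.
    """
    robots_content = ""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(robots_url) as response:
                if response.status == 200:
                    robots_content = await response.text()
    except aiohttp.ClientError:
        pass  # request failed: keep the empty (allow-all) ruleset
    return Protego.parse(robots_content)

The __call__ method could use a helper like this in place of its inline fetch; how strict to be on fetch errors is a policy choice for the crawler.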
"""
Class to respect robot.txt file
"""

import urllib.parse

import aiohttp
from protego import Protego


class RobotTXT:
"""Class to respect robot.txt file"""

def __init__(self):
self._robots = {}
self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

async def __call__(self, url: str) -> bool:
"""Check if the url is allowed to be crawled

Args:
url (str): url to be checked

Returns:
bool: True if the url is allowed to be crawled, False otherwise
"""

url_parse = urllib.parse.urlparse(url)
robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

if robots_url not in self._robots:
async with aiohttp.ClientSession() as session:
async with session.get(robots_url) as response:
robots_content = await response.text()
self._robots[robots_url] = Protego.parse(robots_content)

authorize = []
for agent in self._user_agent:
authorize.append(self._robots[robots_url].can_fetch(url, agent))

if len(self._robots) > 1000:
self._robots.popitem(last=False)

return all(authorize)
I made this script if you want it; here is how to use it:
robots_parser = RobotTXT()
authorized = await robots_parser(url)
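Since __call__ is a coroutine, it has to be awaited from inside an async function. A minimal way to run it from a regular script, assuming the RobotTXT class above (the URL is just a placeholder):

import asyncio


async def main():
    robots_parser = RobotTXT()
    # Placeholder URL; replace with the page you actually want to check.
    authorized = await robots_parser("https://example.com/some/page")
    print(f"Allowed to crawl: {authorized}")


asyncio.run(main())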
wise-white • 10mo ago
Hey @Jourdelune, thanks! I think this belongs better in the Crawlee for Python repository's issue section: https://github.com/apify/crawlee-python/ Feel free to open a feature request/issue and propose the contribution; our team will happily look into it 🙂