Robots.txt

Hey, do you have any idea how to respect robots.txt? Do we have to code that ourselves?
2 Replies
afraid-scarlet (OP) • 10mo ago
"""
Class to respect robot.txt file
"""

import urllib.parse

import aiohttp
from protego import Protego


class RobotTXT:
"""Class to respect robot.txt file"""

def __init__(self):
self._robots = {}
self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

async def __call__(self, url: str) -> bool:
"""Check if the url is allowed to be crawled

Args:
url (str): url to be checked

Returns:
bool: True if the url is allowed to be crawled, False otherwise
"""

url_parse = urllib.parse.urlparse(url)
robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

if robots_url not in self._robots:
async with aiohttp.ClientSession() as session:
async with session.get(robots_url) as response:
robots_content = await response.text()
self._robots[robots_url] = Protego.parse(robots_content)

authorize = []
for agent in self._user_agent:
authorize.append(self._robots[robots_url].can_fetch(url, agent))

if len(self._robots) > 1000:
self._robots.popitem(last=False)

return all(authorize)
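One thing the class assumes is that every host serves a readable robots.txt. A possible refinement is to fall back to an empty (allow-all) ruleset when the file is missing or the request fails; the helper below is only a sketch of that idea, and the name fetch_robots and the fallback policy are my additions, not part of the original script:

import aiohttp
from protego import Protego


async def fetch_robots(robots_url: str) -> Protego:
    """Fetch and parse robots.txt, treating a missing file as allow-all.

    Sketch only: the fallback policy is an assumption, not from the original post.
    """
    robots_content = ""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(robots_url) as response:
                if response.status == 200:
                    robots_content = await response.text()
    except aiohttp.ClientError:
        pass  # request failed: keep the empty (allow-all) ruleset
    return Protego.parse(robots_content)

The __call__ method could use a helper like this in place of its inline fetch; how strict to be on fetch errors is a policy choice for the crawler.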
"""
Class to respect robot.txt file
"""

import urllib.parse

import aiohttp
from protego import Protego


class RobotTXT:
"""Class to respect robot.txt file"""

def __init__(self):
self._robots = {}
self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

async def __call__(self, url: str) -> bool:
"""Check if the url is allowed to be crawled

Args:
url (str): url to be checked

Returns:
bool: True if the url is allowed to be crawled, False otherwise
"""

url_parse = urllib.parse.urlparse(url)
robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

if robots_url not in self._robots:
async with aiohttp.ClientSession() as session:
async with session.get(robots_url) as response:
robots_content = await response.text()
self._robots[robots_url] = Protego.parse(robots_content)

authorize = []
for agent in self._user_agent:
authorize.append(self._robots[robots_url].can_fetch(url, agent))

if len(self._robots) > 1000:
self._robots.popitem(last=False)

return all(authorize)
I made this script if you want it; here is how to use it:
robots_parser = RobotTXT()
authorized = await robots_parser(url)
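Since __call__ is a coroutine, it has to be awaited from inside an async function. A minimal way to run it from a regular script, assuming the RobotTXT class above (the URL is just a placeholder):

import asyncio


async def main():
    robots_parser = RobotTXT()
    # Placeholder URL; replace with the page you actually want to check.
    authorized = await robots_parser("https://example.com/some/page")
    print(f"Allowed to crawl: {authorized}")


asyncio.run(main())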
wise-white • 10mo ago
Hey @Jourdelune, thanks! I think this belongs better in the Crawlee for Python repository's issue section: https://github.com/apify/crawlee-python/ Feel free to open a feature request/issue and propose the contribution; our team will happily look into it 🙂