CA
fascinating-indigo
Function isn't running when posted in Apify
Hello everyone,
I have the following code working perfectly when I am not using Apify; however, once I use Apify it doesn't run the second function. I am using the Apify template for Scrapy.
Thanks for the help
from typing import Generator
from scrapy.responsetypes import Response
from apify import Actor
from urllib.parse import urljoin
import nest_asyncio
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from scrapy.http import Response, Request
class TitleSpider(scrapy.Spider):
    """Scrape product names from an APC listing page.

    ``parse`` handles the listing page and schedules one request per
    product link, each with ``second_page`` as its callback; ``second_page``
    extracts and prints the product name.
    """

    name = 'example'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/men/men-shirts.html",
    ]

    def parse(self, response: Response):
        """Collect product-page hrefs from the listing and follow them.

        Fix: each href is resolved against the response URL with
        ``response.urljoin`` so relative links are handled too —
        ``scrapy.Request`` rejects non-absolute URLs. For the absolute
        links the original code handled, ``urljoin`` is a no-op, so
        behavior is unchanged for them.
        """
        Actor.log.info(f'TitleSpider is parsing {response}...')
        for li in response.css('li.product-item'):
            product_link = li.css('.product-link a::attr(href)').get()
            # Skip tiles with no usable href, as the original did.
            if product_link:
                yield scrapy.Request(url=response.urljoin(product_link),
                                     callback=self.second_page)

    def second_page(self, response: Response):
        """Parse one product page and print its <h1> product name."""
        Actor.log.info(f'Second fonction is parsing {response}...')
        productname = response.css('h1.product-name::text').get()
        print(productname)
from typing import Generator
from scrapy.responsetypes import Response
from apify import Actor
from urllib.parse import urljoin
import nest_asyncio
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from scrapy.http import Response, Request
class TitleSpider(scrapy.Spider):
    """Crawl an APC listing page, follow every product link found on it,
    and print each product's name.
    """

    name = 'example'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/men/men-shirts.html",
    ]

    def parse(self, response: Response):
        # Listing page: gather the href of every product tile, then
        # schedule one request per link with second_page as the callback.
        Actor.log.info(f'TitleSpider is parsing {response}...')
        hrefs = [
            item.css('.product-link').css('a::attr(href)').get()
            for item in response.css('li.product-item')
        ]
        # filter(None, ...) drops missing hrefs, matching the original
        # truthiness check before appending.
        for url in filter(None, hrefs):
            yield scrapy.Request(url=url, callback=self.second_page)

    def second_page(self, response: Response):
        # Product page: pull the <h1> product name and print it.
        Actor.log.info(f'Second fonction is parsing {response}...')
        print(response.css('h1.product-name::text').get())
3 Replies
harsh-harlequin•2y ago
Hey, thanks for the report! The team knows about the issue and will fix it
fascinating-indigoOP•2y ago
Thanks for your answer. Any ETA for this resolution? Thanks
In the meantime, in case it helps, I found a solution that somehow works.
class TitleSpider(scrapy.Spider):
    # Workaround spider: instead of a separate callback method, every
    # response is routed through parse(), which dispatches on a
    # request.meta flag.
    # NOTE(review): this paste is truncated/garbled near the end — the
    # `productdescription =` assignment has no right-hand side and the
    # stray `}'` line is not valid Python; the original expression was
    # lost when the snippet was posted.
    name = 'title_spider'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/women/women-blouses-shirts.html",
    ]
    # Last listing-page URL seen; stored on the class for later comparison.
    test_url = None
    def parse(self, response: Response):
        # Product pages are marked via meta['isProductPage']; anything
        # else is treated as a listing page.
        if 'isProductPage' in response.meta:
            yield from self._parse_prod(response)
        else:
            Actor.log.info(f'Main page => {response}...')
            li_elements = response.css('li.product-item')
            for li in li_elements:
                productlink_container = li.css('.product-link')
                product_links = productlink_container.css('a::attr(href)').getall()
                for link in product_links:
                    # Remember which listing URL produced this link.
                    self.test_url = response.url
                    # No explicit callback: the request returns to parse(),
                    # and the meta flag routes it to _parse_prod().
                    yield scrapy.Request(link, meta={"isProductPage": True})
    def _parse_prod(self, response: Response):
        # Product-page handler (reached only via the meta flag above).
        Actor.log.info(f'Product Page => {response}...')
        test_url = self.test_url
        current_url = response.url
        productname = response.css('h1.product-name::text').get()
        productdescriptionfirst = response.css('div.product.attribute.intro')
        productdescriptionsecond = productdescriptionfirst.css('div.value')
        # TODO(review): right-hand side lost in the paste — not valid Python.
        productdescription =
        }'
        # Other stuff ....
class TitleSpider(scrapy.Spider):
    # Workaround spider (duplicate paste): all responses go through
    # parse(), which dispatches on a request.meta flag instead of using
    # a second callback method.
    # NOTE(review): this paste is truncated/garbled near the end — the
    # `productdescription =` assignment has no right-hand side and the
    # stray `}'` line is not valid Python; the original expression was
    # lost when the snippet was posted.
    name = 'title_spider'
    allowed_domains = ['apc.fr']
    start_urls = [
        "https://www.apc.fr/women/women-blouses-shirts.html",
    ]
    # Last listing-page URL seen; stored on the class for later comparison.
    test_url = None
    def parse(self, response: Response):
        # Product pages are marked via meta['isProductPage']; anything
        # else is treated as a listing page.
        if 'isProductPage' in response.meta:
            yield from self._parse_prod(response)
        else:
            Actor.log.info(f'Main page => {response}...')
            li_elements = response.css('li.product-item')
            for li in li_elements:
                productlink_container = li.css('.product-link')
                product_links = productlink_container.css('a::attr(href)').getall()
                for link in product_links:
                    # Remember which listing URL produced this link.
                    self.test_url = response.url
                    # No explicit callback: the request returns to parse(),
                    # and the meta flag routes it to _parse_prod().
                    yield scrapy.Request(link, meta={"isProductPage": True})
    def _parse_prod(self, response: Response):
        # Product-page handler (reached only via the meta flag above).
        Actor.log.info(f'Product Page => {response}...')
        test_url = self.test_url
        current_url = response.url
        productname = response.css('h1.product-name::text').get()
        productdescriptionfirst = response.css('div.product.attribute.intro')
        productdescriptionsecond = productdescriptionfirst.css('div.value')
        # TODO(review): right-hand side lost in the paste — not valid Python.
        productdescription =
        }'
        # Other stuff ....
harsh-harlequin•2y ago
Hey, sorry for the late reply but the issue should be fixed!