Why does my Scrapy crawler get a different amount of data every time?

There are 20 pages of data on the website, 14 items per page, 280 in total. I have run the spider several times and get a different number of items each time, and there is no error in the log. The code that crawls the listing pages and the detail pages is as follows:

```python
import datetime
import uuid

import scrapy

from ..items import GzStBureauItem  # adjust to your project's items module


class GzDfjrjdSpider(scrapy.Spider):
    name = 'gz_dfjrjd'
    allowed_domains = ['jrjgj.gz.gov.cn']
    start_urls = ['http://jrjgj.gz.gov.cn/tzgg/index.html']
    base_url = 'http://jrjgj.gz.gov.cn/tzgg/index'
    page = 1
    ww = 0  # debug counter (was undefined in the original paste)

    def parse(self, response):
        li_list = response.xpath('//div[@id="main"]/ul/li')
        for li in li_list:
            self.ww = self.ww + 1
            url = li.xpath('./a/@href').extract_first()
            name = li.xpath('./a/@title').extract_first()
            time_str = li.xpath('./span/text()').extract_first()
            date = datetime.datetime.strptime(time_str, '%Y-%m-%d')
            yield scrapy.Request(
                url=url,
                callback=self.parse_second,
                meta={'title': name, 'time': date, 'url': url,
                      'download_timeout': 30})

        # Follow the next listing page. There is no upper bound on self.page,
        # so the request after the last page returns 404 and is ignored.
        # NOTE: the empty-string separator is kept from the original paste;
        # check that it matches the site's real pagination URLs
        # (the original also wrote self.baseurl instead of self.base_url).
        self.page = self.page + 1
        self.url = self.base_url + '' + str(self.page) + '.html'
        yield scrapy.Request(url=self.url, callback=self.parse,
                             meta={'download_timeout': 20})

    def parse_second(self, response):
        content = response.xpath('//div[@class="info_cont"]').getall()
        policy_file_id = uuid.uuid1()
        create_time = datetime.datetime.now()
        policy = GzStBureauItem(
            policy_file_id=policy_file_id,
            title=response.meta['title'],
            goverment='gz_ed',
            area='gzp',
            date=response.meta['time'],
            content="".join(content),
            url=response.meta['url'],
            create_time=create_time)
        yield policy
```
2 Replies
dependent-tan (OP) · 3y ago
```
2023-02-28 15:21:26 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 4,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 2,
 'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 2,
 'downloader/request_bytes': 112499,
 'downloader/request_count': 273,
 'downloader/request_method_count/GET': 273,
 'downloader/response_bytes': 1839915,
 'downloader/response_count': 269,
 'downloader/response_status_count/200': 268,
 'downloader/response_status_count/404': 1,
 'elapsed_time_seconds': 998.274853,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 2, 28, 7, 21, 26, 919976),
 'httpcompression/response_bytes': 8127341,
 'httpcompression/response_count': 237,
 'httperror/response_ignored_count': 1,
 'httperror/response_ignored_status_count/404': 1,
 'item_scraped_count': 233,
 'log_count/DEBUG': 519,
 'log_count/ERROR': 15,
 'log_count/INFO': 27,
 'offsite/domains': 6,
 'offsite/filtered': 18,
 'request_depth_max': 20,
 'response_received_count': 269,
 'retry/count': 4,
 'retry/reason_count/twisted.internet.error.TimeoutError': 2,
 'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 2,
 'scheduler/dequeued': 273,
 'scheduler/dequeued/memory': 273,
 'scheduler/enqueued': 273,
 'scheduler/enqueued/memory': 273,
 'start_time': datetime.datetime(2023, 2, 28, 7, 4, 48, 645123)}
2023-02-28 15:21:26 [scrapy.core.engine] INFO: Spider closed (finished)
```
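For reference, the stats above show 233 items scraped instead of the expected 280, 15 ERROR log entries, 18 requests dropped by the offsite filter, and 4 download exceptions (timeouts and ResponseNeverReceived) that were retried. If part of the flakiness is network-related, one thing that might help is loosening Scrapy's retry and throttling settings; a minimal sketch of the relevant settings.py options (the values are illustrative, not a recommendation):

```python
# settings.py -- illustrative values only
RETRY_ENABLED = True
RETRY_TIMES = 5           # retry flaky pages a few more times than the default
DOWNLOAD_TIMEOUT = 30     # per-request timeout in seconds
DOWNLOAD_DELAY = 0.5      # slow down requests to avoid being throttled
CONCURRENT_REQUESTS = 8   # fewer parallel requests, gentler on the server
```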
mute-gold · 3y ago
I am not familiar with Scrapy, but I would start by adding more logging to see whether particular pages are failing, maybe dumping the HTML that was returned, etc. In most cases the page is either blocked, or e.g. the first URL from above returns status 503.
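Here is a minimal sketch of what "add more logs" could look like for the spider in the question (the errback and the log messages are illustrative additions, not part of the original code):

```python
import scrapy


class GzDfjrjdSpider(scrapy.Spider):
    name = 'gz_dfjrjd'

    def parse(self, response):
        # Log what each listing page actually returned.
        self.logger.info('listing %s -> status %s, %d bytes',
                         response.url, response.status, len(response.body))
        for li in response.xpath('//div[@id="main"]/ul/li'):
            url = li.xpath('./a/@href').extract_first()
            yield scrapy.Request(url=url,
                                 callback=self.parse_second,
                                 errback=self.on_error)  # capture failures too

    def parse_second(self, response):
        # Dump a slice of the HTML so blocked or empty pages show up in the log.
        self.logger.debug('detail %s -> status %s: %r',
                          response.url, response.status, response.text[:200])
        ...

    def on_error(self, failure):
        # Called for timeouts, DNS errors, connection failures, etc.
        self.logger.error('request failed: %s (%s)',
                          failure.request.url, repr(failure.value))
```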
