I'm working on a Scrapy project and have a custom spider defined as follows:
from scrapy import Request, Spider


class JosephCrawlSpider(Spider):
    parse_spider = JosephParseSpider()

    def start_requests(self):
        for url in self.start_urls:
            category = self.extract_category_from_url(url)
            yield Request(
                url, callback=self.parse, cookies=self.cookies,
                meta={'category': category}
            )

    def parse(self, response):
        product_links = response.css('.product-name a.name-link::attr(href)').getall()
        if not product_links:
            return

        category = response.meta.get('category', 'unknown')
        for link in product_links:
            yield response.follow(
                link, callback=self.parse_product, cookies=self.cookies,
                meta={'category': category}
            )

        # Paginate by bumping the "start" offset by the page size (12);
        # the first listing URL may not carry a "start=" parameter yet.
        if 'start=' in response.url:
            start = int(response.url.split('start=')[-1].split('&')[0])
        else:
            start = 0
        next_start = start + 12
        next_page = (
            f"https://www.joseph-fashion.com/en-gb/womens/"
            f"?start={next_start}&sz=12&format=page-element&rowNumber=4"
            f"&currentView=model_view"
        )
        yield Request(
            url=next_page, callback=self.parse, cookies=self.cookies,
            meta={'category': category}
        )

    def parse_product(self, response):
        # Stash the raw response so the parse spider can use it.
        response.meta['raw_product'] = response
        yield from self.parse_item(response)

    def parse_item(self, response):
        return self.parse_spider.parse(response)

    def extract_category_from_url(self, url):
        # Return up to three path segments starting at "womens",
        # e.g. ['womens', 'clothing', 'dresses'].
        parts = url.split('/')
        try:
            start_index = parts.index("womens")
            return parts[start_index:start_index + 3]
        except ValueError:
            return []
I'd like to refactor this spider to use Scrapy's Rule and LinkExtractor classes to handle the extraction of product links and the pagination. How should I modify the spider to take advantage of these classes?

I'm particularly interested in how to set up the Rule objects that extract product links and handle pagination in the JosephCrawlSpider class. Any guidance or examples on integrating Rule and LinkExtractor would be greatly appreciated!
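In case it helps frame the question, here is a rough sketch of the direction I have in mind, reusing JosephParseSpider, self.cookies, and extract_category_from_url from the code above. The restrict_css selector, the start=\d+ allow pattern, and the carry_over hook are my own guesses: as far as I understand, requests generated by a Rule don't inherit cookies or meta on their own, and a Rule's process_request hook only receives the response in Scrapy 2.0+, so all of this may need adjusting:

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JosephCrawlSpider(CrawlSpider):
    # JosephParseSpider comes from my own project, as above.
    parse_spider = JosephParseSpider()

    rules = (
        # Product pages: extract detail links from the listing grid.
        Rule(
            LinkExtractor(restrict_css='.product-name a.name-link'),
            callback='parse_item',
            process_request='carry_over',
        ),
        # Pagination: follow "start=N" listing links so the rules run
        # again on every page (assumes such links exist in the HTML).
        Rule(
            LinkExtractor(allow=r'start=\d+'),
            follow=True,
            process_request='carry_over',
        ),
    )

    def start_requests(self):
        # No explicit callback here: CrawlSpider's built-in parse() has
        # to run so that the rules above are applied to each listing page.
        for url in self.start_urls:
            yield Request(
                url, cookies=self.cookies,
                meta={'category': self.extract_category_from_url(url)},
            )

    def carry_over(self, request, response):
        # My guess at propagating cookies and the category, since Rule
        # requests don't inherit them (signature assumes Scrapy >= 2.0).
        request = request.replace(cookies=self.cookies)
        request.meta['category'] = response.meta.get('category', 'unknown')
        return request

    def parse_item(self, response):
        return self.parse_spider.parse(response)

    def extract_category_from_url(self, url):
        parts = url.split('/')
        try:
            start_index = parts.index("womens")
            return parts[start_index:start_index + 3]
        except ValueError:
            return []

If pagination links matching start=N don't actually appear in the listing HTML, I assume I'd have to keep building the next-page URL manually, as in my current parse().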