python - I want Scrapy to run through each item once -
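I want Scrapy to run through each item once so that the relevant data is grouped together. As it is, it just puts all the links, headers, dates, etc. together. It is also posting everything to the file more than once. I am pretty new to both Scrapy and Python, so I would be grateful for any advice.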
Here is my spider code:
from scrapy.spiders import Spider
from scrapy.selector import Selector

from fashioblog.functions import extract_data
from fashioblog.items import fashioblog


class firstspider(Spider):
    """Spider that scrapes the posts listed on stopitrightnow.com.

    One ``fashioblog`` item is yielded per ``<div class="post-outer">``
    element found on the start page.
    """

    name = "first"
    allowed_domains = ["stopitrightnow.com"]
    start_urls = ["http://www.stopitrightnow.com"]

    def parse(self, response):
        """Yield one item for each post container in ``response``.

        Every field is looked up with an XPath that starts with ``.//`` so
        the search is confined to the current post. An expression starting
        with ``//`` is evaluated against the whole document even when it is
        called on a sub-selector, which makes every item contain the data of
        all posts and repeats that same data once per post.
        """
        sel = Selector(response)
        for site in sel.xpath('//div[@class="post-outer"]'):
            item = fashioblog()
            item['title'] = extract_data(site.xpath(
                './/h3[normalize-space(@class)="post-title entry-title"]//text()'
            ).extract())
            item['url'] = extract_data(site.xpath(
                './/div[normalize-space(@class)="post-body entry-content"]//@href'
            ).extract())
            # NOTE(review): this only matches if the date header sits inside
            # the post container; if the page puts it outside (e.g. as a
            # preceding sibling of the post), a `preceding::`/`ancestor::`
            # axis is needed instead -- confirm against the page markup.
            item['date'] = extract_data(site.xpath(
                './/h2[normalize-space(@class)="date-header"]/span/text()'
            ).extract())
            item['labellink'] = extract_data(site.xpath(
                './/span[normalize-space(@class)="post-labels"]//@href'
            ).extract())
            item['comment'] = extract_data(site.xpath(
                './/span[normalize-space(@class)="post-comment-link"]//text()'
            ).extract())
            item['picurl'] = extract_data(site.xpath(
                './/div[normalize-space(@class)="separator"]//@href'
            ).extract())
            # Fields below were disabled by the author. If re-enabled, they
            # need the same leading "." and the stray "(" in the last two
            # expressions must be removed.
            # item['body'] = site.xpath('//div[@class="post-body entry-content"]/i/text()').extract()
            # item['labeltext'] = extract_data(site.xpath('(//i//text()').extract())
            # item['labellink2'] = extract_data(site.xpath('(//i//@href').extract())
            yield item
Make your expressions context-specific by prepending a dot:
item['title'] = extract_data(site.xpath('.//h3[normalize-space(@class)="post-title entry-title"]//text()').extract())
#                                        ^ here: the leading dot makes the XPath relative to `site`
Comments
Post a Comment