# -*- coding: utf-8 -*-
import scrapy

from yg.items import YgItem


class YgSpiderSpider(scrapy.Spider):
    """Crawl the Sunshine Hotline complaint board (wz.sun0769.com).

    Walks the paginated complaint list, follows each entry to its
    detail page, and yields one ``YgItem`` per complaint.
    """
    name = 'yg_spider'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Parse one list page: emit a detail request per row, then follow pagination."""
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YgItem()
            item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
            item["update_time"] = tr.xpath("./td[last()]/text()").extract_first()

            # extract_first() returns None when the row has no matching link
            # (e.g. header/spacer rows); scrapy.Request would raise on a None
            # URL, so skip those rows instead of crashing the crawl.
            if item["href"] is None:
                continue

            yield scrapy.Request(
                # urljoin tolerates both absolute and site-relative hrefs.
                response.urljoin(item["href"]),
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Pagination: the ">" anchor points at the next list page.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Parse a complaint detail page: fill in content text and image URLs."""
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Image srcs on this site are root-relative; prefix the host to get
        # fetchable absolute URLs.
        item["content_img"] = ["http://wz.sun0769.com" + i for i in item["content_img"]]
        yield item
# -*- coding: utf-8 -*-

# Item pipeline: cleans scraped complaint text and appends items to yg.txt.
# Don't forget to enable it via the ITEM_PIPELINES setting.
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import json


class YgPipeline(object):
    """Strip whitespace noise from item content and persist items as JSON."""

    def process_item(self, item, spider):
        """Clean the item's ``content`` field and append the item to yg.txt.

        Returns the (mutated) item so downstream pipelines keep working.
        """
        item["content"] = self.process_content(item["content"])
        with open("yg.txt", "a", encoding="utf-8") as f:
            f.write(json.dumps(dict(item), ensure_ascii=False, indent=4))
            f.write("\n")
        return item

    def process_content(self, content):
        """Remove non-breaking spaces and whitespace; drop fragments left empty."""
        stripped = (re.sub(r'\xa0|\s', "", fragment) for fragment in content)
        return [fragment for fragment in stripped if fragment]
# -*- coding: utf-8 -*-

# Models for the scraped complaint items.
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YgItem(scrapy.Item):
    """One complaint entry scraped from wz.sun0769.com."""
    title = scrapy.Field()        # complaint title from the list page
    update_time = scrapy.Field()  # last-update timestamp from the list page
    href = scrapy.Field()         # detail-page URL
    content = scrapy.Field()      # full complaint text fragments
    content_img = scrapy.Field()  # absolute URLs of images in the complaint
投诉网站爬虫
原文地址:https://www.cnblogs.com/sure-feng/p/10092283.html