- spider.py文件配置
# -*- coding: utf-8 -*-
import scrapy
from itTeachers.items import ItteachersItem


class ItcastSpider(scrapy.Spider):
    """Spider that scrapes the teacher listing page on itcast.cn."""

    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#']

    def parse(self, response):
        """Build one item per teacher entry found in the response.

        Every ``<div class="li_txt">`` node yields one ``ItteachersItem``
        whose ``name``, ``title`` and ``info`` fields are the first text node
        of its ``h3``, ``h4`` and ``p`` children respectively.  When one of
        those nodes is absent the field is set to an empty string, so a
        single incomplete entry no longer aborts the whole page with an
        ``IndexError``.

        :param response: the downloaded page to extract teacher entries from.
        :return: list of populated ``ItteachersItem`` objects.
        """
        # Debugging aid: uncomment to dump the raw page to disk
        # (binary mode, because response.body is bytes).
        # with open("teacher.html", "wb") as f:
        #     f.write(response.body)

        items = []

        teacher_list = response.xpath('//div[@class="li_txt"]')
        for each in teacher_list:
            # Wrap the extracted data in an ItteachersItem object.
            item = ItteachersItem()

            # xpath() returns a list of matches; extract_first() takes the
            # first one and falls back to the default when there is none.
            item['name'] = each.xpath('h3/text()').extract_first(default='')
            item['title'] = each.xpath('h4/text()').extract_first(default='')
            item['info'] = each.xpath('p/text()').extract_first(default='')

            items.append(item)

        # Return the collected items in one go.
        return items
- items.py文件配置
# -*- coding: utf-8 -*-

# Item models for the scraped data.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ItteachersItem(scrapy.Item):
    """Container for one teacher entry produced by the spider."""

    # Text of the entry's <h3> node.
    name = scrapy.Field()
    # Text of the entry's <h4> node.
    title = scrapy.Field()
    # Text of the entry's <p> node.
    info = scrapy.Field()
运行爬虫并将结果导出为 JSON 文件:`scrapy crawl itcast -o teachers.json`(爬虫案例)
原文地址:https://www.cnblogs.com/hizf/p/8270008.html