分享web开发知识

注册/登录|最近发布|今日推荐

主页 IT知识网页技术软件开发前端开发代码编程运营维护技术分享教程案例
当前位置:首页 > 前端开发

scrapy-redis使redis不止保存url

发布时间:2023-09-06 02:26责任编辑:林大明关键词:urlredis

先看scrapy-redis源码

 ?1 class RedisMixin(object): ?2 ????"""Mixin class to implement reading urls from a redis queue.""" ?3 ????redis_key = None ?4 ????redis_batch_size = None ?5 ????redis_encoding = None ?6 ??7 ????# Redis client placeholder. ?8 ????server = None ?9 ?10 ????def start_requests(self): 11 ????????"""Returns a batch of start requests from redis.""" 12 ????????return self.next_requests() 13 ?14 ????def setup_redis(self, crawler=None): 15 ????????"""Setup redis connection and idle signal. 16 ?17 ????????This should be called after the spider has set its crawler object. 18 ????????""" 19 ????????if self.server is not None: 20 ????????????return 21 ?22 ????????if crawler is None: 23 ????????????# We allow optional crawler argument to keep backwards 24 ????????????# compatibility. 25 ????????????# XXX: Raise a deprecation warning. 26 ????????????crawler = getattr(self, ‘crawler‘, None) 27 ?28 ????????if crawler is None: 29 ????????????raise ValueError("crawler is required") 30 ?31 ????????settings = crawler.settings 32 ?33 ????????if self.redis_key is None: 34 ????????????self.redis_key = settings.get( 35 ????????????????‘REDIS_START_URLS_KEY‘, defaults.START_URLS_KEY, 36 ????????????) 37 ?38 ????????self.redis_key = self.redis_key % {‘name‘: self.name} 39 ?40 ????????if not self.redis_key.strip(): 41 ????????????raise ValueError("redis_key must not be empty") 42 ?43 ????????if self.redis_batch_size is None: 44 ????????????# TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE). 45 ????????????self.redis_batch_size = settings.getint( 46 ????????????????‘REDIS_START_URLS_BATCH_SIZE‘, 47 ????????????????settings.getint(‘CONCURRENT_REQUESTS‘), 48 ????????????) 
49 ?50 ????????try: 51 ????????????self.redis_batch_size = int(self.redis_batch_size) 52 ????????except (TypeError, ValueError): 53 ????????????raise ValueError("redis_batch_size must be an integer") 54 ?55 ????????if self.redis_encoding is None: 56 ????????????self.redis_encoding = settings.get(‘REDIS_ENCODING‘, defaults.REDIS_ENCODING) 57 ?58 ????????self.logger.info("Reading start URLs from redis key ‘%(redis_key)s‘ " 59 ?????????????????????????"(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s", 60 ?????????????????????????self.__dict__) 61 ?62 ????????self.server = connection.from_settings(crawler.settings) 63 ????????# The idle signal is called when the spider has no requests left, 64 ????????# that‘s when we will schedule new requests from redis queue 65 ????????crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 66 ?67 ????def next_requests(self): 68 ????????"""Returns a request to be scheduled or none.""" 69 ????????use_set = self.settings.getbool(‘REDIS_START_URLS_AS_SET‘, defaults.START_URLS_AS_SET) 70 ????????fetch_one = self.server.spop if use_set else self.server.lpop 71 ????????# XXX: Do we need to use a timeout here? 72 ????????found = 0 73 ????????# TODO: Use redis pipeline execution. 74 ????????while found < self.redis_batch_size: 75 ????????????data = fetch_one(self.redis_key) 76 ????????????if not data: 77 ????????????????# Queue empty. 78 ????????????????break 79 ????????????req = self.make_request_from_data(data) 80 ????????????if req: 81 ????????????????yield req 82 ????????????????found += 1 83 ????????????else: 84 ????????????????self.logger.debug("Request not made from data: %r", data) 85 ?86 ????????if found: 87 ????????????self.logger.debug("Read %s requests from ‘%s‘", found, self.redis_key) 88 ?89 ????def make_request_from_data(self, data): 90 ????????"""Returns a Request instance from data coming from Redis. 91 ?92 ????????By default, ``data`` is an encoded URL. 
You can override this method to 93 ????????provide your own message decoding. 94 ?95 ????????Parameters 96 ????????---------- 97 ????????data : bytes 98 ????????????Message from redis. 99 100 ????????"""101 ????????url = bytes_to_str(data, self.redis_encoding)102 ????????return self.make_requests_from_url(url)103 104 ????def schedule_next_requests(self):105 ????????"""Schedules a request if available"""106 ????????# TODO: While there is capacity, schedule a batch of redis requests.107 ????????for req in self.next_requests():108 ????????????self.crawler.engine.crawl(req, spider=self)109 110 ????def spider_idle(self):111 ????????"""Schedules a request if available, otherwise waits."""112 ????????# XXX: Handle a sentinel to close the spider.113 ????????self.schedule_next_requests()114 ????????raise DontCloseSpider115 116 117 class RedisSpider(RedisMixin, Spider):118 ????"""Spider that reads urls from redis queue when idle.119 120 ????Attributes121 ????----------122 ????redis_key : str (default: REDIS_START_URLS_KEY)123 ????????Redis key where to fetch start URLs from..124 ????redis_batch_size : int (default: CONCURRENT_REQUESTS)125 ????????Number of messages to fetch from redis on each attempt.126 ????redis_encoding : str (default: REDIS_ENCODING)127 ????????Encoding to use when decoding messages from redis queue.128 129 ????Settings130 ????--------131 ????REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")132 ????????Default Redis key where to fetch start URLs from..133 ????REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)134 ????????Default number of messages to fetch from redis on each attempt.135 ????REDIS_START_URLS_AS_SET : bool (default: False)136 ????????Use SET operations to retrieve messages from the redis queue. 
If False,137 ????????the messages are retrieve using the LPOP command.138 ????REDIS_ENCODING : str (default: "utf-8")139 ????????Default encoding to use when decoding messages from redis queue.140 141 ????"""142 143 ????@classmethod144 ????def from_crawler(self, crawler, *args, **kwargs):145 ????????obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)146 ????????obj.setup_redis(crawler)147 ????????return obj148 149 150 class RedisCrawlSpider(RedisMixin, CrawlSpider):151 ????"""Spider that reads urls from redis queue when idle.152 153 ????Attributes154 ????----------155 ????redis_key : str (default: REDIS_START_URLS_KEY)156 ????????Redis key where to fetch start URLs from..157 ????redis_batch_size : int (default: CONCURRENT_REQUESTS)158 ????????Number of messages to fetch from redis on each attempt.159 ????redis_encoding : str (default: REDIS_ENCODING)160 ????????Encoding to use when decoding messages from redis queue.161 162 ????Settings163 ????--------164 ????REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")165 ????????Default Redis key where to fetch start URLs from..166 ????REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)167 ????????Default number of messages to fetch from redis on each attempt.168 ????REDIS_START_URLS_AS_SET : bool (default: True)169 ????????Use SET operations to retrieve messages from the redis queue.170 ????REDIS_ENCODING : str (default: "utf-8")171 ????????Default encoding to use when decoding messages from redis queue.172 173 ????"""174 175 ????@classmethod176 ????def from_crawler(self, crawler, *args, **kwargs):177 ????????obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)178 ????????obj.setup_redis(crawler)179 ????????return obj

仔细看完的话会发现

make_request_from_data(self, data)
这个方法是根据从 redis 中取出的数据返回一个请求实例，默认这条数据就是一个 url。
接下来重写一下这个方法，直接向
self.make_requests_from_url
传入一个 json 串就好了。
在这个方法里面可以把这个串解析出来，用于请求 url 或者生成 url。
代码如下
 1 ????def make_request_from_data(self, data): 2 ????????‘‘‘ 3 ????????:params data bytes, Message from redis 4 ????????‘‘‘ 5 ????????company = bytes_to_str(data, self.redis_encoding) 6 ????????return self.make_requests_from_url(company) 7 ?8 ????def make_requests_from_url(self, company): 9 ????????data = eval(company)10 ????????url = data["url"]11 ????????headers = {12 ????????????"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",13 ????????????"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"14 ????????}15 ????????return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)

值得注意的是

不能在make_request_from_data方法中直接使用Request(其他第三方的也不支持),会导致方法无法执行,也不抛出异常
但是同时重写make_request_from_data和make_requests_from_url方法则可以执行

scrapy-redis使redis不止保存url

原文地址:https://www.cnblogs.com/ltn26/p/10120444.html

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved