0. The Problem and Its Cause
The log links in Scrapyd's Web Interface point directly at the raw log files, and the Content-Type in the Response Headers does not declare charset=UTF-8, so non-ASCII characters appear garbled when a log is viewed in the browser.
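The cause is easy to confirm by inspecting the response headers; a minimal sketch, assuming a running local Scrapyd and an existing log file (the job id is illustrative):

import urllib.request

# Fetch a log file the same way the browser does and inspect its Content-Type.
resp = urllib.request.urlopen(
    "http://127.0.0.1:6800/logs/proxy/test/cd2cc82a87f111e8ac72b827ebc33e0b.log")
print(resp.headers["Content-Type"])  # "text/plain" -- no charset, so the browser guesses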
1. Solution Approach
(1) On the Jobs page, add a UTF-8 hyperlink that carries the job's project information, e.g. http://127.0.0.1:6800/logs/UTF-8.html?project=proxy&spider=test&job=cd2cc82a87f111e8ac72b827ebc33e0b
(2) Create a new UTF-8.html in Scrapyd's logs directory that declares the encoding via <meta charset="UTF-8">
(3) When the hyperlink opens in a new tab, JS reads the URL query pairs and updates the src attribute of the iframe in UTF-8.html, e.g. <iframe src="/logs/proxy/test/9a7ee91287f111e8ac72b827ebc33e0b.log" width="100%" height="100%"></iframe> (see the Python sketch after this list)
(4) The browser then loads the iframe automatically and fetches the log file
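The mapping performed in steps (3) and (4) can be sketched in Python for clarity (the URL is the example from step (1)):

from urllib.parse import urlparse, parse_qs

# Step (3) in Python: turn the query pairs of the UTF-8.html link into the log path.
url = ("http://127.0.0.1:6800/logs/UTF-8.html"
       "?project=proxy&spider=test&job=cd2cc82a87f111e8ac72b827ebc33e0b")
qs = parse_qs(urlparse(url).query)
src = "/logs/%s/%s/%s.log" % (qs["project"][0], qs["spider"][0], qs["job"][0])
print(src)  # /logs/proxy/test/cd2cc82a87f111e8ac72b827ebc33e0b.log -- the iframe src of step (4)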
2. Modifying the Scrapyd Code
/site-packages/scrapyd/website.py
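The exact path varies by environment; assuming scrapyd is importable from the active Python, the installed file can be located with:

import scrapyd.website

# Print the absolute path of the website.py to edit.
print(scrapyd.website.__file__)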
Changes:
(1) Append one more column to the table; see the lines marked with ########
    def render(self, txrequest):
        cols = 9  ######## 8
        s = "<html><head><meta charset='UTF-8'><title>Scrapyd</title></head>"
        s += "<body>"
        s += "<h1>Jobs</h1>"
        s += "<p><a href='..'>Go back</a></p>"
        s += "<table border='1'>"
        s += "<tr><th>Project</th><th>Spider</th><th>Job</th><th>PID</th><th>Start</th><th>Runtime</th><th>Finish</th><th>Log</th>"
        if self.local_items:
            s += "<th>Items</th>"
            #cols = 9 ########
            cols += 1  ########
(2) The UTF-8 hyperlink must be added in two places, once for Running and once for Finished; see the lines marked with ########
            s += "<td><a href='/logs/%s/%s/%s.log'>Log</a></td>" % (p.project, p.spider, p.job)
            s += "<td><a href='/logs/UTF-8.html?project=%s&spider=%s&job=%s' target='_blank'>UTF-8</a></td>" % (p.project, p.spider, p.job)  ########
(3) Full code:
from datetime import datetime
import socket

from twisted.web import resource, static
from twisted.application.service import IServiceCollection
from scrapy.utils.misc import load_object
from .interfaces import IPoller, IEggStorage, ISpiderScheduler

from six.moves.urllib.parse import urlparse


class Root(resource.Resource):

    def __init__(self, config, app):
        resource.Resource.__init__(self)
        self.debug = config.getboolean('debug', False)
        self.runner = config.get('runner')
        logsdir = config.get('logs_dir')
        itemsdir = config.get('items_dir')
        local_items = itemsdir and (urlparse(itemsdir).scheme.lower() in ['', 'file'])
        self.app = app
        self.nodename = config.get('node_name', socket.gethostname())
        self.putChild(b'', Home(self, local_items))
        if logsdir:
            self.putChild(b'logs', static.File(logsdir.encode('ascii', 'ignore'), 'text/plain'))
        if local_items:
            self.putChild(b'items', static.File(itemsdir, 'text/plain'))
        self.putChild(b'jobs', Jobs(self, local_items))
        services = config.items('services', ())
        for servName, servClsName in services:
            servCls = load_object(servClsName)
            self.putChild(servName.encode('utf-8'), servCls(self))
        self.update_projects()

    def update_projects(self):
        self.poller.update_projects()
        self.scheduler.update_projects()

    @property
    def launcher(self):
        app = IServiceCollection(self.app, self.app)
        return app.getServiceNamed('launcher')

    @property
    def scheduler(self):
        return self.app.getComponent(ISpiderScheduler)

    @property
    def eggstorage(self):
        return self.app.getComponent(IEggStorage)

    @property
    def poller(self):
        return self.app.getComponent(IPoller)


class Home(resource.Resource):

    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items

    def render_GET(self, txrequest):
        vars = {
            'projects': ', '.join(self.root.scheduler.list_projects())
        }
        s = """<html>
<head><meta charset='UTF-8'><title>Scrapyd</title></head>
<body>
<h1>Scrapyd</h1>
<p>Available projects: <b>%(projects)s</b></p>
<ul>
<li><a href="/jobs">Jobs</a></li>
""" % vars
        if self.local_items:
            s += '<li><a href="/items/">Items</a></li>'
        s += """
<li><a href="/logs/">Logs</a></li>
<li><a href="http://scrapyd.readthedocs.org/en/latest/">Documentation</a></li>
</ul>

<h2>How to schedule a spider?</h2>

<p>To schedule a spider you need to use the API (this web UI is only for
monitoring)</p>

<p>Example using <a href="http://curl.haxx.se/">curl</a>:</p>
<p><code>curl http://localhost:6800/schedule.json -d project=default -d spider=somespider</code></p>

<p>For more information about the API, see the <a href="http://scrapyd.readthedocs.org/en/latest/">Scrapyd documentation</a></p>
</body>
</html>
""" % vars
        return s.encode('utf-8')


class Jobs(resource.Resource):

    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items

    def render(self, txrequest):
        cols = 9  ######## 8
        s = "<html><head><meta charset='UTF-8'><title>Scrapyd</title></head>"
        s += "<body>"
        s += "<h1>Jobs</h1>"
        s += "<p><a href='..'>Go back</a></p>"
        s += "<table border='1'>"
        s += "<tr><th>Project</th><th>Spider</th><th>Job</th><th>PID</th><th>Start</th><th>Runtime</th><th>Finish</th><th>Log</th>"
        if self.local_items:
            s += "<th>Items</th>"
            #cols = 9 ########
            cols += 1  ########
        s += "</tr>"
        s += "<tr><th colspan='%s' style='background-color: #ddd'>Pending</th></tr>" % cols
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                s += "<tr>"
                s += "<td>%s</td>" % project
                s += "<td>%s</td>" % str(m['name'])
                s += "<td>%s</td>" % str(m['_job'])
                s += "</tr>"
        s += "<tr><th colspan='%s' style='background-color: #ddd'>Running</th></tr>" % cols
        for p in self.root.launcher.processes.values():
            s += "<tr>"
            for a in ['project', 'spider', 'job', 'pid']:
                s += "<td>%s</td>" % getattr(p, a)
            s += "<td>%s</td>" % p.start_time.replace(microsecond=0)
            s += "<td>%s</td>" % (datetime.now().replace(microsecond=0) - p.start_time.replace(microsecond=0))
            s += "<td></td>"
            s += "<td><a href='/logs/%s/%s/%s.log'>Log</a></td>" % (p.project, p.spider, p.job)
            s += "<td><a href='/logs/UTF-8.html?project=%s&spider=%s&job=%s' target='_blank'>UTF-8</a></td>" % (p.project, p.spider, p.job)  ########
            if self.local_items:
                s += "<td><a href='/items/%s/%s/%s.jl'>Items</a></td>" % (p.project, p.spider, p.job)
            s += "</tr>"
        s += "<tr><th colspan='%s' style='background-color: #ddd'>Finished</th></tr>" % cols
        for p in self.root.launcher.finished:
            s += "<tr>"
            for a in ['project', 'spider', 'job']:
                s += "<td>%s</td>" % getattr(p, a)
            s += "<td></td>"
            s += "<td>%s</td>" % p.start_time.replace(microsecond=0)
            s += "<td>%s</td>" % (p.end_time.replace(microsecond=0) - p.start_time.replace(microsecond=0))
            s += "<td>%s</td>" % p.end_time.replace(microsecond=0)
            s += "<td><a href='/logs/%s/%s/%s.log'>Log</a></td>" % (p.project, p.spider, p.job)
            s += "<td><a href='/logs/UTF-8.html?project=%s&spider=%s&job=%s' target='_blank'>UTF-8</a></td>" % (p.project, p.spider, p.job)  ########
            if self.local_items:
                s += "<td><a href='/items/%s/%s/%s.jl'>Items</a></td>" % (p.project, p.spider, p.job)
            s += "</tr>"
        s += "</table>"
        s += "</body>"
        s += "</html>"
        txrequest.setHeader('Content-Type', 'text/html; charset=utf-8')
        txrequest.setHeader('Content-Length', len(s))
        return s.encode('utf-8')
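As an aside: Root.__init__ above serves the logs directory through twisted.web.static.File with a default type of 'text/plain'. An alternative fix, not the approach taken in this post, would be to declare the charset right there, so every .log response carries it without the extra page; a sketch, assuming Twisted falls back to defaultType for the unrecognized .log extension:

        if logsdir:
            # ".log" is absent from static.File's extension-to-MIME table, so the
            # second argument (defaultType) becomes the Content-Type of log responses.
            self.putChild(b'logs', static.File(logsdir.encode('ascii', 'ignore'),
                                               'text/plain; charset=utf-8'))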
3. Creating the UTF-8.html Page
Determine the logs_dir that Scrapyd actually uses (see http://scrapyd.readthedocs.io/en/stable/config.html), then add the following UTF-8.html file to that directory.
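The effective value can also be read programmatically through Scrapyd's own config helper, assuming scrapyd is importable:

from scrapyd.config import Config

# Resolve logs_dir the same way Scrapyd itself does at startup.
print(Config().get('logs_dir'))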
<html>
<head><meta charset="UTF-8"></head>
<iframe src="" width="100%" height="100%"></iframe>
<script>
function parseQueryString(url) {
    var urlParams = {};
    url.replace(
        new RegExp("([^?=&]+)(=([^&]*))?", "g"),
        function($0, $1, $2, $3) {
            urlParams[$1] = $3;
        }
    );
    return urlParams;
}
var kwargs = parseQueryString(location.search);
document.querySelector('iframe').src = "/logs/" + kwargs.project + '/' + kwargs.spider + '/' + kwargs.job + '.log';
</script>
</html>
4. Result
Scrapyd improved: the Web Interface now offers log views with charset=UTF-8, so Chinese characters in logs no longer appear garbled.
Original article: https://www.cnblogs.com/my8100/p/scrapyd_add_charset_UTF-8.html