爬取网站内容并保存为PDF格式
1、安装pdf依赖包 pip install pdfkit
但是使用pdfkit时,还是会报错
Traceback (most recent call last):
  File "C:\Users\zhan\AppData\Roaming\Python\Python36\site-packages\pdfkit\configuration.py", line 21, in __init__
    with open(self.wkhtmltopdf) as f:
FileNotFoundError: [Errno 2] No such file or directory: b''

During handling of the above exception, another exception occurred:

OSError: No wkhtmltopdf executable found: "b''"
If this file exists please check that this process can read it. Otherwise please install wkhtmltopdf - https://github.com/JazzCore/python-pdfkit/wiki/Installing-wkhtmltopdf
根据提示官网下载 wkhtmltopdf ,并安装记录安装路径。
通过如下代码使用pdfkit
# path_wk = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # 安装位置
# config = pdfkit.configuration(wkhtmltopdf=path_wk)
# pdfkit.from_string("hello world", "1.pdf", configuration=config)
准备工作完成后开始代码实现:
#!/usr/bin/env python
# coding: utf8
"""Crawl a documentation site's table of contents and save each chapter as a PDF.

Requires the ``wkhtmltopdf`` executable to be installed; ``pdfkit`` is only a
thin wrapper that shells out to it.
"""
import os
import re
import sys
from urllib.parse import urljoin

import pdfkit
import requests

# Defaults reproduce the values that were previously hard-coded in __init__.
DEFAULT_WKHTMLTOPDF_PATH = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
DEFAULT_INDEX_URL = "http://www.apelearn.com/study_v2/"
DEFAULT_OUTPUT_DIR = "aminglinuxbook"

# Seconds to wait for the index page; without a timeout requests can block forever.
REQUEST_TIMEOUT_SECONDS = 30

# Characters that are illegal in Windows file names (also covers path
# separators, so a link such as "sub/page.html" cannot escape the output dir).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_pdf_name(href, title):
    """Build the PDF file name "<href stem>-<title>.pdf" with unsafe characters replaced by "_"."""
    stem = "{0}-{1}".format(href.split('.')[0], title).strip()
    return _UNSAFE_FILENAME_CHARS.sub('_', stem) + ".pdf"


class HtmlToPdf:
    """Download every chapter linked from an index page and render it to PDF.

    Args:
        path_wk: Full path to the wkhtmltopdf executable.
        url: Index page URL; chapter links are resolved relative to it.
        dirName: Directory the PDF files are written to (created on demand).

    Raises:
        OSError: Raised by ``pdfkit.configuration`` when the wkhtmltopdf
            executable cannot be found at ``path_wk``.
    """

    def __init__(self, path_wk=DEFAULT_WKHTMLTOPDF_PATH, url=DEFAULT_INDEX_URL,
                 dirName=DEFAULT_OUTPUT_DIR):
        self.path_wk = path_wk
        self.config = pdfkit.configuration(wkhtmltopdf=self.path_wk)
        self.url = url
        # Captures (href, link text) for each top-level table-of-contents entry.
        self.reg = re.compile(r'<li class="toctree-l1"><a.*?href="(.*?)">(.*?)</a></li>')
        self.dirName = dirName
        # List of unique (href, title) tuples, filled in by get_html().
        self.result = []
        self.chapter = ""
        self.chapter_content = ""

    def get_html(self):
        """Fetch the index page and store its unique chapter links in ``self.result``.

        Links keep the order in which they appear on the page, so repeated
        runs process chapters in the same sequence.
        """
        # The context manager releases the pooled connection when done.
        with requests.session() as session:
            response = session.get(self.url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.encoding = 'utf-8'

        seen = set()
        unique_links = []
        for link in self.reg.findall(response.text):
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        self.result = unique_links

    def get_pdfdir(self):
        """Create the output directory if it does not exist yet."""
        os.makedirs(self.dirName, exist_ok=True)

    def get_chapter(self):
        """Render every chapter collected by get_html() into its own PDF file.

        A failure on one chapter is printed and skipped so that the remaining
        chapters are still converted.
        """
        self.get_pdfdir()
        for href, title in self.result:
            pdfFileName = _safe_pdf_name(href, title)
            # urljoin also copes with absolute or root-relative hrefs.
            pdfUrl = urljoin(self.url, href)
            filePath = os.path.join(self.dirName, pdfFileName)
            print(pdfUrl)
            print(filePath)
            try:
                pdfkit.from_url(pdfUrl, filePath, configuration=self.config)
            except Exception as e:
                # Deliberate best-effort: report the error and keep going.
                print(e)


def main():
    """Fetch the chapter list and convert every chapter to PDF."""
    html2pdf = HtmlToPdf()
    html2pdf.get_html()
    html2pdf.get_chapter()


if __name__ == "__main__":
    main()
运行结果:
在目录中查看下载到的PDF文件
数据处理(html to pdf)
原文地址:https://www.cnblogs.com/pythonlx/p/8280155.html