分享web开发知识

注册/登录|最近发布|今日推荐

主页 | IT知识 | 网页技术 | 软件开发 | 前端开发 | 代码编程 | 运营维护 | 技术分享 | 教程案例
当前位置:首页 > 教程案例

分析Ajax抓取今日头条街拍美图

发布时间:2023-09-06 01:07 | 责任编辑:白小东 | 关键词:Ajax

spider.py

 ?1 # -*- coding:utf-8 -*- ?2 from urllib import urlencode ?3 import requests ?4 from requests.exceptions import RequestException ?5 import json ?6 import re ?7 import os ?8 from hashlib import md5 ?9 from bs4 import BeautifulSoup 10 import pymongo 11 from multiprocessing import Pool 12 from json.decoder import JSONDecoder 13 from config import * 14 ?15 client = pymongo.MongoClient(MONGO_URL, connect=False) 16 db = client[MONGO_DB] 17 ?18 def get_page_index(offset,keyword): 19 ????data = { 20 ????????‘offset‘: offset, 21 ????????‘format‘: ‘json‘, 22 ????????‘keyword‘: keyword, 23 ????????‘autoload‘: ‘true‘, 24 ????????‘count‘: ‘20‘, 25 ????????‘cur_tab‘: 3 26 ????} 27 ????url = ‘http://www.toutiao.com/search_content/?‘ + urlencode(data) 28 ????try: 29 ????????response = requests.get(url) 30 ????????if response.status_code == 200: 31 ????????????return response.text 32 ????????return None 33 ????except RequestException: 34 ????????print u‘请求索引页失败‘, url 35 ????????return None 36 ?37 def parse_page_index(html): 38 ????data = json.loads(html) 39 ????if data and ‘data‘ in data.keys(): 40 ????????for item in data.get(‘data‘): 41 ????????????yield item.get(‘article_url‘) 42 ?43 def get_page_detail(url): 44 ????try: 45 ????????response = requests.get(url) 46 ????????if response.status_code == 200: 47 ????????????return response.text 48 ????????return None 49 ????except RequestException: 50 ????????print u‘请求详情页失败‘, url 51 ????????return None 52 ?53 def parse_page_detail(html, url): 54 ????soup = BeautifulSoup(html, ‘lxml‘) 55 ????title = soup.select(‘title‘)[0].get_text() 56 ????print(title) 57 ????images_pattern = re.compile(‘gallery: (.*?),\n‘, re.S) 58 ????result = re.search(images_pattern, html) 59 ????if result: 60 ????????data = json.loads(result.group(1)) 61 ????????if data and ‘sub_images‘ in data.keys(): 62 ????????????sub_images = data.get(‘sub_images‘) 63 ????????????images = [item.get(‘url‘) for item in sub_images] 64 ????????????for image in images: 
download_image(image) 65 ????????????return { 66 ????????????????‘title‘: title, 67 ????????????????‘url‘: url, 68 ????????????????‘images‘: images 69 ????????????} 70 ?71 def save_to_mongo(result): 72 ????if db[MONGO_TABLE].insert(result): 73 ????????print u‘存储到MongoDB成功‘, result 74 ????????return True 75 ????return False 76 ?77 def download_image(url): 78 ????print u‘正在下载‘, url 79 ????try: 80 ????????response = requests.get(url) 81 ????????if response.status_code == 200: 82 ????????????save_image(response.content) 83 ????????return None 84 ????except RequestException: 85 ????????print u‘请求图片失败‘, url 86 ????????return None 87 ?88 def save_image(content): 89 ????file_path = ‘{0}/{1}.{2}‘.format(os.getcwd(), md5(content).hexdigest(), ‘jpg‘) 90 ????if not os.path.exists(file_path): 91 ????????with open(file_path, ‘wb‘) as f: 92 ????????????f.write(content) 93 ????????????f.close() 94 ?95 def main(offset): 96 ????html = get_page_index(offset, KEYWORD) 97 ????for url in parse_page_index(html): 98 ????????html = get_page_detail(url) 99 ????????if html:100 ????????????result = parse_page_detail(html, url)101 ????????????if result: save_to_mongo(result)102 103 if __name__ == ‘__main__‘:104 ????groups = [x*20 for x in range(GROUP_START, GROUP_END+1)]105 ????pool = Pool()106 ????pool.map(main, groups)
View Code

config.py

# -*- coding:utf-8 -*-
# MongoDB connection settings read by spider.py.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of page groups; spider.py multiplies each value by 20
# to obtain the search offset.
GROUP_START = 0
GROUP_END = 20

# Search keyword sent to the index endpoint.
KEYWORD = '街拍'
View Code

分析Ajax抓取今日头条街拍美图

原文地址:http://www.cnblogs.com/stonelovy/p/7644651.html

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved