spider.py
# -*- coding:utf-8 -*-
"""Crawl image galleries from the toutiao.com search AJAX endpoint.

For every offset in the configured range the search index is requested as
JSON, each listed article page is fetched, the ``gallery: {...}`` JSON blob
embedded in the page is parsed, every image is saved to the current working
directory, and a summary record is stored in MongoDB.

The syntax is valid under both Python 2 and Python 3.
"""
try:
    from urllib import urlencode  # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3
import requests
from requests.exceptions import RequestException
import json
import re
import os
from hashlib import md5
from bs4 import BeautifulSoup
import pymongo
from multiprocessing import Pool
from json.decoder import JSONDecoder  # unused here; kept from the original listing
from config import *

# connect=False defers the connection until first use, so every worker
# process started by the Pool opens its own connection.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Seconds to wait on a socket operation before giving up; without a timeout
# a stalled server would block a worker process forever.
REQUEST_TIMEOUT = 10

# The gallery JSON sits on one line of the article page as ``gallery: {...},``.
_GALLERY_PATTERN = re.compile(r'gallery: (.*?),\n', re.S)


def _fetch(url, failure_label):
    """Return the response for *url*, or None on any failure.

    A request error prints *failure_label* followed by the URL; a non-200
    status returns None silently.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print(u'%s %s' % (failure_label, url))
        return None
    if response.status_code == 200:
        return response
    return None


def get_page_index(offset, keyword):
    """Fetch one page of search results and return the raw JSON text.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: search keyword.

    Returns:
        The response body, or None if the request failed or the status was
        not 200.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    response = _fetch(url, u'请求索引页失败')
    return response.text if response is not None else None


def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search-index response.

    Yields nothing when *html* is empty/None, is not valid JSON, or has no
    usable ``data`` list. Entries without an ``article_url`` are skipped.
    """
    if not html:
        # get_page_index() returns None on failure; json.loads(None) raises.
        return
    try:
        data = json.loads(html)
    except ValueError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be present but null, hence the ``or []``.
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page and return its HTML, or None on failure."""
    response = _fetch(url, u'请求详情页失败')
    return response.text if response is not None else None


def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image found is downloaded as a side effect.

    Args:
        html: article page source.
        url: the article URL, copied into the result.

    Returns:
        ``{'title', 'url', 'images'}`` when the page embeds gallery JSON
        with a ``sub_images`` key, otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without <title> used to raise IndexError; use an empty title.
    title = title_tags[0].get_text() if title_tags else u''
    print(title)

    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except ValueError:
        # The captured text was not valid JSON.
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    images = [
        item['url']
        for item in data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert *result* into the configured collection; return True on success.

    ``insert_one`` replaces the deprecated ``insert``; it needs PyMongo 3.0+,
    which the ``connect=False`` client option already requires.
    """
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print(u'%s %s' % (u'存储到MongoDB成功', result))
        return True
    return False


def download_image(url):
    """Download the image at *url* and save it to disk. Always returns None."""
    print(u'%s %s' % (u'正在下载', url))
    response = _fetch(url, u'请求图片失败')
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming by content hash means an identical image is stored only once.
    """
    file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        # The with-block closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one search-index page and store every gallery found on it."""
    index_html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    # Each group index maps to an offset in steps of 20, matching 'count'.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Release the worker processes even if a task raised.
        pool.close()
        pool.join()
config.py
# -*- coding:utf-8 -*-
# Settings read by spider.py via ``from config import *``.

# MongoDB host passed to pymongo.MongoClient, plus the database and
# collection names the crawl results are stored under.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of page-group indices to crawl; spider.py multiplies each
# index by 20 to obtain the search offset.
GROUP_START = 0
GROUP_END = 20

# Search keyword sent to the index endpoint.
KEYWORD = '街拍'
分析Ajax抓取今日头条街拍美图
原文地址:http://www.cnblogs.com/stonelovy/p/7644651.html