selenium+PhantomJS 抓取淘宝搜索商品

发布时间：2023-09-06 01:41 | 责任编辑：熊小新 | 关键词：selenium
最近项目有个需求：抓取淘宝的搜索商品，而且要抓取的品类比较多。直接用 selenium + PhantomJS 抓取淘宝搜索商品，可以快速完成。
# -*- coding: utf-8 -*-
# Module author; left empty in the original post.
__author__ = ''
import logging
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time,re
from tqdm import tqdm
from pyquery import PyQuery as pq
from tianmao.data_tmall import keywords
from dbutils import mysql_util
from config import retry_count
"""
抓取天猫搜索结果中的商品。
"""
# Layout of each log line emitted through the root logger: timestamp, level,
# source file, thread name, line number, function name, then the message.
LOG_FORMAT = ('%(asctime)s [%(levelname)s] [%(filename)s] [%(threadName)s] '
              '[line:%(lineno)d] [%(funcName)s] %(message)s')
# strftime pattern used to render %(asctime)s.
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Configure the root logger at import time so the crawler's logging.info()
# calls are emitted with the layout above.
logging.basicConfig(level=logging.INFO,
                    format=LOG_FORMAT,
                    datefmt=LOG_DATE_FORMAT)


class tianmao_spider(object):
    """Crawl Tmall search results with Selenium + PhantomJS and save them to MySQL.

    For every (category, keyword) pair in the imported ``keywords`` mapping the
    crawler submits the keyword on the Tmall home page, walks the result pages
    through the pager's "skip to page" form, and stores one row per product in
    the ``t_tmall`` table via ``mysql_util``.
    """

    # CSS selectors for the Tmall home page and the search-result page.
    _SEARCH_INPUT = '#mq'
    _SEARCH_BUTTON = '#mallSearch > form > fieldset > div > button'
    _PAGE_SKIP_FORM = '#content > div > div.ui-page > div > b.ui-page-skip > form'
    _PAGE_SKIP_INPUT = _PAGE_SKIP_FORM + ' > input.ui-page-skipTo'
    _PAGE_SKIP_BUTTON = _PAGE_SKIP_FORM + ' > button'
    _ITEM_LIST = '#J_ItemList'
    _ITEM = '#J_ItemList .product'

    # Searches that failed on every retry are appended here as "category:keyword".
    _RETRY_FILE = 'tmall_retry_crawl.txt'

    def __init__(self):
        """Start a PhantomJS browser and create the MySQL helper."""
        self.SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']
        self.target_url = 'https://www.tmall.com/'
        # NOTE(review): PhantomJS support is deprecated in recent Selenium
        # releases -- confirm the pinned Selenium version still ships it.
        self.browser = webdriver.PhantomJS(service_args=self.SERVICE_ARGS)
        self.wait = WebDriverWait(self.browser, 10)  # 10-second timeout per wait
        self.browser.set_window_size(1400, 900)
        self.mysql_util = mysql_util()

    @staticmethod
    def _absolute_url(url, scheme):
        """Return ``url`` prefixed with ``scheme:`` unless it is None or already starts with "http"."""
        if url is None or str(url).startswith('http'):
            return url
        return '{0}:{1}'.format(scheme, url)

    @staticmethod
    def _parse_total_pages(text):
        """Return the first run of digits in ``text`` as an int, or 0 if there is none."""
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else 0

    def search(self, category, keyword, page=2):
        """Submit ``keyword`` on the Tmall home page and return the result page count.

        When ``page`` is 2 (a fresh crawl) the first result page is scraped
        here through ``get_products``; any other value means the caller is
        resuming from a later page, so page one is skipped.

        Each attempt that raises ``TimeoutException`` is retried, up to
        ``retry_count`` attempts. If the last attempt also fails, the pair is
        appended to ``tmall_retry_crawl.txt`` as ``category:keyword``.

        Returns:
            int: page count read from the pager's skip form, or 0 when every
            attempt timed out or the pager text contained no number.
        """
        print('正在搜索:{0}'.format(keyword))
        total = 0
        for attempt in range(retry_count):
            try:
                self.browser.get(self.target_url)
                search_box = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, self._SEARCH_INPUT)))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, self._SEARCH_BUTTON)))
                search_box.send_keys(keyword)
                submit.click()
                skip_forms = self.wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, self._PAGE_SKIP_FORM)))
                pager_text = skip_forms[0].text
                if page == 2:  # start scraping from the first result page
                    self.get_products(category, keyword)
                # Only assign the return value once the whole attempt succeeded,
                # so a timeout above can never leak a non-int to the caller.
                total = self._parse_total_pages(pager_text)
                if total == 0:
                    logging.warning('no page count found in pager text: %r', pager_text)
                break
            except TimeoutException as e:
                logging.info("正在重试第{0}次，出现：{1}".format(attempt + 1, e))
                if attempt == retry_count - 1:
                    # Out of retries: record the pair so it can be re-crawled later.
                    with open(self._RETRY_FILE, 'a', encoding='utf-8') as f:
                        f.write(category + ":" + keyword + '\n')
                time.sleep(1)
        return total

    def get_products(self, category, keyword):
        """Parse the currently loaded result page and save every product to ``t_tmall``.

        Waits for ``#J_ItemList`` to be present, then reads the page source
        with PyQuery. Image download and detail-page fetching are not
        implemented, so ``image_save_path`` and ``item_detail`` are stored as
        empty strings.

        Raises:
            TimeoutException: if the item list does not appear in time.
        """
        self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, self._ITEM_LIST)))
        doc = pq(self.browser.page_source)
        for item in doc(self._ITEM).items():
            # Use the lazy-load attribute when the image has no src.
            img = item.find('img')
            img_url = img.attr('src')
            if img_url is None:
                img_url = img.attr('data-src')
            img_url = self._absolute_url(img_url, 'http')
            # Always bound, so a product without an image cannot raise NameError.
            img_save_path = ''

            item_url = item.find('a').attr('href')
            logging.info('详情页面url：{0}'.format(item_url))
            item_url = self._absolute_url(item_url, 'https')
            item_detail = ''

            product = {
                'target': 'tmall',
                'category': category,
                'keyword': keyword,
                'item_url': item_url,
                'image_url': img_url,
                'image_save_path': img_save_path,
                'title': item.find('div > div.productTitle').text(),
                'price': item.find('div > p.productPrice').text(),
                'deal': item.find('div > p.productStatus').text().replace('阿里旺旺', '').strip(),
                'shop': item.find('div > div.productShop').text(),
                'location': '',
                'item_detail': item_detail,
                'create_time': time.strftime('%Y-%m-%d %H:%M:%S'),
            }
            # Method name is spelled as the original code calls it on mysql_util.
            self.mysql_util.sava_to_mysql('t_tmall', product)

    def next_page(self, page_number, category, keyword):
        """Jump to ``page_number`` through the pager's skip form and scrape that page.

        A ``TimeoutException`` triggers a retry, up to ``retry_count``
        attempts. If all attempts fail the page is skipped and a warning is
        logged.
        """
        for _ in range(retry_count):
            try:
                page_box = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, self._PAGE_SKIP_INPUT)))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, self._PAGE_SKIP_BUTTON)))
                page_box.clear()
                page_box.send_keys(str(page_number))
                submit.click()
                # NOTE(review): this waits for the same button that was just
                # clicked; confirm it really signals that the new page loaded.
                self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, self._PAGE_SKIP_BUTTON)))
                self.get_products(category, keyword)
                break
            except TimeoutException as e:
                logging.info(e)
                time.sleep(0.5)
        else:
            logging.warning('giving up on page %s for %s:%s after %s attempts',
                            page_number, category, keyword, retry_count)

    def start_crawler(self, page=2):
        """Crawl every keyword of every category in ``keywords``.

        Args:
            page: first pager page to visit for the first keyword. The default
                2 means a fresh crawl (page one is scraped inside ``search``).
                A larger value resumes an interrupted crawl from that page.
                Every later keyword starts from page 2 again.

        The browser is shut down when the crawl ends, whether it finished or
        failed.
        """
        try:
            for category, keyword_list in keywords.items():
                for keyword in keyword_list:
                    total = self.search(category, keyword, page=page)
                    for i in tqdm(range(page, total + 1)):
                        print('总共{0}页,正在翻第{1}页，抓取类别：{2}，搜索关键字：{3}'.format(total, i, category, keyword))
                        self.next_page(i, category, keyword)
                    # The resume offset applies to the first keyword only.
                    page = 2
                time.sleep(0.5)
        except Exception:
            # Top-level boundary: keep the traceback instead of printing only the message.
            logging.exception('crawl aborted by an unexpected error')
        finally:
            # quit() ends the driver session; close() only closes the window.
            self.browser.quit()


# Script entry point: run a fresh crawl over every configured keyword.
if __name__ == '__main__':
    tmall = tianmao_spider()
    tmall.start_crawler()
原文地址：https://www.cnblogs.com/hd-zg/p/8412693.html
selenium+PhantomJS 抓取淘宝搜索商品

知识推荐