上次学了 jsoup 之后，发现一些动态生成的网页内容无法抓取，于是又学习了 HtmlUnit。下面是抓取酷狗音乐与 QQ 音乐链接的例子：
酷狗音乐:
import java.io.BufferedInputStream;import java.io.FileOutputStream;import java.io.InputStream;import java.net.URL;import java.net.URLEncoder;import java.util.UUID;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.nodes.Element;import com.alibaba.fastjson.JSONArray;import com.alibaba.fastjson.JSONObject;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.Page;import com.gargoylesoftware.htmlunit.WebClient;public class worm7 { private static String name="离骚"; ????public static WebClient getWebClient(boolean flag){ ????WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); ?????webClient.getOptions().setUseInsecureSSL(true); ????webClient.getOptions().setCssEnabled(false); ?????????????webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); ????????webClient.getOptions().setThrowExceptionOnScriptError(false); ????????webClient.getOptions().setRedirectEnabled(true); ????????webClient.getOptions().setAppletEnabled(false); ????????webClient.getOptions().setJavaScriptEnabled(flag); ????????????webClient.getOptions().setTimeout(60000); ????????webClient.getOptions().setPrintContentOnFailingStatusCode(false); ????????webClient.setAjaxController(new NicelyResynchronizingAjaxController()); ?????????return webClient; ????} ????public static String getMp3Url(WebClient webClient){ ????FileOutputStream outputStream = null; ????????InputStream inputStream = null; ????????BufferedInputStream bis = null; ???try {Page page=webClient.getPage("http://songsearch.kugou.com/song_search_v2?"+ "callback=jQuery112408395432201569397_1532930925600"+ "&keyword="+URLEncoder.encode(name, "utf-8")+ "&page=1"+ "&pagesize=30"+ "&userid=-1"+ "&clientver="+ "&platform=WebFilter"+ "&tag=em"+ "&filter=2"+ "&iscorrection=1"+ "&privilege_filter=0"+ 
"&_="+System.currentTimeMillis());//System.out.println(page.getWebResponse().getContentAsString());//System.out.println(zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))"));JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("data");System.out.println("job:"+job);JSONArray list=job.getJSONArray("lists");System.out.println("list"+list);for(int i=0;i<list.size();i++){String id1=list.getJSONObject(i).getString("FileHash");String id2=list.getJSONObject(i).getString("AlbumID");String detailUrl="http://www.kugou.com/yy/index.php?r=play/getdata"+ "&hash="+id1+ "&album_id="+id2+ "&_="+System.currentTimeMillis();Page page2=webClient.getPage(detailUrl);JSONObject job2=JSONObject.parseObject(page2.getWebResponse().getContentAsString()).getJSONObject("data");System.out.println("标题:"+job2.getString("audio_name"));//System.out.println("歌词:"+job2.getString("lyrics"));System.out.println("mp3:"+job2.getString("play_url")); ??????????????????String outImage = job2.getString("audio_name")+ ".mp3"; ???????????????URL imgUrl = new URL(job2.getString("play_url"));//获取输入流 ???????????????inputStream = imgUrl.openConnection().getInputStream(); ???????????????//将输入流信息放入缓冲流提升读写速度 ???????????????bis = new BufferedInputStream(inputStream); ?????????????????//读取字节娄 ???????????????byte[] buf = new byte[1024]; ???????????????//生成文件 ???????????????outputStream = new FileOutputStream("f://"+ outImage); ???????????????int size = 0; ???????????????//边读边写 ???????????????while ((size = bis.read(buf)) != -1) { ????????????????????outputStream.write(buf, 0, size); ???????????????} ???????????????//刷新文件流 ???????????????outputStream.flush(); ???????????}} catch (Exception e) {e.printStackTrace();} return name; ?????????} ????private static String zzee(String str, String zz) { String list = null; Pattern p = Pattern.compile(zz); Matcher m = p.matcher(str); while (m.find()) { list = m.group(); } 
?return list; }public static void main(String[] args) {WebClient webClient=getWebClient(false);getMp3Url(webClient);}}
运行结果:
QQ音乐抓取实例:
import java.io.BufferedInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.net.MalformedURLException;import java.net.URL;import java.net.URLEncoder;import java.util.UUID;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.nodes.Element;import com.alibaba.fastjson.JSON;import com.alibaba.fastjson.JSONArray;import com.alibaba.fastjson.JSONObject;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.Page;import com.gargoylesoftware.htmlunit.WebClient;public class worm6 { private static String name="离骚"; static String id1=null; static String id2=null; static String id3=null; static String id4=null; static String name1=null; static String name2=null; static String url = null; static JSONObject ?job2=null; ????public static WebClient getWebClient(boolean flag){ ????WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); ?????webClient.getOptions().setUseInsecureSSL(true); ????webClient.getOptions().setCssEnabled(false); ?????????????webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); ????????webClient.getOptions().setThrowExceptionOnScriptError(false); ????????webClient.getOptions().setRedirectEnabled(true); ????????webClient.getOptions().setAppletEnabled(false); ????????webClient.getOptions().setJavaScriptEnabled(flag); ????????????webClient.getOptions().setTimeout(60000); ????????webClient.getOptions().setPrintContentOnFailingStatusCode(false); ????????webClient.setAjaxController(new NicelyResynchronizingAjaxController()); ?????????return webClient; ????} ????public static String getMp3Url(WebClient webClient){ ????????try {Page page=webClient.getPage("https://c.y.qq.com/soso/fcgi-bin/client_search_cp?"+ "ct=24"+ "&qqmusic_ver=1298"+ "&new_json=1"+ "&remoteplace=txt.yqq.center"+ "&searchid=36047978388657978"+ "&t=0"+ "&aggr=1"+ "&cr=1"+ "&catZhida=1"+ 
"&lossless=0"+ "&p=1"+ "&n=20"+ "&w="+URLEncoder.encode(name, "utf-8")+ "&g_tk=5381"+ "&jsonpCallback=MusicJsonCallback6176591962889693"+ "&loginUin=0"+ "&hostUin=0"+ "&format=jsonp"+ "&inCharset=utf8"+ "&outCharset=utf-8"+ "¬ice=0"+ "&platform=yqq"+ "&needNewCode=0");//System.out.println("page:"+page);//System.out.println("------"+page.getWebResponse().getContentAsString());//System.out.println("======"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))"));JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("data");//System.out.println("job:"+job);String job0=job.getString("song");//System.out.println("job0"+job0);job=JSON.parseObject(job0);JSONArray list=job.getJSONArray("list");//System.out.println("list:"+list);for(int i=0;i<list.size();i++){id1=list.getJSONObject(i).getString("mid");//System.out.println("id1"+id1);id2=list.getJSONObject(i).getString("file");//System.out.println("id"+id2);id2="C400"+JSONObject.parseObject(id2).getString("media_mid")+".m4a";//System.out.println("id"+id2);name1=list.getJSONObject(i).getString("title");name2=list.getJSONObject(i).getString("singer");//System.out.println(name2);JSONArray name=JSON.parseArray(name2);//System.out.println("job4:"+name);name2=name.getJSONObject(0).getString("name");//System.out.println(name.getJSONObject(0).getString("name"));/*String detailUrl="https://c.y.qq.com/v8/fcg-bin/fcg_play_single_song.fcg?"+ "songmid="+id1+ "&tpl=yqq_song_detail&format=jsonp&callback=getOneSongInfoCallback&g_tk=5381&jsonpCallback=getOneSongInfoCallback&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq&needNewCode=0";Page page2=webClient.getPage(detailUrl);//System.out.println(page2);String b="{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}";//System.out.println("b"+b);JSONObject 
job1=JSONObject.parseObject("{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("url");System.out.println("job1:"+job1);String job2=job1.getString(id2);System.out.println("job2"+job2);*/String url1="https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?g_tk=5381&jsonpCallback=MusicJsonCallback32651599216689386&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq&needNewCode=0&cid=205361747&callback=MusicJsonCallback32651599216689386&uin=0"+"&songmid="+id1+"&filename="+id2+"&guid=2241489759";;Page page2=webClient.getPage(url1);//System.out.println("page2"+page2);JSONObject job2=JSONObject.parseObject("{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\\(\\{).*?(?=\\}\\))")+"}").getJSONObject("data");//System.out.println("标题:"+job2.getString("items"));String job3=job2.getString("items");JSONArray job4=JSON.parseArray(job3);//System.out.println("job4:"+job4);//System.out.println(job4.getJSONObject(0).getString("vkey"));url ="http://dl.stream.qqmusic.qq.com/"+id2+"?vkey="+job4.getJSONObject(0).getString("vkey")+"&guid=2241489759&uin=0&fromtag=66";System.out.println("name:"+name1+"--"+name2);System.out.println("url:"+url);download();} ???????????} catch (Exception e) {e.printStackTrace();} return name; ?????????} ????private static String zzee(String str, String zz) { String list = null; Pattern p = Pattern.compile(zz); Matcher m = p.matcher(str); while (m.find()) { list = m.group(); } ?return list; } ????private static void download() throws IOException{ ????FileOutputStream outputStream = null; ????????InputStream inputStream = null; ????????BufferedInputStream bis = null; ????String outImage = name1+"--"+name2+ ".mp3"; ????????URL imgUrl = new URL(url);//获取输入流 ????????inputStream = imgUrl.openConnection().getInputStream(); ????????//将输入流信息放入缓冲流提升读写速度 ????????bis = new BufferedInputStream(inputStream); ??????????//读取字节娄 ????????byte[] buf = new byte[1024]; ????????//生成文件 
????????outputStream = new FileOutputStream("f://"+ outImage); ????????int size = 0; ????????//边读边写 ????????while ((size = bis.read(buf)) != -1) { ?????????????outputStream.write(buf, 0, size); ????????} ????????//刷新文件流 ????????outputStream.flush(); ????}public static void main(String[] args) {WebClient webClient=getWebClient(false);getMp3Url(webClient);}}
运行结果:
相比之下，酷狗音乐相对好爬一些，QQ音乐则有些繁琐。
htmlunit+fastjson抓取酷狗音乐 qq音乐链接及下载
原文地址:https://www.cnblogs.com/xr210/p/9404325.html