需求:
将嗨学网上所有需要学习的全部视频下载下来并保存为合适的名字。
由于对前端知识不太了解,完成过程比较困难。
关键点分析:
1.拿到下载视频的链接
第一想法是分析网页源码得到组装链接的逻辑,然后用java代码组装,但是网页源代码太复杂,而且对前端实在是不熟,最终采用了一个投机取巧的方法:在点击下载的同时用截屏捕捉下载地址。
其中有两个变量itemId和goodsId。
再结合源代码进行查找分析得知itemId就是goodsCatalogVideoId,goodsId一直是41889不变。
这样一来就得到了下载视频的地址。
2.通过登录验证
之前一直认为登录之后需要自己保存一些cookie相关的东西,分析了一下网页返回的cookie,比较复杂,不知如何下手。一番搜索后得知,只需要让登录和下载使用同一个会话(即同一个HttpClient实例)就可以了,cookie之类的东西都不用自己操心。(详情见代码)
3.正则匹配,得到课程名字以及goodsCatalogVideoId。
这个过程不是很复杂,详情见代码。
源代码贴上,方便自己将来查看,也许对他人也会有些许帮助。
package Spider;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HeaderIterator;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.ParseException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Command-line scraper for haixue.com course videos.
 *
 * <p>The program logs in, fetches one course page, extracts every course
 * title together with its {@code goodsCatalogVideoId} by regular expression,
 * and then downloads each video as {@code <title>.flv} into {@link #path}.
 * All requests go through the single shared {@link #client}, so whatever
 * session state the login establishes is reused by the later requests.
 */
public class DownloadVideo {
    /** Endpoint the login form is posted to. */
    static String loginUrl = "http://haixue.com/doLogin.do";
    // Example of a complete download URL:
    // http://haixue.com/goods/downloadUrl.do?itemId=161789577&type=Video&isCatalog=No&goodsId=41889
    /** Directory prefix prepended to every downloaded file name. */
    static String path = "E:/videos/";
    /** Shared HTTP client used for login, page fetches and downloads. */
    static CloseableHttpClient client = HttpClients.createDefault();
    // key: goodsCatalogVideoId; value: [0] course name, [1] goodsId (only after getAnotherId()).
    // The goodsId turned out never to change, so the list is kept only because
    // the data structure was not worth reworking.
    static Map<String, List<String>> coureseInfoMap = new LinkedHashMap<String, List<String>>();

    /** Charset assumed for response bodies that do not declare one themselves. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Matches one course entry; group 1 is the title, group 2 the goodsCatalogVideoId. */
    private static final Pattern COURSE_PATTERN = Pattern.compile("<div class=\"tit\">(.*?)</div>.*?"
            + "<div class=\"con-bottom hideinfo\">.*?"
            + "<input type=\"hidden\" value=\".*?\"/>.*?"
            + "<input type=\"hidden\" value=\"(.*?)\"/>.*?"
            + "<span>时长 </span>.*?"
            + "<span>已观看 </span>.*?"
            + "</div>"
            , Pattern.DOTALL);

    /** Matches the hidden goodsId input on a watch page; group 1 is the id. */
    private static final Pattern GOODS_ID_PATTERN =
            Pattern.compile("<input type=\"hidden\" id=\"goodsId\" value=\"(.*?)\"/>");

    /** Characters that are not allowed in a Windows file name. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Runs the whole pipeline: login, fetch the course page, parse it, download.
     * Any failure is printed and ends the run; the shared client is always closed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            login();
            // For offline debugging the page can instead be loaded with read(localHtmlFile).
            String page = getCoursePage();
            getIdAndName(page);
            // getAnotherId() is intentionally not called: the goodsId is constant.
            download();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                client.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches the HTML of the watch page that lists all courses.
     *
     * @return the response body as text
     * @throws ParseException if the response headers cannot be parsed
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String getCoursePage() throws ParseException, IOException {
        HttpGet httpGet = new HttpGet("http://haixue.com/course/video/watchVideo.do?goodsCatalogVideoId=161789633&goodsId=0");
        // A failed request is propagated to the caller instead of being swallowed
        // here, which previously led to a NullPointerException on the next line.
        HttpResponse httpResponse = client.execute(httpGet);
        return EntityUtils.toString(httpResponse.getEntity(), DEFAULT_CHARSET);
    }

    /**
     * Downloads every course recorded in {@link #coureseInfoMap}, naming each
     * file after the course title with a {@code .flv} extension.
     */
    public static void download() {
        for (Entry<String, List<String>> course : coureseInfoMap.entrySet()) {
            String url = "http://haixue.com/goods/downloadUrl.do?itemId="
                    + course.getKey() + "&type=Video&isCatalog=No&goodsId=41889";
            // Titles are scraped from HTML and may contain characters the file system rejects.
            String fileName = sanitizeFileName(course.getValue().get(0)) + ".flv";
            downloadVideo(url, fileName);
        }
    }

    /**
     * Looks up the goodsId of every recorded course on its watch page and
     * appends it to that course's info list; courses whose page has no goodsId
     * are removed from {@link #coureseInfoMap}. The first failure is printed
     * and stops the lookup.
     */
    public static void getAnotherId() {
        try {
            // An explicit iterator is required so entries can be removed mid-iteration
            // without a ConcurrentModificationException.
            Iterator<Entry<String, List<String>>> it = coureseInfoMap.entrySet().iterator();
            while (it.hasNext()) {
                Entry<String, List<String>> course = it.next();
                HttpGet httpGet = new HttpGet("http://haixue.com/course/video/watchVideo.do?goodsCatalogVideoId="
                        + course.getKey() + "&goodsId=0");
                HttpResponse httpResponse = client.execute(httpGet);
                String page = EntityUtils.toString(httpResponse.getEntity(), DEFAULT_CHARSET);
                Matcher m = GOODS_ID_PATTERN.matcher(page);
                List<String> info = course.getValue();
                if (m.find()) {
                    info.add(m.group(1));
                } else {
                    System.out.println("没有找到goodsId:" + info.get(0));
                    it.remove();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts every course title and goodsCatalogVideoId from the page HTML
     * and records them in {@link #coureseInfoMap}, printing each pair.
     *
     * @param in HTML of the course page
     */
    public static void getIdAndName(String in) {
        // The pattern uses reluctant ".*?" quantifiers so each match stays within one course entry.
        Matcher m = COURSE_PATTERN.matcher(in);
        while (m.find()) {
            String name = m.group(1).trim();
            String goodsCatalogVideoId = m.group(2);
            List<String> info = new ArrayList<String>();
            info.add(name);
            System.out.println("课程名称:" + name + "\ngoodsCatalogVideoId:" + goodsCatalogVideoId);
            coureseInfoMap.put(goodsCatalogVideoId, info);
        }
    }

    /**
     * Posts the login form with the shared client and prints the response.
     *
     * @throws Exception if the request cannot be built or sent; a failed login
     *         request is propagated because nothing after it can work without it
     */
    public static void login() throws Exception {
        HttpPost httpPost = new HttpPost(loginUrl);
        Map<String, String> parameterMap = new HashMap<String, String>();
        parameterMap.put("j_username", "******");
        parameterMap.put("j_password", "****");
        parameterMap.put("_spring_security_remember_me", "no");
        httpPost.setEntity(new UrlEncodedFormEntity(getParam(parameterMap), "UTF-8"));
        System.out.println("request line:" + httpPost.getRequestLine());
        HttpResponse httpResponse = client.execute(httpPost);
        printResponse(httpResponse);
    }

    /**
     * Streams one video to {@code path + fileName}. Failures are printed, not
     * thrown, so one bad download does not stop the remaining ones.
     *
     * @param downloadUrl URL to fetch
     * @param fileName file name (relative to {@link #path}) to write to
     */
    public static void downloadVideo(String downloadUrl, String fileName) {
        InputStream in = null;
        BufferedOutputStream out = null;
        try {
            System.out.println("strart download video:" + fileName);
            HttpResponse httpResponse = client.execute(new HttpGet(downloadUrl));
            HttpEntity entity = httpResponse.getEntity();
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300 || entity == null) {
                // Do not save an error page as a video; drain the body so the connection is released.
                System.out.println("download failed, status:" + httpResponse.getStatusLine());
                EntityUtils.consume(entity);
                return;
            }
            in = entity.getContent();
            File target = new File(path + fileName);
            File dir = target.getParentFile();
            if (dir != null && !dir.exists()) {
                dir.mkdirs();
            }
            out = new BufferedOutputStream(new FileOutputStream(target));
            byte[] buffer = new byte[1024 * 1024];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            // Flush inside the try so a write error is reported before "finished!" is printed.
            out.flush();
            System.out.println("finished!");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeAndReport(out);
            closeAndReport(in);
        }
    }

    /**
     * Prints the status line, all headers and the body of a response.
     * Reading the body consumes the response entity.
     *
     * @param httpResponse response to print
     * @throws ParseException if the response headers cannot be parsed
     * @throws IOException if the body cannot be read
     */
    public static void printResponse(HttpResponse httpResponse) throws ParseException, IOException {
        HttpEntity entity = httpResponse.getEntity();
        System.out.println("status:" + httpResponse.getStatusLine());
        System.out.println("headers:");
        HeaderIterator iterator = httpResponse.headerIterator();
        while (iterator.hasNext()) {
            System.out.println("\t" + iterator.next());
        }
        // A response may legitimately carry no body.
        if (entity != null) {
            String responseString = EntityUtils.toString(entity, DEFAULT_CHARSET);
            System.out.println("response length:" + responseString.length());
            System.out.println("response content:" + responseString.replace("\r\n", ""));
        }
    }

    /**
     * Converts a map of form fields into the name/value list HttpClient expects.
     *
     * @param parameterMap form fields; keys and values must be {@code String}s
     * @return one pair per map entry, in the map's iteration order
     * @throws ClassCastException if a key or value is not a {@code String}
     */
    public static List<NameValuePair> getParam(Map<?, ?> parameterMap) {
        List<NameValuePair> param = new ArrayList<NameValuePair>();
        for (Entry<?, ?> field : parameterMap.entrySet()) {
            param.add(new BasicNameValuePair((String) field.getKey(), (String) field.getValue()));
        }
        return param;
    }

    /**
     * Reads a UTF-8 text file completely, normalising every line ending to {@code "\n"}.
     *
     * @param filename path of the file to read
     * @return the file content, each line followed by {@code "\n"}
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String filename) throws IOException {
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        } finally {
            // Close even when readLine fails, so the file handle is not leaked.
            in.close();
        }
    }

    /** Replaces every character that Windows forbids in a file name with an underscore. */
    private static String sanitizeFileName(String name) {
        return ILLEGAL_FILE_NAME_CHARS.matcher(name).replaceAll("_");
    }

    /** Closes a stream if it was opened, printing (not throwing) any close failure. */
    private static void closeAndReport(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}