package com.crawler.maoyan.age.sex.index;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.maoyan.movie.contents.MContents;
import com.maoyan.movie.html.MovieHtml;
import com.maoyan.movie.html.PrecessHtml;
import com.maoyan.movie.ttf.encode.DownParseTTF;
import com.maoyan.mysql.configure.DBConfig;
import com.maoyan.mysql.configure.DbAttribute;
import com.maoyan.mysql.manage.ContentToMySQL;
import com.maoyan.mysql.manage.IFRepetition;
import com.maoyan.mysql.manage.UpdateData;
/**
 * Collects the audience gender share and audience age share of movies from
 * the piaofang.maoyan.com "wantindex" pages and stores them through the
 * project's MySQL helpers.
 *
 * @author E-mail: ZH519080@163.com
 * @date created 2017-01-17 11:46:10
 * @jdk jdk1.7.0_79
 */
public class AgeSexIndex {

    /** Base URL that a movie id is appended to. */
    private static final String BRANCH_URL = "http://piaofang.maoyan.com/movie/";

    /** Path suffix of the page that carries the gender/age figures. */
    private static final String WANT_INDEX_SUFFIX = "/wantindex";

    /** Format of the task timestamp written into every record. */
    private static final String TASK_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss SSS";

    /** Matches every character that is not an ASCII digit; compiled once. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    static {
        // Runs once when the class is initialized, i.e. before main() creates DbAttribute.
        DBConfig.initPropertis("./config/config.properties");
    }

    /**
     * Entry point: reads the '#'-separated movie ids from
     * {@code DbAttribute.maoyanMovieID} and processes each movie in turn.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        AgeSexIndex ageSexIndex = new AgeSexIndex();
        DbAttribute dbAttribute = new DbAttribute();
        try {
            // Re-decode the configured value: its bytes are taken as ISO-8859-1 and read back as UTF-8.
            String movieIdKey = new String(dbAttribute.maoyanMovieID.getBytes("ISO-8859-1"), "utf-8");
            for (String movieId : movieIdKey.split("#")) {
                if (movieId.isEmpty()) {
                    // "a##b" or a leading '#' yields empty tokens; they would build a bogus URL.
                    continue;
                }
                String ageSexURL = BRANCH_URL + movieId + WANT_INDEX_SUFFIX;
                System.out.println(ageSexURL);
                String movieNameId = ageSexIndex.getMovieNameId(ageSexURL);
                if (movieNameId.isEmpty()) {
                    // getMovieNameId() returns "" when the movie page could not be fetched.
                    System.err.println("Skipping " + ageSexURL + ": movie name could not be resolved");
                    continue;
                }
                Document document = ageSexIndex.getDocument(ageSexURL);
                ageSexIndex.exeSexIndex(document, movieNameId);
                ageSexIndex.exeAgeIndex(document, movieNameId);
            }
        } catch (UnsupportedEncodingException e) {
            System.err.println("Could not decode the configured movie id list");
            e.printStackTrace();
        }
        System.out.println("over");
    }

    /**
     * Extracts the male/female audience share from the page and saves it,
     * inserting a new row or updating the existing one depending on
     * {@code IFRepetition.sexRepetition}.
     *
     * @param document    parsed "wantindex" page, as returned by {@link #getDocument(String)}
     * @param movieNameId {@code "<movie name>,<platform id>"}, as returned by
     *                    {@link #getMovieNameId(String)}
     * @throws IllegalArgumentException if {@code movieNameId} contains no comma
     */
    public void exeSexIndex(Document document, String movieNameId) {
        MContents mContents = newContents(movieNameId);

        // Male share: text of the first i.cs element inside the stacked-column description.
        Elements maleElements = document.select("section div.stackcolumn div.stackcolumn-desc i.cs");
        mContents.setMaleRate(maleElements.eq(0).text() + "%");

        // Female share: text of the first i.cs element in the right-hand description paragraph.
        Elements femaleElements =
                document.select("div.stackcolumn div.stackcolumn-desc p.stackcolumn-desc-right i.cs");
        mContents.setFemaRate(femaleElements.eq(0).text() + "%");

        if (!new IFRepetition().sexRepetition(mContents)) {
            new ContentToMySQL().saveGenderRate(mContents);
        } else {
            new UpdateData().updateSex(mContents);
        }
    }

    /**
     * Extracts the audience age distribution from the JSON embedded in the
     * page's {@code script#pageData} element and saves it, inserting a new row
     * or updating the existing one depending on {@code IFRepetition.ageRepetition}.
     * Nothing is saved when the page carries no usable age data.
     *
     * @param document    parsed "wantindex" page, as returned by {@link #getDocument(String)}
     * @param movieNameId {@code "<movie name>,<platform id>"}, as returned by
     *                    {@link #getMovieNameId(String)}
     * @throws IllegalArgumentException if {@code movieNameId} contains no comma
     */
    public void exeAgeIndex(Document document, String movieNameId) {
        MContents mContents = newContents(movieNameId);

        // Render the selected element once and cut out the span from the first '{' to the last '}'.
        String pageData = document.select("body script#pageData").toString();
        int beginIndex = pageData.indexOf('{');
        int endIndex = pageData.lastIndexOf('}');
        if (beginIndex < 0 || endIndex < beginIndex) {
            System.err.println("No pageData JSON found for " + movieNameId + "; age rates not saved");
            return;
        }
        String ageJson = pageData.substring(beginIndex, endIndex + 1);

        try {
            JSONObject root = (JSONObject) new JSONParser().parse(ageJson);
            // Age-share data lives under ageRatesChart -> series[] -> points[].
            JSONObject ageChart = (JSONObject) root.get("ageRatesChart");
            JSONArray series = ageChart == null ? null : (JSONArray) ageChart.get("series");
            if (series == null) {
                System.err.println("pageData has no ageRatesChart.series for " + movieNameId
                        + "; age rates not saved");
                return;
            }
            for (Object seriesItem : series) {
                JSONArray points = (JSONArray) ((JSONObject) seriesItem).get("points");
                if (points == null) {
                    continue;
                }
                for (Object pointItem : points) {
                    JSONObject point = (JSONObject) pointItem;
                    Object age = point.get("xValue");      // age-bracket label
                    Object ageRate = point.get("yValue");  // share for that bracket
                    if (age == null || ageRate == null) {
                        continue;
                    }
                    applyAgeRate(mContents, age.toString(), ageRate.toString());
                }
            }
            if (!new IFRepetition().ageRepetition(mContents)) {
                new ContentToMySQL().saveAgeRate(mContents);
            } else {
                new UpdateData().updateAge(mContents);
            }
        } catch (ParseException e) {
            System.err.println("Could not parse pageData JSON for " + movieNameId);
            e.printStackTrace();
        }
    }

    /**
     * Resolves the movie name and its Maoyan id for a "wantindex" URL. The name
     * is the title of the movie page, which is a different URL from the
     * gender/age page: the part of {@code ageSexIndexURL} before "/wantindex".
     *
     * @param ageSexIndexURL URL of the gender/age page
     * @return {@code "<page title>,<digits of the URL>"}, or an empty string
     *         when the movie page could not be fetched
     */
    public String getMovieNameId(String ageSexIndexURL) {
        int suffixIndex = ageSexIndexURL.indexOf(WANT_INDEX_SUFFIX);
        // Without the suffix the URL is used unchanged instead of failing in substring().
        String movieURL = suffixIndex >= 0 ? ageSexIndexURL.substring(0, suffixIndex) : ageSexIndexURL;
        // Maoyan id: every digit that occurs in the URL, in order.
        String platformId = NON_DIGITS.matcher(ageSexIndexURL).replaceAll("");
        try {
            Document movieNameDocu = Jsoup.connect(movieURL).get();
            return movieNameDocu.title() + "," + platformId;
        } catch (IOException e) {
            System.err.println("Failed to fetch movie page " + movieURL);
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Downloads the gender/age page, passes it through the project's TTF
     * decoding step and parses the result with jsoup.
     *
     * @param ageSexIndexURL URL of the gender/age page
     * @return the parsed document of the processed HTML
     */
    public Document getDocument(String ageSexIndexURL) {
        String sourceHtml = new MovieHtml().getHtml(ageSexIndexURL).toString();
        // Download the TTF file referenced by the page and parse it.
        String ttfCode = new DownParseTTF().parseTTF(sourceHtml);
        String precSourceHtml = new PrecessHtml().precSourceHtml(sourceHtml, ttfCode);
        return Jsoup.parse(precSourceHtml);
    }

    /**
     * Creates a record stamped with the current time and filled with the movie
     * name and platform id taken from {@code movieNameId}.
     */
    private static MContents newContents(String movieNameId) {
        // Split on the LAST comma: the page title itself may contain commas, the id does not.
        int separator = movieNameId == null ? -1 : movieNameId.lastIndexOf(',');
        if (separator < 0) {
            throw new IllegalArgumentException(
                    "movieNameId must look like \"<name>,<id>\" but was: " + movieNameId);
        }
        MContents mContents = new MContents();
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per record.
        mContents.setTaskTime(new SimpleDateFormat(TASK_TIME_PATTERN).format(new Date()));
        mContents.setMovieName(movieNameId.substring(0, separator));
        mContents.setPlatformID(movieNameId.substring(separator + 1));
        return mContents;
    }

    /**
     * Stores one age-bracket share in the matching field of {@code mContents}.
     * The bracket label is removed from the concatenated label+share text, so
     * for an exactly matching label only the share remains.
     */
    private static void applyAgeRate(MContents mContents, String age, String ageRate) {
        String ageAgeRate = age + ageRate;
        if (ageAgeRate.contains("20岁以下")) {           // label: under 20
            mContents.setF16to20(ageAgeRate.replace("20岁以下", ""));
        } else if (ageAgeRate.contains("20~24")) {
            mContents.setF21to25(ageAgeRate.replace("20~24", ""));
        } else if (ageAgeRate.contains("25~29")) {
            mContents.setF26to30(ageAgeRate.replace("25~29", ""));
        } else if (ageAgeRate.contains("30~34")) {
            mContents.setF31to35(ageAgeRate.replace("30~34", ""));
        } else if (ageAgeRate.contains("35~39")) {
            mContents.setF36to40(ageAgeRate.replace("35~39", ""));
        } else {
            // Catch-all: any label not matched above lands here; only the
            // "40 and above" label text is stripped.
            mContents.setF41to45(ageAgeRate.replace("40岁以上", ""));
        }
    }
}
// Information collection from a ticketing platform
// Latest recommended article published on 2024-08-14 21:47:36