package demo.flink.retained;
import cn.hutool.core.date.DatePattern;
import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.FilterOperator;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import demo.flink.wordcount.util.MappedBiggerFileReaderWrite;
import java.io.FileOutputStream;
import java.util.*;
/**
 * Flink retention calculation.
 * Data source: one file per day.
 *
 * @Author: lizixian
 * @date: 2022/9/7 23:43
 */
public class Retained4Demo {
private static final Logger logger = LoggerFactory.getLogger(Retained4Demo.class);
private static final String DATA_TEMP_PATH = "G:\\demo\\temp.txt";
private static final String DATA_BASE_PATH = "G:\\demo\\%s.txt";
public static void main(String[] args) throws Exception {
Map<String, List<String>> map = new TreeMap<>();
// Store the result so the printed map is not empty
map.put("20220701", retained("20220701"));
System.out.println(map);
}
/**
 * Compute retention for the given day.
 *
 * @Author: lizixian
 * @date: 2022/7/8 22:30
 */
public static List<String> retained(String day) throws Exception {
logger.warn("计算 " + day + "这天的留存");
if (!FileUtil.exist(String.format(DATA_BASE_PATH, day))) {
logger.error(day + "这天 这天的文件不存在");
return null;
}
// Merge all daily files older than the 30-day window into one temp file
FileUtil.del(DATA_TEMP_PATH);
FileOutputStream fileOut = new FileOutputStream(DATA_TEMP_PATH, true);
Date oldDay = DateUtil.parse("20150101");
Date thatDay = DateUtil.parse(day);
Date thatMonthFirst = DateUtil.offsetDay(thatDay, -29);
while (oldDay.before(thatMonthFirst)) {
// logger.warn("合并:" + oldDay.toLocaleString());
String file = String.format(DATA_BASE_PATH, DateUtil.format(oldDay, DatePattern.PURE_DATE_FORMAT));
oldDay = DateUtil.offsetDay(oldDay, 1);
if (!FileUtil.exist(file)) {
continue;
}
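// MappedBiggerFileReaderWrite is the author's custom memory-mapped reader
// (655360-byte buffer). Assumption: getArray() returns exactly the bytes of
// the last read; if it returned the whole buffer, a short final read would
// append stale bytes to the temp file.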
MappedBiggerFileReaderWrite reader = new MappedBiggerFileReaderWrite(file, 655360);
while (reader.read() != -1) {
fileOut.write(reader.getArray());
}
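// A newline between files keeps the last record of one day from merging
// with the first record of the next.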
fileOut.write("\n".getBytes());
reader.close();
}
fileOut.flush();
fileOut.close();
// Merge finished; create a local Flink environment (with web UI)
final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
// env.setParallelism(1);
List<String> resultList = new ArrayList<>();
// Read the full device history from the merged temp file and deduplicate
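// Tokenizer(2) tags every historical record with marker f2 = 2; the marker
// is what later distinguishes "old" from "new" after union + sum.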
DataSet<Tuple3<String, String, Integer>> allData = env.readTextFile(DATA_TEMP_PATH).map(new Tokenizer(2)).distinct(0);
logger.warn("allData Count= " + allData.count());
DataSet<Tuple3<String, String, Integer>> union30Days = null;
for (int i = 29; i > 0; i--) {
// Data for this day inside the window
DateTime dateTime = DateUtil.offsetDay(thatDay, -i);
String formatDay = DateUtil.format(dateTime, DatePattern.PURE_DATE_FORMAT);
String itemDayFilePath = String.format(DATA_BASE_PATH, formatDay);
if (!FileUtil.exist(itemDayFilePath)) {
logger.error("Data file for " + formatDay + " does not exist");
continue;
}
DataSet<Tuple3<String, String, Integer>> itemDayData = env.readTextFile(itemDayFilePath).map(new Tokenizer(1)).distinct(0);
// New devices on this day (relative to pre-window history)
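// union + groupBy(0) + sum(2) implements a set difference: a device in both
// history (2) and this day (1) sums to 3, history-only stays at 2, and a
// summed marker of 1 can only mean "seen this day but never before the window".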
FilterOperator<Tuple3<String, String, Integer>> itemDayNewData = allData.union(itemDayData).groupBy(0).sum(2).filter(j -> j.f2 == 1);
// Accumulate this day's new devices into the 30-day union
if (union30Days == null) {
union30Days = itemDayNewData;
} else {
union30Days = union30Days.union(itemDayNewData);
}
}
// Aggregate the new devices across the last 30 days
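// A device new on day A and active again on day B appears once per active
// day with marker 1, so after sum(2) its f2 equals the number of days it was
// active inside the window.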
AggregateOperator<Tuple3<String, String, Integer>> sumMonth = union30Days.groupBy(0).sum(2);
logger.info("New devices in the last 30 days vs. history = " + sumMonth.count());
// All devices seen before this day (history + 30-day window), remarked to 2
DataSet<Tuple3<String, String, Integer>> devicesBeforeDay = allData.union(sumMonth).map((MapFunction<Tuple3<String, String, Integer>, Tuple3<String, String, Integer>>) item -> {
item.f2 = 2;
return item;
}).returns(Types.TUPLE(Types.STRING, Types.STRING, Types.INT)).distinct(0);
logger.info("Devices seen before " + day + " = " + devicesBeforeDay.count());
// Data for the target day
String filePath = String.format(DATA_BASE_PATH, day);
DataSet<Tuple3<String, String, Integer>> thatDayData = env.readTextFile(filePath).map(new Tokenizer(1)).distinct(0);
logger.info("当天去重设备数=" + thatDayData.count());
// New devices on the target day
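// A summed marker of 1 again means "absent from everything before this day":
// these are the brand-new devices, i.e. devices active on exactly 1 day.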
FilterOperator<Tuple3<String, String, Integer>> thatDayNewData = devicesBeforeDay.union(thatDayData).groupBy(0).sum(2).filter(i -> i.f2 == 1);
resultList.add("1-day active= " + thatDayNewData.count());
// Join the target day's data with the 30-day new-device set
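// The join keeps devices active today that were new within the window;
// map(i -> i.f1) projects the sumMonth side of the join, whose f2 is the
// in-window active-day count.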
DataSet<Tuple3<String, String, Integer>> join = thatDayData.join(sumMonth).where(0).equalTo(0).map(i -> i.f1).returns(Types.TUPLE(Types.STRING, Types.STRING, Types.INT));
// join.writeAsText( String.format(DATA_BASE_PATH, "sumMonth"), FileSystem.WriteMode.OVERWRITE).setParallelism(1);
// Count devices by total number of active days
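// f2 in-window active days + today = total active days, so f2 == 1 means
// active on 2 days, f2 == 2 on 3 days, and so on.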
long countTwo = join.filter(i -> i.f2 == 1).count();
long countThree = join.filter(i -> i.f2 == 2).count();
long countFour = join.filter(i -> i.f2 == 3).count();
long countFive = join.filter(i -> i.f2 == 4).count();
long countSix = join.filter(i -> i.f2 == 5).count();
resultList.add("2-day active= " + countTwo);
resultList.add("3-day active= " + countThree);
resultList.add("4-day active= " + countFour);
resultList.add("5-day active= " + countFive);
resultList.add("6-day active= " + countSix);
// A sink only runs inside execute(); write the join result to a separate
// file (suffix chosen here for illustration) so the job does not overwrite
// the temp file it is still reading from.
join.writeAsText(DATA_TEMP_PATH + ".join", FileSystem.WriteMode.OVERWRITE);
env.execute("retention-stats-" + System.currentTimeMillis());
System.out.println(resultList);
return resultList;
}
/**
 * Custom tokenizer: splits a tab-separated line into (deviceId, secondField, marker).
 * The marker initNum tags which dataset a record came from (history = 2, single day = 1).
 *
 * @Author: lizixian
 * @date: 2022/7/9 10:17
 */
public static class Tokenizer implements MapFunction<String, Tuple3<String, String, Integer>> {
private final int initNum;
public Tokenizer(int initNum) {
this.initNum = initNum;
}
@Override
public Tuple3<String, String, Integer> map(String s) throws Exception {
if (StrUtil.isEmpty(s)) {
return new Tuple3<>("", "", initNum);
}
List<String> split = StrUtil.split(s, "\t");
if (split.size() == 1) {
return new Tuple3<>(split.get(0), split.get(0), initNum);
}
return new Tuple3<>(split.get(0), split.get(1), initNum);
}
}
}
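The heart of the job above is the marker-sum trick: Tokenizer tags every record of a dataset with a fixed integer marker via initNum (history = 2, a single day = 1). After union + groupBy + sum, the summed marker tells you where a device appeared, and filtering on the total implements a set difference. Below is a minimal, self-contained sketch of just that trick; the class name and sample device ids are made up for illustration:

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class MarkerSumSketch {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Historical devices carry marker 2
        DataSet<Tuple2<String, Integer>> history = env.fromElements(
                Tuple2.of("device-A", 2), Tuple2.of("device-B", 2));
        // Today's devices carry marker 1
        DataSet<Tuple2<String, Integer>> today = env.fromElements(
                Tuple2.of("device-B", 1), Tuple2.of("device-C", 1));
        // device-B sums to 3 (present in both sets); device-C stays at 1 (new today)
        long newToday = history.union(today)
                .groupBy(0).sum(1)
                .filter(t -> t.f1 == 1)
                .count(); // count() triggers execution in the DataSet API
        System.out.println("new today = " + newToday); // prints 1
    }
}

The same idea scales to the N-day active counts: because every in-window active day contributes 1 to the sum, the summed marker doubles as an active-day counter.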